Commit 47bece87b14b866872b52ff04d469832e4936756
Committed by
Wim Van Sebroeck
1 parent
55e8ddecec
Exists in
master
and in
7 other branches
[WATCHDOG] hpwdt: Add NMI sourcing
Add NMI sourcing functionality (Can only be active if nmi_watchdog is inactive). Signed-off-by: Thomas Mingarelli <thomas.mingarelli@hp.com> Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
Showing 2 changed files with 128 additions and 15 deletions Side-by-side Diff
Documentation/watchdog/hpwdt.txt
1 | +Last reviewed: 06/02/2009 | |
2 | + | |
3 | + HP iLO2 NMI Watchdog Driver | |
4 | + NMI sourcing for iLO2 based ProLiant Servers | |
5 | + Documentation and Driver by | |
6 | + Thomas Mingarelli <thomas.mingarelli@hp.com> | |
7 | + | |
8 | + The HP iLO2 NMI Watchdog driver is a kernel module that provides basic | |
9 | + watchdog functionality and the added benefit of NMI sourcing. Both the | |
10 | + watchdog functionality and the NMI sourcing capability need to be enabled | |
11 | + by the user. Remember that the two modes are not dependent on one another. | |
12 | + A user can have the NMI sourcing without the watchdog timer and vice-versa. | |
13 | + | |
14 | + Watchdog functionality is enabled like any other common watchdog driver. That | |
15 | + is, an application needs to be started that kicks off the watchdog timer. A | |
16 | + basic application exists in the Documentation/watchdog/src directory called | |
17 | + watchdog-test.c. Simply compile the C file and kick it off. If the system | |
18 | + gets into a bad state and hangs, the HP ProLiant iLO 2 timer register will | |
19 | + not be updated in a timely fashion and a hardware system reset (also known as | |
20 | + an Automatic Server Recovery (ASR)) event will occur. | |
21 | + | |
22 | + The hpwdt driver also has three (3) module parameters. They are the following: | |
23 | + | |
24 | + soft_margin - allows the user to set the watchdog timer value | |
25 | + allow_kdump - allows the user to save off a kernel dump image after an NMI | |
26 | + nowayout - basic watchdog parameter that does not allow the timer to | |
27 | + be restarted or an impending ASR to be escaped. | |
28 | + | |
29 | + NOTE: More information about watchdog drivers in general, including the ioctl | |
30 | + interface to /dev/watchdog can be found in | |
31 | + Documentation/watchdog/watchdog-api.txt and Documentation/IPMI.txt. | |
32 | + | |
33 | + The NMI sourcing capability is disabled when the driver discovers that the | |
34 | + nmi_watchdog is turned on (nmi_watchdog = 1). This is due to the inability to | |
35 | + distinguish between "NMI Watchdog Ticks" and "HW generated NMI events" in the | |
36 | + Linux kernel. What this means is that the hpwdt nmi handler code is called | |
37 | + each time the NMI signal fires off. This could amount to several thousands of | |
38 | + NMIs in a matter of seconds. If a user sees the Linux kernel's "dazed and | |
39 | + confused" message in the logs or if the system gets into a hung state, then | |
40 | + the user should reboot with nmi_watchdog=0. | |
41 | + | |
42 | + 1. If the kernel has not been booted with nmi_watchdog turned off then | |
43 | + edit /boot/grub/menu.lst and place the nmi_watchdog=0 at the end of the | |
44 | + currently booting kernel line. | |
45 | + 2. reboot the server | |
46 | + | |
47 | + Now, the hpwdt can successfully receive and source the NMI and provide a log | |
48 | + message that details the reason for the NMI (as determined by the HP BIOS). | |
49 | + | |
50 | + Below is a list of NMIs the HP BIOS understands along with the associated | |
51 | + code (reason): | |
52 | + | |
53 | + No source found 00h | |
54 | + | |
55 | + Uncorrectable Memory Error 01h | |
56 | + | |
57 | + ASR NMI 1Bh | |
58 | + | |
59 | + PCI Parity Error 20h | |
60 | + | |
61 | + NMI Button Press 27h | |
62 | + | |
63 | + SB_BUS_NMI 28h | |
64 | + | |
65 | + ILO Doorbell NMI 29h | |
66 | + | |
67 | + ILO IOP NMI 2Ah | |
68 | + | |
69 | + ILO Watchdog NMI 2Bh | |
70 | + | |
71 | + Proc Throt NMI 2Ch | |
72 | + | |
73 | + Front Side Bus NMI 2Dh | |
74 | + | |
75 | + PCI Express Error 2Fh | |
76 | + | |
77 | + DMA controller NMI 30h | |
78 | + | |
79 | + Hypertransport/CSI Error 31h | |
80 | + | |
81 | + | |
82 | + | |
83 | + -- Tom Mingarelli | |
84 | + (thomas.mingarelli@hp.com) |
drivers/watchdog/hpwdt.c
... | ... | @@ -19,6 +19,7 @@ |
19 | 19 | #include <linux/interrupt.h> |
20 | 20 | #include <linux/io.h> |
21 | 21 | #include <linux/irq.h> |
22 | +#include <linux/nmi.h> | |
22 | 23 | #include <linux/kernel.h> |
23 | 24 | #include <linux/miscdevice.h> |
24 | 25 | #include <linux/mm.h> |
... | ... | @@ -47,7 +48,7 @@ |
47 | 48 | #define PCI_BIOS32_PARAGRAPH_LEN 16 |
48 | 49 | #define PCI_ROM_BASE1 0x000F0000 |
49 | 50 | #define ROM_SIZE 0x10000 |
50 | -#define HPWDT_VERSION "1.01" | |
51 | +#define HPWDT_VERSION "1.1.1" | |
51 | 52 | |
52 | 53 | struct bios32_service_dir { |
53 | 54 | u32 signature; |
... | ... | @@ -119,6 +120,7 @@ |
119 | 120 | static char expect_release; |
120 | 121 | static unsigned long hpwdt_is_open; |
121 | 122 | static unsigned int allow_kdump; |
123 | +static int hpwdt_nmi_sourcing; | |
122 | 124 | |
123 | 125 | static void __iomem *pci_mem_addr; /* the PCI-memory address */ |
124 | 126 | static unsigned long __iomem *hpwdt_timer_reg; |
125 | 127 | |
... | ... | @@ -468,21 +470,22 @@ |
468 | 470 | if (ulReason != DIE_NMI && ulReason != DIE_NMI_IPI) |
469 | 471 | return NOTIFY_OK; |
470 | 472 | |
471 | - spin_lock_irqsave(&rom_lock, rom_pl); | |
472 | - if (!die_nmi_called) | |
473 | - asminline_call(&cmn_regs, cru_rom_addr); | |
474 | - die_nmi_called = 1; | |
475 | - spin_unlock_irqrestore(&rom_lock, rom_pl); | |
476 | - if (cmn_regs.u1.ral == 0) { | |
477 | - printk(KERN_WARNING "hpwdt: An NMI occurred, " | |
478 | - "but unable to determine source.\n"); | |
479 | - } else { | |
480 | - if (allow_kdump) | |
481 | - hpwdt_stop(); | |
482 | - panic("An NMI occurred, please see the Integrated " | |
483 | - "Management Log for details.\n"); | |
473 | + if (hpwdt_nmi_sourcing) { | |
474 | + spin_lock_irqsave(&rom_lock, rom_pl); | |
475 | + if (!die_nmi_called) | |
476 | + asminline_call(&cmn_regs, cru_rom_addr); | |
477 | + die_nmi_called = 1; | |
478 | + spin_unlock_irqrestore(&rom_lock, rom_pl); | |
479 | + if (cmn_regs.u1.ral == 0) { | |
480 | + printk(KERN_WARNING "hpwdt: An NMI occurred, " | |
481 | + "but unable to determine source.\n"); | |
482 | + } else { | |
483 | + if (allow_kdump) | |
484 | + hpwdt_stop(); | |
485 | + panic("An NMI occurred, please see the Integrated " | |
486 | + "Management Log for details.\n"); | |
487 | + } | |
484 | 488 | } |
485 | - | |
486 | 489 | return NOTIFY_OK; |
487 | 490 | } |
488 | 491 | |
489 | 492 | |
... | ... | @@ -627,10 +630,36 @@ |
627 | 630 | * Init & Exit |
628 | 631 | */ |
629 | 632 | |
633 | +#ifdef ARCH_HAS_NMI_WATCHDOG | |
634 | +static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev) | |
635 | +{ | |
636 | + /* | |
637 | + * If nmi_watchdog is turned off then we can turn on | |
638 | + * our nmi sourcing capability. | |
639 | + */ | |
640 | + if (!nmi_watchdog_active()) | |
641 | + hpwdt_nmi_sourcing = 1; | |
642 | + else | |
643 | + dev_warn(&dev->dev, "NMI sourcing is disabled. To enable this " | |
644 | + "functionality you must reboot with nmi_watchdog=0.\n"); | |
645 | +} | |
646 | +#else | |
647 | +static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev) | |
648 | +{ | |
649 | + dev_warn(&dev->dev, "NMI sourcing is disabled. " | |
650 | + "Your kernel does not support a NMI Watchdog.\n"); | |
651 | +} | |
652 | +#endif | |
653 | + | |
630 | 654 | static int __devinit hpwdt_init_one(struct pci_dev *dev, |
631 | 655 | const struct pci_device_id *ent) |
632 | 656 | { |
633 | 657 | int retval; |
658 | + | |
659 | + /* | |
660 | + * Check if we can do NMI sourcing or not | |
661 | + */ | |
662 | + hpwdt_check_nmi_sourcing(dev); | |
634 | 663 | |
635 | 664 | /* |
636 | 665 | * First let's find out if we are on an iLO2 server. We will |