Commit 47bece87b14b866872b52ff04d469832e4936756

Authored by Thomas Mingarelli
Committed by Wim Van Sebroeck
1 parent 55e8ddecec

[WATCHDOG] hpwdt: Add NMI sourcing

Add NMI sourcing functionality (Can only be active if nmi_watchdog is
inactive).

Signed-off-by: Thomas Mingarelli <thomas.mingarelli@hp.com>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>

Showing 2 changed files with 128 additions and 15 deletions Side-by-side Diff

Documentation/watchdog/hpwdt.txt
  1 +Last reviewed: 06/02/2009
  2 +
  3 + HP iLO2 NMI Watchdog Driver
  4 + NMI sourcing for iLO2 based ProLiant Servers
  5 + Documentation and Driver by
  6 + Thomas Mingarelli <thomas.mingarelli@hp.com>
  7 +
  8 + The HP iLO2 NMI Watchdog driver is a kernel module that provides basic
  9 + watchdog functionality and the added benefit of NMI sourcing. Both the
  10 + watchdog functionality and the NMI sourcing capability need to be enabled
  11 + by the user. Remember that the two modes are not dependent on one another.
  12 + A user can have the NMI sourcing without the watchdog timer and vice-versa.
  13 +
  14 + Watchdog functionality is enabled like any other common watchdog driver. That
  15 + is, an application needs to be started that kicks off the watchdog timer. A
  16 + basic application exists in the Documentation/watchdog/src directory called
  17 + watchdog-test.c. Simply compile the C file and kick it off. If the system
  18 + gets into a bad state and hangs, the HP ProLiant iLO 2 timer register will
  19 + not be updated in a timely fashion and a hardware system reset (also known as
  20 + an Automatic Server Recovery (ASR)) event will occur.
  21 +
  22 + The hpwdt driver also has three (3) module parameters. They are the following:
  23 +
  24 + soft_margin - allows the user to set the watchdog timer value
  25 + allow_kdump - allows the user to save off a kernel dump image after an NMI
  26 + nowayout - basic watchdog parameter that does not allow the timer to
  27 + be restarted or an impending ASR to be escaped.
  28 +
  29 + NOTE: More information about watchdog drivers in general, including the ioctl
  30 + interface to /dev/watchdog can be found in
  31 + Documentation/watchdog/watchdog-api.txt and Documentation/IPMI.txt.
  32 +
  33 + The NMI sourcing capability is disabled when the driver discovers that the
  34 + nmi_watchdog is turned on (nmi_watchdog = 1). This is due to the inability to
  35 + distinguish between "NMI Watchdog Ticks" and "HW generated NMI events" in the
  36 + Linux kernel. What this means is that the hpwdt nmi handler code is called
  37 + each time the NMI signal fires off. This could amount to several thousands of
  38 + NMIs in a matter of seconds. If a user sees the Linux kernel's "dazed and
  39 + confused" message in the logs or if the system gets into a hung state, then
  40 + the user should reboot with nmi_watchdog=0.
  41 +
  42 + 1. If the kernel has not been booted with nmi_watchdog turned off then
  43 + edit /boot/grub/menu.lst and place nmi_watchdog=0 at the end of the
  44 + currently booting kernel line.
  45 + 2. reboot the server
  46 +
  47 + Now, the hpwdt can successfully receive and source the NMI and provide a log
  48 + message that details the reason for the NMI (as determined by the HP BIOS).
  49 +
  50 + Below is a list of NMIs the HP BIOS understands along with the associated
  51 + code (reason):
  52 +
  53 + No source found 00h
  54 +
  55 + Uncorrectable Memory Error 01h
  56 +
  57 + ASR NMI 1Bh
  58 +
  59 + PCI Parity Error 20h
  60 +
  61 + NMI Button Press 27h
  62 +
  63 + SB_BUS_NMI 28h
  64 +
  65 + ILO Doorbell NMI 29h
  66 +
  67 + ILO IOP NMI 2Ah
  68 +
  69 + ILO Watchdog NMI 2Bh
  70 +
  71 + Proc Throt NMI 2Ch
  72 +
  73 + Front Side Bus NMI 2Dh
  74 +
  75 + PCI Express Error 2Fh
  76 +
  77 + DMA controller NMI 30h
  78 +
  79 + Hypertransport/CSI Error 31h
  80 +
  81 +
  82 +
  83 + -- Tom Mingarelli
  84 + (thomas.mingarelli@hp.com)
drivers/watchdog/hpwdt.c
... ... @@ -19,6 +19,7 @@
19 19 #include <linux/interrupt.h>
20 20 #include <linux/io.h>
21 21 #include <linux/irq.h>
  22 +#include <linux/nmi.h>
22 23 #include <linux/kernel.h>
23 24 #include <linux/miscdevice.h>
24 25 #include <linux/mm.h>
... ... @@ -47,7 +48,7 @@
47 48 #define PCI_BIOS32_PARAGRAPH_LEN 16
48 49 #define PCI_ROM_BASE1 0x000F0000
49 50 #define ROM_SIZE 0x10000
50   -#define HPWDT_VERSION "1.01"
  51 +#define HPWDT_VERSION "1.1.1"
51 52  
52 53 struct bios32_service_dir {
53 54 u32 signature;
... ... @@ -119,6 +120,7 @@
119 120 static char expect_release;
120 121 static unsigned long hpwdt_is_open;
121 122 static unsigned int allow_kdump;
  123 +static int hpwdt_nmi_sourcing;
122 124  
123 125 static void __iomem *pci_mem_addr; /* the PCI-memory address */
124 126 static unsigned long __iomem *hpwdt_timer_reg;
125 127  
... ... @@ -468,21 +470,22 @@
468 470 if (ulReason != DIE_NMI && ulReason != DIE_NMI_IPI)
469 471 return NOTIFY_OK;
470 472  
471   - spin_lock_irqsave(&rom_lock, rom_pl);
472   - if (!die_nmi_called)
473   - asminline_call(&cmn_regs, cru_rom_addr);
474   - die_nmi_called = 1;
475   - spin_unlock_irqrestore(&rom_lock, rom_pl);
476   - if (cmn_regs.u1.ral == 0) {
477   - printk(KERN_WARNING "hpwdt: An NMI occurred, "
478   - "but unable to determine source.\n");
479   - } else {
480   - if (allow_kdump)
481   - hpwdt_stop();
482   - panic("An NMI occurred, please see the Integrated "
483   - "Management Log for details.\n");
  473 + if (hpwdt_nmi_sourcing) {
  474 + spin_lock_irqsave(&rom_lock, rom_pl);
  475 + if (!die_nmi_called)
  476 + asminline_call(&cmn_regs, cru_rom_addr);
  477 + die_nmi_called = 1;
  478 + spin_unlock_irqrestore(&rom_lock, rom_pl);
  479 + if (cmn_regs.u1.ral == 0) {
  480 + printk(KERN_WARNING "hpwdt: An NMI occurred, "
  481 + "but unable to determine source.\n");
  482 + } else {
  483 + if (allow_kdump)
  484 + hpwdt_stop();
  485 + panic("An NMI occurred, please see the Integrated "
  486 + "Management Log for details.\n");
  487 + }
484 488 }
485   -
486 489 return NOTIFY_OK;
487 490 }
488 491  
489 492  
... ... @@ -627,10 +630,36 @@
627 630 * Init & Exit
628 631 */
629 632  
  633 +#ifdef ARCH_HAS_NMI_WATCHDOG
  634 +static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev)
  635 +{
  636 + /*
  637 + * If nmi_watchdog is turned off then we can turn on
  638 + * our nmi sourcing capability.
  639 + */
  640 + if (!nmi_watchdog_active())
  641 + hpwdt_nmi_sourcing = 1;
  642 + else
  643 + dev_warn(&dev->dev, "NMI sourcing is disabled. To enable this "
  644 + "functionality you must reboot with nmi_watchdog=0.\n");
  645 +}
  646 +#else
  647 +static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev)
  648 +{
  649 + dev_warn(&dev->dev, "NMI sourcing is disabled. "
  650 + "Your kernel does not support a NMI Watchdog.\n");
  651 +}
  652 +#endif
  653 +
630 654 static int __devinit hpwdt_init_one(struct pci_dev *dev,
631 655 const struct pci_device_id *ent)
632 656 {
633 657 int retval;
  658 +
  659 + /*
  660 + * Check if we can do NMI sourcing or not
  661 + */
  662 + hpwdt_check_nmi_sourcing(dev);
634 663  
635 664 /*
636 665 * First let's find out if we are on an iLO2 server. We will