Commit 8446f1d391f3d27e6bf9c43d4cbcdac0ca720417

Authored by Ingo Molnar
Committed by Linus Torvalds
1 parent 4732efbeb9

[PATCH] detect soft lockups

This patch adds a new kernel debug feature: CONFIG_DETECT_SOFTLOCKUP.

When enabled, per-CPU watchdog threads are started, which try to run
once per second.  If they get delayed for more than 10 seconds, a
callback from the timer interrupt detects this condition and prints out a
warning message and a stack dump (once per lockup incident).  The feature
is otherwise non-intrusive: it doesn't try to unlock the box in any way, it
only gets the debug info out, automatically, on all CPUs affected by
the lockup.
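
As an illustration (editorial note, not part of this patch), a minimal test
module along the following lines should trip the detector on a
non-preemptible kernel: it spins in kernel mode without scheduling, so the
per-CPU watchdog thread is starved while timer interrupts keep running
softlockup_tick().  The module name and the 30-second duration are
arbitrary choices for this sketch.

/*
 * Hypothetical softlockup test module -- not part of this commit.
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/param.h>	/* HZ */
#include <asm/processor.h>	/* cpu_relax() */

static int __init softlockup_test_init(void)
{
	/* Spin for ~30 seconds; interrupts stay enabled, so the timer
	 * tick still runs and the detector should warn after ~10s. */
	unsigned long end = jiffies + 30 * HZ;

	while (time_before(jiffies, end))
		cpu_relax();

	return 0;
}

static void __exit softlockup_test_exit(void)
{
}

module_init(softlockup_test_init);
module_exit(softlockup_test_exit);
MODULE_LICENSE("GPL");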

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Matthias Urlichs <smurf@smurf.noris.de>
Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 12 changed files with 201 additions and 0 deletions

arch/i386/kernel/nmi.c
... ... @@ -478,6 +478,11 @@
478 478 */
479 479 for (i = 0; i < NR_CPUS; i++)
480 480 alert_counter[i] = 0;
  481 +
  482 + /*
  483 + * Tickle the softlockup detector too:
  484 + */
  485 + touch_softlockup_watchdog();
481 486 }
482 487  
483 488 extern void die_nmi(struct pt_regs *, const char *msg);
arch/i386/kernel/time.c
... ... @@ -422,6 +422,7 @@
422 422 last_timer->resume();
423 423 cur_timer = last_timer;
424 424 last_timer = NULL;
  425 + touch_softlockup_watchdog();
425 426 return 0;
426 427 }
427 428  
arch/x86_64/kernel/nmi.c
... ... @@ -463,6 +463,8 @@
463 463 */
464 464 for (i = 0; i < NR_CPUS; i++)
465 465 per_cpu(nmi_touch, i) = 1;
  466 +
  467 + touch_softlockup_watchdog();
466 468 }
467 469  
468 470 void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
arch/x86_64/kernel/time.c
... ... @@ -1041,6 +1041,7 @@
1041 1041 write_sequnlock_irqrestore(&xtime_lock,flags);
1042 1042 jiffies += sleep_length;
1043 1043 wall_jiffies += sleep_length;
  1044 + touch_softlockup_watchdog();
1044 1045 return 0;
1045 1046 }
1046 1047  
drivers/mtd/nand/nand_base.c
... ... @@ -526,6 +526,7 @@
526 526 do {
527 527 if (this->dev_ready(mtd))
528 528 return;
  529 + touch_softlockup_watchdog();
529 530 } while (time_before(jiffies, timeo));
530 531 }
531 532  
include/linux/sched.h
... ... @@ -176,6 +176,23 @@
176 176 extern void update_process_times(int user);
177 177 extern void scheduler_tick(void);
178 178  
  179 +#ifdef CONFIG_DETECT_SOFTLOCKUP
  180 +extern void softlockup_tick(struct pt_regs *regs);
  181 +extern void spawn_softlockup_task(void);
  182 +extern void touch_softlockup_watchdog(void);
  183 +#else
  184 +static inline void softlockup_tick(struct pt_regs *regs)
  185 +{
  186 +}
  187 +static inline void spawn_softlockup_task(void)
  188 +{
  189 +}
  190 +static inline void touch_softlockup_watchdog(void)
  191 +{
  192 +}
  193 +#endif
  194 +
  195 +
179 196 /* Attach to any functions which should be ignored in wchan output. */
180 197 #define __sched __attribute__((__section__(".sched.text")))
181 198 /* Is this address in the __sched functions? */
init/main.c
... ... @@ -614,6 +614,7 @@
614 614 migration_init();
615 615 #endif
616 616 spawn_ksoftirqd();
  617 + spawn_softlockup_task();
617 618 }
618 619  
619 620 static void run_init_process(char *init_filename)
kernel/Makefile
... ... @@ -27,6 +27,7 @@
27 27 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
28 28 obj-$(CONFIG_KPROBES) += kprobes.o
29 29 obj-$(CONFIG_SYSFS) += ksysfs.o
  30 +obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
30 31 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
31 32 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
32 33 obj-$(CONFIG_SECCOMP) += seccomp.o
kernel/power/swsusp.c
... ... @@ -1059,6 +1059,7 @@
1059 1059 BUG_ON(!error);
1060 1060 restore_processor_state();
1061 1061 restore_highmem();
  1062 + touch_softlockup_watchdog();
1062 1063 device_power_up();
1063 1064 local_irq_enable();
1064 1065 return error;
kernel/softlockup.c
  1 +/*
  2 + * Detect Soft Lockups
  3 + *
  4 + * started by Ingo Molnar, (C) 2005, Red Hat
  5 + *
  6 + * this code detects soft lockups: incidents where, on a CPU,
  7 + * the kernel does not reschedule for 10 seconds or more.
  8 + */
  9 +
  10 +#include <linux/mm.h>
  11 +#include <linux/cpu.h>
  12 +#include <linux/init.h>
  13 +#include <linux/delay.h>
  14 +#include <linux/kthread.h>
  15 +#include <linux/notifier.h>
  16 +#include <linux/module.h>
  17 +
  18 +static DEFINE_SPINLOCK(print_lock);
  19 +
  20 +static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
  21 +static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
  22 +static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
  23 +
  24 +static int did_panic = 0;
  25 +static int softlock_panic(struct notifier_block *this, unsigned long event,
  26 + void *ptr)
  27 +{
  28 + did_panic = 1;
  29 +
  30 + return NOTIFY_DONE;
  31 +}
  32 +
  33 +static struct notifier_block panic_block = {
  34 + .notifier_call = softlock_panic,
  35 +};
  36 +
  37 +void touch_softlockup_watchdog(void)
  38 +{
  39 + per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
  40 +}
  41 +EXPORT_SYMBOL(touch_softlockup_watchdog);
  42 +
  43 +/*
  44 + * This callback runs from the timer interrupt, and checks
  45 + * whether the watchdog thread has hung or not:
  46 + */
  47 +void softlockup_tick(struct pt_regs *regs)
  48 +{
  49 + int this_cpu = smp_processor_id();
  50 + unsigned long timestamp = per_cpu(timestamp, this_cpu);
  51 +
  52 + if (per_cpu(print_timestamp, this_cpu) == timestamp)
  53 + return;
  54 +
  55 + /* Do not cause a second panic when there already was one */
  56 + if (did_panic)
  57 + return;
  58 +
  59 + if (time_after(jiffies, timestamp + 10*HZ)) {
  60 + per_cpu(print_timestamp, this_cpu) = timestamp;
  61 +
  62 + spin_lock(&print_lock);
  63 + printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
  64 + this_cpu);
  65 + show_regs(regs);
  66 + spin_unlock(&print_lock);
  67 + }
  68 +}
  69 +
  70 +/*
  71 + * The watchdog thread - runs every second and touches the timestamp.
  72 + */
  73 +static int watchdog(void * __bind_cpu)
  74 +{
  75 + struct sched_param param = { .sched_priority = 99 };
  76 + int this_cpu = (long) __bind_cpu;
  77 +
  78 + printk("softlockup thread %d started up.\n", this_cpu);
  79 +
  80 + sched_setscheduler(current, SCHED_FIFO, &param);
  81 + current->flags |= PF_NOFREEZE;
  82 +
  83 + set_current_state(TASK_INTERRUPTIBLE);
  84 +
  85 + /*
  86 + * Run briefly once per second - if this gets delayed for
  87 + * more than 10 seconds then the debug-printout triggers
  88 + * in softlockup_tick():
  89 + */
  90 + while (!kthread_should_stop()) {
  91 + msleep_interruptible(1000);
  92 + touch_softlockup_watchdog();
  93 + }
  94 + __set_current_state(TASK_RUNNING);
  95 +
  96 + return 0;
  97 +}
  98 +
  99 +/*
  100 + * Create/destroy watchdog threads as CPUs come and go:
  101 + */
  102 +static int __devinit
  103 +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
  104 +{
  105 + int hotcpu = (unsigned long)hcpu;
  106 + struct task_struct *p;
  107 +
  108 + switch (action) {
  109 + case CPU_UP_PREPARE:
  110 + BUG_ON(per_cpu(watchdog_task, hotcpu));
  111 + p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
  112 + if (IS_ERR(p)) {
  113 + printk("watchdog for %i failed\n", hotcpu);
  114 + return NOTIFY_BAD;
  115 + }
  116 + per_cpu(watchdog_task, hotcpu) = p;
  117 + kthread_bind(p, hotcpu);
  118 + break;
  119 + case CPU_ONLINE:
  120 +
  121 + wake_up_process(per_cpu(watchdog_task, hotcpu));
  122 + break;
  123 +#ifdef CONFIG_HOTPLUG_CPU
  124 + case CPU_UP_CANCELED:
  125 + /* Unbind so it can run. Fall thru. */
  126 + kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
  127 + case CPU_DEAD:
  128 + p = per_cpu(watchdog_task, hotcpu);
  129 + per_cpu(watchdog_task, hotcpu) = NULL;
  130 + kthread_stop(p);
  131 + break;
  132 +#endif /* CONFIG_HOTPLUG_CPU */
  133 + }
  134 + return NOTIFY_OK;
  135 +}
  136 +
  137 +static struct notifier_block __devinitdata cpu_nfb = {
  138 + .notifier_call = cpu_callback
  139 +};
  140 +
  141 +__init void spawn_softlockup_task(void)
  142 +{
  143 + void *cpu = (void *)(long)smp_processor_id();
  144 +
  145 + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
  146 + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
  147 + register_cpu_notifier(&cpu_nfb);
  148 +
  149 + notifier_chain_register(&panic_notifier_list, &panic_block);
  150 +}
kernel/timer.c
... ... @@ -950,6 +950,7 @@
950 950 {
951 951 jiffies_64++;
952 952 update_times();
  953 + softlockup_tick(regs);
953 954 }
954 955  
955 956 #ifdef __ARCH_WANT_SYS_ALARM
lib/Kconfig.debug
... ... @@ -46,6 +46,25 @@
46 46 13 => 8 KB
47 47 12 => 4 KB
48 48  
  49 +config DETECT_SOFTLOCKUP
  50 + bool "Detect Soft Lockups"
  51 + depends on DEBUG_KERNEL
  52 + default y
  53 + help
  54 + Say Y here to enable the kernel to detect "soft lockups",
  55 + which are bugs that cause the kernel to loop in kernel
  56 + mode for more than 10 seconds, without giving other tasks a
  57 + chance to run.
  58 +
  59 + When a soft-lockup is detected, the kernel will print the
  60 + current stack trace (which you should report), but the
  61 + system will stay locked up. This feature has negligible
  62 + overhead.
  63 +
  64 + (Note that "hard lockups" are a separate type of bug that
  65 + can be detected via the NMI-watchdog, on platforms that
  66 + support it.)
  67 +
49 68 config SCHEDSTATS
50 69 bool "Collect scheduler statistics"
51 70 depends on DEBUG_KERNEL && PROC_FS
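
For reference (editorial note, not part of the patch): given the dependency
above, enabling the detector in a kernel .config amounts to:

CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_SOFTLOCKUP=y

Since the option defaults to y, kernels configured with DEBUG_KERNEL pick it
up automatically unless it is explicitly disabled.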