Commit 8446f1d391f3d27e6bf9c43d4cbcdac0ca720417
Committed by
Linus Torvalds
1 parent
4732efbeb9
Exists in
master
and in
4 other branches
[PATCH] detect soft lockups
This patch adds a new kernel debug feature: CONFIG_DETECT_SOFTLOCKUP. When enabled then per-CPU watchdog threads are started, which try to run once per second. If they get delayed for more than 10 seconds then a callback from the timer interrupt detects this condition and prints out a warning message and a stack dump (once per lockup incident). The feature is otherwise non-intrusive, it doesnt try to unlock the box in any way, it only gets the debug info out, automatically, and on all CPUs affected by the lockup. Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> Signed-Off-By: Matthias Urlichs <smurf@smurf.noris.de> Signed-off-by: Richard Purdie <rpurdie@rpsys.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 12 changed files with 201 additions and 0 deletions Side-by-side Diff
arch/i386/kernel/nmi.c
arch/i386/kernel/time.c
arch/x86_64/kernel/nmi.c
arch/x86_64/kernel/time.c
drivers/mtd/nand/nand_base.c
include/linux/sched.h
... | ... | @@ -176,6 +176,23 @@ |
176 | 176 | extern void update_process_times(int user); |
177 | 177 | extern void scheduler_tick(void); |
178 | 178 | |
#ifdef CONFIG_DETECT_SOFTLOCKUP
/* Soft-lockup watchdog hook, called from the timer interrupt. */
extern void softlockup_tick(struct pt_regs *regs);
/* Start the per-CPU watchdog threads (called once during boot). */
extern void spawn_softlockup_task(void);
/* Refresh this CPU's watchdog heartbeat timestamp. */
extern void touch_softlockup_watchdog(void);
#else
/* Empty stubs so callers need no #ifdefs when the feature is off. */
static inline void softlockup_tick(struct pt_regs *regs)
{
}
static inline void spawn_softlockup_task(void)
{
}
static inline void touch_softlockup_watchdog(void)
{
}
#endif

179 | 196 | /* Attach to any functions which should be ignored in wchan output. */ |
180 | 197 | #define __sched __attribute__((__section__(".sched.text"))) |
181 | 198 | /* Is this address in the __sched functions? */ |
init/main.c
kernel/Makefile
... | ... | @@ -27,6 +27,7 @@ |
27 | 27 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
28 | 28 | obj-$(CONFIG_KPROBES) += kprobes.o |
29 | 29 | obj-$(CONFIG_SYSFS) += ksysfs.o |
30 | +obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | |
30 | 31 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
31 | 32 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
32 | 33 | obj-$(CONFIG_SECCOMP) += seccomp.o |
kernel/power/swsusp.c
kernel/softlockup.c
1 | +/* | |
2 | + * Detect Soft Lockups | |
3 | + * | |
4 | + * started by Ingo Molnar, (C) 2005, Red Hat | |
5 | + * | |
6 | + * this code detects soft lockups: incidents where, on a CPU, | 
7 | + * the kernel does not reschedule for 10 seconds or more. | |
8 | + */ | |
9 | + | |
10 | +#include <linux/mm.h> | |
11 | +#include <linux/cpu.h> | |
12 | +#include <linux/init.h> | |
13 | +#include <linux/delay.h> | |
14 | +#include <linux/kthread.h> | |
15 | +#include <linux/notifier.h> | |
16 | +#include <linux/module.h> | |
17 | + | |
18 | +static DEFINE_SPINLOCK(print_lock); | |
19 | + | |
20 | +static DEFINE_PER_CPU(unsigned long, timestamp) = 0; | |
21 | +static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; | |
22 | +static DEFINE_PER_CPU(struct task_struct *, watchdog_task); | |
23 | + | |
24 | +static int did_panic = 0; | |
25 | +static int softlock_panic(struct notifier_block *this, unsigned long event, | |
26 | + void *ptr) | |
27 | +{ | |
28 | + did_panic = 1; | |
29 | + | |
30 | + return NOTIFY_DONE; | |
31 | +} | |
32 | + | |
33 | +static struct notifier_block panic_block = { | |
34 | + .notifier_call = softlock_panic, | |
35 | +}; | |
36 | + | |
37 | +void touch_softlockup_watchdog(void) | |
38 | +{ | |
39 | + per_cpu(timestamp, raw_smp_processor_id()) = jiffies; | |
40 | +} | |
41 | +EXPORT_SYMBOL(touch_softlockup_watchdog); | |
42 | + | |
43 | +/* | |
44 | + * This callback runs from the timer interrupt, and checks | |
45 | + * whether the watchdog thread has hung or not: | |
46 | + */ | |
47 | +void softlockup_tick(struct pt_regs *regs) | |
48 | +{ | |
49 | + int this_cpu = smp_processor_id(); | |
50 | + unsigned long timestamp = per_cpu(timestamp, this_cpu); | |
51 | + | |
52 | + if (per_cpu(print_timestamp, this_cpu) == timestamp) | |
53 | + return; | |
54 | + | |
55 | + /* Do not cause a second panic when there already was one */ | |
56 | + if (did_panic) | |
57 | + return; | |
58 | + | |
59 | + if (time_after(jiffies, timestamp + 10*HZ)) { | |
60 | + per_cpu(print_timestamp, this_cpu) = timestamp; | |
61 | + | |
62 | + spin_lock(&print_lock); | |
63 | + printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", | |
64 | + this_cpu); | |
65 | + show_regs(regs); | |
66 | + spin_unlock(&print_lock); | |
67 | + } | |
68 | +} | |
69 | + | |
70 | +/* | |
71 | + * The watchdog thread - runs every second and touches the timestamp. | |
72 | + */ | |
73 | +static int watchdog(void * __bind_cpu) | |
74 | +{ | |
75 | + struct sched_param param = { .sched_priority = 99 }; | |
76 | + int this_cpu = (long) __bind_cpu; | |
77 | + | |
78 | + printk("softlockup thread %d started up.\n", this_cpu); | |
79 | + | |
80 | + sched_setscheduler(current, SCHED_FIFO, ¶m); | |
81 | + current->flags |= PF_NOFREEZE; | |
82 | + | |
83 | + set_current_state(TASK_INTERRUPTIBLE); | |
84 | + | |
85 | + /* | |
86 | + * Run briefly once per second - if this gets delayed for | |
87 | + * more than 10 seconds then the debug-printout triggers | |
88 | + * in softlockup_tick(): | |
89 | + */ | |
90 | + while (!kthread_should_stop()) { | |
91 | + msleep_interruptible(1000); | |
92 | + touch_softlockup_watchdog(); | |
93 | + } | |
94 | + __set_current_state(TASK_RUNNING); | |
95 | + | |
96 | + return 0; | |
97 | +} | |
98 | + | |
99 | +/* | |
100 | + * Create/destroy watchdog threads as CPUs come and go: | |
101 | + */ | |
102 | +static int __devinit | |
103 | +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |
104 | +{ | |
105 | + int hotcpu = (unsigned long)hcpu; | |
106 | + struct task_struct *p; | |
107 | + | |
108 | + switch (action) { | |
109 | + case CPU_UP_PREPARE: | |
110 | + BUG_ON(per_cpu(watchdog_task, hotcpu)); | |
111 | + p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); | |
112 | + if (IS_ERR(p)) { | |
113 | + printk("watchdog for %i failed\n", hotcpu); | |
114 | + return NOTIFY_BAD; | |
115 | + } | |
116 | + per_cpu(watchdog_task, hotcpu) = p; | |
117 | + kthread_bind(p, hotcpu); | |
118 | + break; | |
119 | + case CPU_ONLINE: | |
120 | + | |
121 | + wake_up_process(per_cpu(watchdog_task, hotcpu)); | |
122 | + break; | |
123 | +#ifdef CONFIG_HOTPLUG_CPU | |
124 | + case CPU_UP_CANCELED: | |
125 | + /* Unbind so it can run. Fall thru. */ | |
126 | + kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); | |
127 | + case CPU_DEAD: | |
128 | + p = per_cpu(watchdog_task, hotcpu); | |
129 | + per_cpu(watchdog_task, hotcpu) = NULL; | |
130 | + kthread_stop(p); | |
131 | + break; | |
132 | +#endif /* CONFIG_HOTPLUG_CPU */ | |
133 | + } | |
134 | + return NOTIFY_OK; | |
135 | +} | |
136 | + | |
137 | +static struct notifier_block __devinitdata cpu_nfb = { | |
138 | + .notifier_call = cpu_callback | |
139 | +}; | |
140 | + | |
141 | +__init void spawn_softlockup_task(void) | |
142 | +{ | |
143 | + void *cpu = (void *)(long)smp_processor_id(); | |
144 | + | |
145 | + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | |
146 | + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | |
147 | + register_cpu_notifier(&cpu_nfb); | |
148 | + | |
149 | + notifier_chain_register(&panic_notifier_list, &panic_block); | |
150 | +} |
kernel/timer.c
lib/Kconfig.debug
... | ... | @@ -46,6 +46,25 @@ |
46 | 46 | 13 => 8 KB |
47 | 47 | 12 => 4 KB |
48 | 48 | |
49 | +config DETECT_SOFTLOCKUP | |
50 | + bool "Detect Soft Lockups" | |
51 | + depends on DEBUG_KERNEL | |
52 | + default y | |
53 | + help | |
54 | + Say Y here to enable the kernel to detect "soft lockups", | |
55 | + which are bugs that cause the kernel to loop in kernel | |
56 | + mode for more than 10 seconds, without giving other tasks a | |
57 | + chance to run. | |
58 | + | |
59 | + When a soft-lockup is detected, the kernel will print the | |
60 | + current stack trace (which you should report), but the | |
61 | + system will stay locked up. This feature has negligible | |
62 | + overhead. | |
63 | + | |
64 | + (Note that "hard lockups" are a separate type of bug that | 
65 | + can be detected via the NMI-watchdog, on platforms that | |
66 | + support it.) | |
67 | + | |
49 | 68 | config SCHEDSTATS |
50 | 69 | bool "Collect scheduler statistics" |
51 | 70 | depends on DEBUG_KERNEL && PROC_FS |