Blame view
kernel/watchdog.c
13.2 KB
58687acba lockup_detector: ... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
/*
 * Detect hard and soft lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
 * This code detects hard lockups: incidents where a CPU no longer
 * responds to anything except NMI.
 *
 * Note: Most of this code is borrowed heavily from softlockup.c,
 * so thanks to Ingo for the initial implementation.
 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
 * to those contributors as well.
 */

#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/sysctl.h>

#include <asm/irq_regs.h>
#include <linux/perf_event.h>

/* Set to 1 by watchdog_enable(), cleared by watchdog_disable_all_cpus(). */
int watchdog_enabled;

/* Soft-lockup threshold, in get_timestamp() units (roughly seconds). */
int __read_mostly softlockup_thresh = 60;

/* Last time this CPU's watchdog was touched; 0 means "re-stamp on next tick". */
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
/* The per-CPU "watchdog/N" kthread, or NULL when none is running. */
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
/* Periodic timer driving both the soft and hard lockup checks. */
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
/* Ask the timer callback to run sched_clock_tick() before re-stamping. */
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
58687acba lockup_detector: ... |
37 |
/* True once a soft lockup has been reported on this CPU, so it is printed only once. */
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* True once a hard lockup has been reported on this CPU, so it is printed only once. */
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
/* Set by touch_nmi_watchdog(); makes the next perf overflow skip its check. */
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
/* Count of watchdog hrtimer ticks, and the value seen at the last overflow. */
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
/* The perf event backing the hard lockup detector on this CPU. */
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif

/*
 * boot commands
 *
 * This must not be __initdata: watchdog_disable_all_cpus() is a regular
 * (non-__init) function and reads it long after init memory is released.
 */
static int no_watchdog;

/*
 * Should we panic when a soft-lockup or hard-lockup occurs:
 */
23637d477 lockup_detector: ... |
52 |
#ifdef CONFIG_HARDLOCKUP_DETECTOR |
58687acba lockup_detector: ... |
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
static int hardlockup_panic; static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); #endif unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; static int __init softlockup_panic_setup(char *str) { softlockup_panic = simple_strtoul(str, NULL, 0); return 1; } __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { no_watchdog = 1; return 1; } __setup("nowatchdog", nowatchdog_setup); /* deprecated */ static int __init nosoftlockup_setup(char *str) { no_watchdog = 1; return 1; } __setup("nosoftlockup", nosoftlockup_setup); /* */ /* * Returns seconds, approximately. We don't need nanosecond * resolution, and we don't need to waste time with a big divide when * 2^30ns == 1.074s. */ static unsigned long get_timestamp(int this_cpu) { return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } static unsigned long get_sample_period(void) { /* * convert softlockup_thresh from seconds to ns * the divide by 5 is to give hrtimer 5 chances to * increment before the hardlockup detector generates * a warning */ return softlockup_thresh / 5 * NSEC_PER_SEC; } /* Commands for resetting the watchdog */ static void __touch_watchdog(void) { |
26e09c6ee lockup_detector: ... |
116 |
int this_cpu = smp_processor_id(); |
58687acba lockup_detector: ... |
117 118 119 |
__get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); } |
332fbdbca lockup_detector: ... |
120 |
void touch_softlockup_watchdog(void) |
58687acba lockup_detector: ... |
121 |
{ |
68d3f1d81 lockup_detector: ... |
122 |
__raw_get_cpu_var(watchdog_touch_ts) = 0; |
58687acba lockup_detector: ... |
123 |
} |
0167c7819 watchdog: Export ... |
124 |
EXPORT_SYMBOL(touch_softlockup_watchdog); |
58687acba lockup_detector: ... |
125 |
|
332fbdbca lockup_detector: ... |
126 |
void touch_all_softlockup_watchdogs(void) |
58687acba lockup_detector: ... |
127 128 129 130 131 132 133 134 135 136 137 |
{ int cpu; /* * this is done lockless * do we care if a 0 races with a timestamp? * all it means is the softlock check starts one cycle later */ for_each_online_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; } |
cafcd80d2 lockup_detector: ... |
138 |
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/*
 * Tell the hard lockup detector on every present CPU to skip its next
 * check, then touch the soft lockup watchdog of the current CPU.
 */
void touch_nmi_watchdog(void)
{
	if (watchdog_enabled) {
		unsigned cpu;

		for_each_present_cpu(cpu) {
			/* only write flags that are not already set */
			if (per_cpu(watchdog_nmi_touch, cpu))
				continue;
			per_cpu(watchdog_nmi_touch, cpu) = true;
		}
	}

	touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
#endif
58687acba lockup_detector: ... |
153 154 155 156 157 |
/*
 * Like touch_softlockup_watchdog(), but also request that the timer
 * callback runs sched_clock_tick() before it re-stamps the watchdog.
 */
void touch_softlockup_watchdog_sync(void)
{
	/* raise the sync request first, then zero the timestamp */
	__raw_get_cpu_var(softlockup_touch_sync) = true;
	__raw_get_cpu_var(watchdog_touch_ts) = 0;
}
23637d477 lockup_detector: ... |
158 |
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */

/*
 * Returns 1 when this CPU's hrtimer tick count has not moved since the
 * previous call, 0 otherwise.  On 0 the current count is remembered for
 * the next comparison.
 */
static int is_hardlockup(void)
{
	unsigned long ticks = __get_cpu_var(hrtimer_interrupts);
	unsigned long *last_seen = &__get_cpu_var(hrtimer_interrupts_saved);

	if (*last_seen != ticks) {
		*last_seen = ticks;
		return 0;
	}

	return 1;
}
#endif
26e09c6ee lockup_detector: ... |
170 |
static int is_softlockup(unsigned long touch_ts) |
58687acba lockup_detector: ... |
171 |
{ |
26e09c6ee lockup_detector: ... |
172 |
unsigned long now = get_timestamp(smp_processor_id()); |
58687acba lockup_detector: ... |
173 174 175 176 177 178 179 |
/* Warn about unreasonable delays: */ if (time_after(now, touch_ts + softlockup_thresh)) return now - touch_ts; return 0; } |
23637d477 lockup_detector: ... |
180 |
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* Template for the per-CPU perf event; sample_period is filled in later. */
static struct perf_event_attr wd_hw_attr = {
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_CPU_CYCLES,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 1,
};

/* Callback function for perf event subsystem */
void watchdog_overflow_callback(struct perf_event *event, int nmi,
		 struct perf_sample_data *data,
		 struct pt_regs *regs)
{
	int this_cpu;

	/* Ensure the watchdog never gets throttled */
	event->hw.interrupts = 0;

	/* somebody touched the NMI watchdog: consume the flag, skip the check */
	if (__get_cpu_var(watchdog_nmi_touch) == true) {
		__get_cpu_var(watchdog_nmi_touch) = false;
		return;
	}

	/*
	 * Check for a hardlockup by making sure our timer interrupt is
	 * incrementing.  The timer interrupt should have fired multiple
	 * times before we overflowed.  If it hasn't, that is a good
	 * indication the cpu is stuck.
	 */
	if (!is_hardlockup()) {
		__get_cpu_var(hard_watchdog_warn) = false;
		return;
	}

	this_cpu = smp_processor_id();

	/* only print hardlockups once */
	if (__get_cpu_var(hard_watchdog_warn) == true)
		return;

	if (hardlockup_panic)
		panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
	else
		WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);

	__get_cpu_var(hard_watchdog_warn) = true;
}

/* Record one more watchdog hrtimer tick for is_hardlockup(). */
static void watchdog_interrupt_count(void)
{
	__get_cpu_var(hrtimer_interrupts)++;
}
#else
/* No hard lockup detector configured: nothing to count. */
static inline void watchdog_interrupt_count(void) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
58687acba lockup_detector: ... |
232 233 234 235 |
/* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { |
58687acba lockup_detector: ... |
236 237 238 239 240 241 242 243 244 245 246 247 248 249 |
unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; /* kick the hardlockup detector */ watchdog_interrupt_count(); /* kick the softlockup detector */ wake_up_process(__get_cpu_var(softlockup_watchdog)); /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); if (touch_ts == 0) { |
26e09c6ee lockup_detector: ... |
250 |
if (unlikely(__get_cpu_var(softlockup_touch_sync))) { |
58687acba lockup_detector: ... |
251 252 253 254 |
/* * If the time stamp was touched atomically * make sure the scheduler tick is up to date. */ |
26e09c6ee lockup_detector: ... |
255 |
__get_cpu_var(softlockup_touch_sync) = false; |
58687acba lockup_detector: ... |
256 257 258 259 260 261 262 263 264 265 266 267 |
sched_clock_tick(); } __touch_watchdog(); return HRTIMER_RESTART; } /* check for a softlockup * This is done by making sure a high priority task is * being scheduled. The task touches the watchdog to * indicate it is getting cpu time. If it hasn't then * this is a good indication some task is hogging the cpu */ |
26e09c6ee lockup_detector: ... |
268 |
duration = is_softlockup(touch_ts); |
58687acba lockup_detector: ... |
269 270 271 272 273 274 275 |
if (unlikely(duration)) { /* only warn once */ if (__get_cpu_var(soft_watchdog_warn) == true) return HRTIMER_RESTART; printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d] ", |
26e09c6ee lockup_detector: ... |
276 |
smp_processor_id(), duration, |
58687acba lockup_detector: ... |
277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 |
current->comm, task_pid_nr(current)); print_modules(); print_irqtrace_events(current); if (regs) show_regs(regs); else dump_stack(); if (softlockup_panic) panic("softlockup: hung tasks"); __get_cpu_var(soft_watchdog_warn) = true; } else __get_cpu_var(soft_watchdog_warn) = false; return HRTIMER_RESTART; } /* * The watchdog thread - touches the timestamp. */ |
26e09c6ee lockup_detector: ... |
298 |
static int watchdog(void *unused) |
58687acba lockup_detector: ... |
299 300 |
{ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
26e09c6ee lockup_detector: ... |
301 |
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
58687acba lockup_detector: ... |
302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 |
sched_setscheduler(current, SCHED_FIFO, ¶m); /* initialize timestamp */ __touch_watchdog(); /* kick off the timer for the hardlockup detector */ /* done here because hrtimer_start can only pin to smp_processor_id() */ hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), HRTIMER_MODE_REL_PINNED); set_current_state(TASK_INTERRUPTIBLE); /* * Run briefly once per second to reset the softlockup timestamp. * If this gets delayed for more than 60 seconds then the |
26e09c6ee lockup_detector: ... |
317 |
* debug-printout triggers in watchdog_timer_fn(). |
58687acba lockup_detector: ... |
318 319 320 321 322 323 324 325 326 327 328 329 330 331 |
*/ while (!kthread_should_stop()) { __touch_watchdog(); schedule(); if (kthread_should_stop()) break; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return 0; } |
23637d477 lockup_detector: ... |
332 |
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/*
 * Create (if needed) and enable the hard lockup perf event for @cpu.
 *
 * Returns 0 when the event is enabled, or the error encoded in the
 * pointer returned by perf_event_create_kernel_counter() on failure.
 */
static int watchdog_nmi_enable(int cpu)
{
	struct perf_event_attr *wd_attr;
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

	/* is it already setup and enabled? */
	if (event && event->state > PERF_EVENT_STATE_OFF)
		goto out;

	/* it is setup but not enabled */
	if (event != NULL)
		goto out_enable;

	/* Try to register using hardware perf events */
	wd_attr = &wd_hw_attr;
	wd_attr->sample_period = hw_nmi_get_sample_period();
	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
	if (!IS_ERR(event)) {
		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
		goto out_save;
	}

	printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
	return PTR_ERR(event);

	/* success path */
out_save:
	per_cpu(watchdog_ev, cpu) = event;
out_enable:
	perf_event_enable(per_cpu(watchdog_ev, cpu));
out:
	return 0;
}

/* Disable and release the hard lockup perf event for @cpu, if there is one. */
static void watchdog_nmi_disable(int cpu)
{
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

	if (event) {
		perf_event_disable(event);
		per_cpu(watchdog_ev, cpu) = NULL;

		/* should be in cleanup, but blocks oprofile */
		perf_event_release_kernel(event);
	}
	return;
}
#else
/* Hard lockup detector not configured: enabling trivially succeeds. */
static int watchdog_nmi_enable(int cpu) { return 0; }
static void watchdog_nmi_disable(int cpu) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
58687acba lockup_detector: ... |
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 |
/* prepare/enable/disable routines */ static int watchdog_prepare_cpu(int cpu) { struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); WARN_ON(per_cpu(softlockup_watchdog, cpu)); hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; return 0; } static int watchdog_enable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); |
eac243355 lockup_detector: ... |
402 |
int err; |
58687acba lockup_detector: ... |
403 404 |
/* enable the perf event */ |
eac243355 lockup_detector: ... |
405 406 407 |
err = watchdog_nmi_enable(cpu); if (err) return err; |
58687acba lockup_detector: ... |
408 409 410 411 412 413 414 |
/* create the watchdog thread */ if (!p) { p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed ", cpu); |
eac243355 lockup_detector: ... |
415 |
return PTR_ERR(p); |
58687acba lockup_detector: ... |
416 417 418 419 420 421 |
} kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; per_cpu(softlockup_watchdog, cpu) = p; wake_up_process(p); } |
68d3f1d81 lockup_detector: ... |
422 423 |
/* if any cpu succeeds, watchdog is considered enabled for the system */ watchdog_enabled = 1; |
58687acba lockup_detector: ... |
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 |
return 0; } static void watchdog_disable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); /* * cancel the timer first to stop incrementing the stats * and waking up the kthread */ hrtimer_cancel(hrtimer); /* disable the perf event */ watchdog_nmi_disable(cpu); /* stop the watchdog thread */ if (p) { per_cpu(softlockup_watchdog, cpu) = NULL; kthread_stop(p); } |
58687acba lockup_detector: ... |
446 447 448 449 450 |
} static void watchdog_enable_all_cpus(void) { int cpu; |
eb703f981 kernel/watchdog: ... |
451 |
int result = 0; |
58687acba lockup_detector: ... |
452 453 454 455 456 457 458 459 460 461 462 463 464 |
for_each_online_cpu(cpu) result += watchdog_enable(cpu); if (result) printk(KERN_ERR "watchdog: failed to be enabled on some cpus "); } static void watchdog_disable_all_cpus(void) { int cpu; |
d9ca07a05 watchdog: Avoid k... |
465 466 |
if (no_watchdog) return; |
58687acba lockup_detector: ... |
467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 |
for_each_online_cpu(cpu) watchdog_disable(cpu); /* if all watchdogs are disabled, then they are disabled for the system */ watchdog_enabled = 0; } /* sysctl functions */ #ifdef CONFIG_SYSCTL /* * proc handler for /proc/sys/kernel/nmi_watchdog */ int proc_dowatchdog_enabled(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); if (watchdog_enabled) watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); return 0; } int proc_dowatchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } |
58687acba lockup_detector: ... |
499 500 501 502 503 504 505 506 507 508 |
#endif /* CONFIG_SYSCTL */ /* * Create/destroy watchdog threads as CPUs come and go: */ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; |
eac243355 lockup_detector: ... |
509 |
int err = 0; |
58687acba lockup_detector: ... |
510 511 512 513 |
switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: |
eac243355 lockup_detector: ... |
514 |
err = watchdog_prepare_cpu(hotcpu); |
58687acba lockup_detector: ... |
515 516 517 |
break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: |
eac243355 lockup_detector: ... |
518 |
err = watchdog_enable(hotcpu); |
58687acba lockup_detector: ... |
519 520 521 522 523 524 525 526 527 528 529 530 |
break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: watchdog_disable(hotcpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: watchdog_disable(hotcpu); break; #endif /* CONFIG_HOTPLUG_CPU */ } |
eac243355 lockup_detector: ... |
531 |
return notifier_from_errno(err); |
58687acba lockup_detector: ... |
532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 |
} static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; static int __init spawn_watchdog_task(void) { void *cpu = (void *)(long)smp_processor_id(); int err; if (no_watchdog) return 0; err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); |
eac243355 lockup_detector: ... |
547 |
WARN_ON(notifier_to_errno(err)); |
58687acba lockup_detector: ... |
548 549 550 |
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); |
58687acba lockup_detector: ... |
551 552 553 |
return 0; } early_initcall(spawn_watchdog_task); |