kernel/watchdog.c
/*
 * Detect hard and soft lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
 * Note: Most of this code is borrowed heavily from the original softlockup
 * detector, so thanks to Ingo for the initial implementation.
 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
 * to those contributors as well.
 */

#define pr_fmt(fmt) "NMI watchdog: " fmt

#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
#include <linux/tick.h>

#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/perf_event.h>
/*
 * The run state of the lockup detectors is controlled by the content of the
 * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
 * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
 *
 * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
 * are variables that are only used as an 'interface' between the parameters
 * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
 * 'watchdog_thresh' variable is handled differently because its value is not
 * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
 * is equal to zero.
 */
#define NMI_WATCHDOG_ENABLED_BIT   0
#define SOFT_WATCHDOG_ENABLED_BIT  1
#define NMI_WATCHDOG_ENABLED      (1 << NMI_WATCHDOG_ENABLED_BIT)
#define SOFT_WATCHDOG_ENABLED     (1 << SOFT_WATCHDOG_ENABLED_BIT)
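/*
 * With these definitions NMI_WATCHDOG_ENABLED is 1, SOFT_WATCHDOG_ENABLED
 * is 2, and 'watchdog_enabled' is 3 when both lockup detectors are on.
 */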
static DEFINE_MUTEX(watchdog_proc_mutex);

#ifdef CONFIG_HARDLOCKUP_DETECTOR
static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
#else
static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
#endif
int __read_mostly nmi_watchdog_enabled;
int __read_mostly soft_watchdog_enabled;
int __read_mostly watchdog_user_enabled;
int __read_mostly watchdog_thresh = 10;

#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#endif

static struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);

/* Helper for online, unparked cpus. */
#define for_each_watchdog_cpu(cpu) \
	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
static int __read_mostly watchdog_running;
static u64 __read_mostly sample_period;

static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
static unsigned long soft_lockup_nmi_warn;
/* boot commands */
/*
 * Should we panic when a soft-lockup or hard-lockup occurs:
 */
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static int hardlockup_panic =
			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
/*
 * We may not want to enable hard lockup detection by default in all cases,
 * for example when running the kernel as a guest on a hypervisor. In these
 * cases this function can be called to disable hard lockup detection. This
 * function should only be executed once by the boot processor before the
 * kernel command line parameters are parsed, because otherwise it is not
 * possible to override this in hardlockup_panic_setup().
 */
void hardlockup_detector_disable(void)
{
	watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
}

static int __init hardlockup_panic_setup(char *str)
{
	if (!strncmp(str, "panic", 5))
		hardlockup_panic = 1;
	else if (!strncmp(str, "nopanic", 7))
		hardlockup_panic = 0;
	else if (!strncmp(str, "0", 1))
		watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
	else if (!strncmp(str, "1", 1))
		watchdog_enabled |= NMI_WATCHDOG_ENABLED;
	return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
#endif

unsigned int __read_mostly softlockup_panic =
			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;

static int __init softlockup_panic_setup(char *str)
{
	softlockup_panic = simple_strtoul(str, NULL, 0);

	return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);

static int __init nowatchdog_setup(char *str)
{
	watchdog_enabled = 0;
	return 1;
}
__setup("nowatchdog", nowatchdog_setup);

static int __init nosoftlockup_setup(char *str)
{
	watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
	return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
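/*
 * Boot parameters handled above: "nowatchdog" disables both lockup
 * detectors, "nosoftlockup" disables only the soft lockup detector, and
 * "nmi_watchdog=0"/"nmi_watchdog=1" clear/set only the hard lockup
 * detector bit.
 */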
#ifdef CONFIG_SMP
static int __init softlockup_all_cpu_backtrace_setup(char *str)
{
	sysctl_softlockup_all_cpu_backtrace =
		!!simple_strtol(str, NULL, 0);
	return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
#endif
/*
 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
 * lockups can have false positives under extreme conditions. So we generally
 * want a higher threshold for soft lockups than for hard lockups. So we
 * couple the thresholds with a factor: we make the soft threshold twice
 * the amount of time the hard threshold is.
 */
static int get_softlockup_thresh(void)
{
	return watchdog_thresh * 2;
}

/*
 * Returns seconds, approximately.  We don't need nanosecond
 * resolution, and we don't need to waste time with a big divide when
 * 2^30ns == 1.074s.
 */
static unsigned long get_timestamp(void)
{
	return running_clock() >> 30LL;  /* 2^30 ~= 10^9 */
}

static void set_sample_period(void)
{
	/*
	 * convert watchdog_thresh from seconds to ns
	 * the divide by 5 is to give hrtimer several chances (two
	 * or three with the current relation between the soft
	 * and hard thresholds) to increment before the
	 * hardlockup detector generates a warning
	 */
	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
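	/*
	 * With the default watchdog_thresh of 10 seconds this gives a soft
	 * lockup threshold of 20 seconds and a sample_period of 4 seconds.
	 */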
}

/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
	__this_cpu_write(watchdog_touch_ts, get_timestamp());
}

void touch_softlockup_watchdog(void)
{
	/*
	 * Preemption can be enabled.  It doesn't matter which CPU's timestamp
	 * gets zeroed here, so use the raw_ operation.
	 */
	raw_cpu_write(watchdog_touch_ts, 0);
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
void touch_all_softlockup_watchdogs(void)
{
	int cpu;

	/*
	 * this is done lockless
	 * do we care if a 0 races with a timestamp?
	 * all it means is the softlock check starts one cycle later
	 */
	for_each_watchdog_cpu(cpu)
		per_cpu(watchdog_touch_ts, cpu) = 0;
}

#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
	/*
	 * Using __raw here because some code paths have
	 * preemption enabled.  If preemption is enabled
	 * then interrupts should be enabled too, in which
	 * case we shouldn't have to worry about the watchdog
	 * going off.
	 */
	raw_cpu_write(watchdog_nmi_touch, true);
	touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
#endif
void touch_softlockup_watchdog_sync(void)
{
	__this_cpu_write(softlockup_touch_sync, true);
	__this_cpu_write(watchdog_touch_ts, 0);
}

#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */
static int is_hardlockup(void)
{
	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);

	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
		return 1;

	__this_cpu_write(hrtimer_interrupts_saved, hrint);
	return 0;
}
#endif
static int is_softlockup(unsigned long touch_ts)
{
	unsigned long now = get_timestamp();

	if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
		/* Warn about unreasonable delays. */
		if (time_after(now, touch_ts + get_softlockup_thresh()))
			return now - touch_ts;
	}
	return 0;
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR

static struct perf_event_attr wd_hw_attr = {
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_CPU_CYCLES,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 1,
};

/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
		 struct perf_sample_data *data,
		 struct pt_regs *regs)
{
	/* Ensure the watchdog never gets throttled */
	event->hw.interrupts = 0;

	if (__this_cpu_read(watchdog_nmi_touch) == true) {
		__this_cpu_write(watchdog_nmi_touch, false);
		return;
	}

	/* check for a hardlockup
	 * This is done by making sure our timer interrupt
	 * is incrementing.  The timer interrupt should have
	 * fired multiple times before we overflow'd.  If it hasn't
	 * then this is a good indication the cpu is stuck
	 */
	if (is_hardlockup()) {
		int this_cpu = smp_processor_id();

		/* only print hardlockups once */
		if (__this_cpu_read(hard_watchdog_warn) == true)
			return;

		if (hardlockup_panic)
			panic("Watchdog detected hard LOCKUP on cpu %d",
			      this_cpu);
		else
			WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
			     this_cpu);

		__this_cpu_write(hard_watchdog_warn, true);
		return;
	}

	__this_cpu_write(hard_watchdog_warn, false);
	return;
}
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
static void watchdog_interrupt_count(void)
{
	__this_cpu_inc(hrtimer_interrupts);
}

static int watchdog_nmi_enable(unsigned int cpu);
static void watchdog_nmi_disable(unsigned int cpu);
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
	struct pt_regs *regs = get_irq_regs();
	int duration;
	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;

	/* kick the hardlockup detector */
	watchdog_interrupt_count();

	/* kick the softlockup detector */
	wake_up_process(__this_cpu_read(softlockup_watchdog));

	/* .. and repeat */
	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));

	if (touch_ts == 0) {
		if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
			/*
			 * If the time stamp was touched atomically
			 * make sure the scheduler tick is up to date.
			 */
			__this_cpu_write(softlockup_touch_sync, false);
			sched_clock_tick();
		}

		/* Clear the guest paused flag on watchdog reset */
		kvm_check_and_clear_guest_paused();
		__touch_watchdog();
		return HRTIMER_RESTART;
	}

	/* check for a softlockup
	 * This is done by making sure a high priority task is
	 * being scheduled.  The task touches the watchdog to
	 * indicate it is getting cpu time.  If it hasn't then
	 * this is a good indication some task is hogging the cpu
	 */
	duration = is_softlockup(touch_ts);
	if (unlikely(duration)) {
		/*
		 * If a virtual machine is stopped by the host it can look to
		 * the watchdog like a soft lockup, check to see if the host
		 * stopped the vm before we issue the warning
		 */
		if (kvm_check_and_clear_guest_paused())
			return HRTIMER_RESTART;

		/* only warn once */
		if (__this_cpu_read(soft_watchdog_warn) == true) {
			/*
			 * When multiple processes are causing softlockups the
			 * softlockup detector only warns on the first one
			 * because the code relies on a full quiet cycle to
			 * re-arm.  The second process prevents the quiet cycle
			 * and never gets reported.  Use task pointers to detect
			 * this.
			 */
			if (__this_cpu_read(softlockup_task_ptr_saved) !=
			    current) {
				__this_cpu_write(soft_watchdog_warn, false);
				__touch_watchdog();
			}
			return HRTIMER_RESTART;
		}

		if (softlockup_all_cpu_backtrace) {
			/* Prevent multiple soft-lockup reports if one cpu is already
			 * engaged in dumping cpu back traces
			 */
			if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
				/* Someone else will report us. Let's give up */
				__this_cpu_write(soft_watchdog_warn, true);
				return HRTIMER_RESTART;
			}
		}

		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
			smp_processor_id(), duration,
			current->comm, task_pid_nr(current));
		__this_cpu_write(softlockup_task_ptr_saved, current);
		print_modules();
		print_irqtrace_events(current);
		if (regs)
			show_regs(regs);
		else
			dump_stack();

		if (softlockup_all_cpu_backtrace) {
			/* Avoid generating two back traces for current
			 * given that one is already made above
			 */
			trigger_allbutself_cpu_backtrace();

			clear_bit(0, &soft_lockup_nmi_warn);
			/* Barrier to sync with other cpus */
			smp_mb__after_atomic();
		}

		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
		if (softlockup_panic)
			panic("softlockup: hung tasks");
		__this_cpu_write(soft_watchdog_warn, true);
	} else
		__this_cpu_write(soft_watchdog_warn, false);

	return HRTIMER_RESTART;
}
static void watchdog_set_prio(unsigned int policy, unsigned int prio)
{
	struct sched_param param = { .sched_priority = prio };

	sched_setscheduler(current, policy, &param);
}

static void watchdog_enable(unsigned int cpu)
{
	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);

	/* kick off the timer for the hardlockup detector */
	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = watchdog_timer_fn;

	/* Enable the perf event */
	watchdog_nmi_enable(cpu);

	/* done here because hrtimer_start can only pin to smp_processor_id() */
	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
		      HRTIMER_MODE_REL_PINNED);

	/* initialize timestamp */
	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
	__touch_watchdog();
}

static void watchdog_disable(unsigned int cpu)
{
	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);

	watchdog_set_prio(SCHED_NORMAL, 0);
	hrtimer_cancel(hrtimer);
	/* disable the perf event */
	watchdog_nmi_disable(cpu);
}
static void watchdog_cleanup(unsigned int cpu, bool online)
{
	watchdog_disable(cpu);
}

static int watchdog_should_run(unsigned int cpu)
{
	return __this_cpu_read(hrtimer_interrupts) !=
		__this_cpu_read(soft_lockup_hrtimer_cnt);
}

/*
 * The watchdog thread function - touches the timestamp.
 *
 * It only runs once every sample_period seconds (4 seconds by
 * default) to reset the softlockup timestamp.  If this gets delayed
 * for more than 2*watchdog_thresh seconds then the debug-printout
 * triggers in watchdog_timer_fn().
 */
static void watchdog(unsigned int cpu)
{
	__this_cpu_write(soft_lockup_hrtimer_cnt,
			 __this_cpu_read(hrtimer_interrupts));
	__touch_watchdog();

	/*
	 * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
	 * failure path. Check for failures that can occur asynchronously -
	 * for example, when CPUs are on-lined - and shut down the hardware
	 * perf event on each CPU accordingly.
	 *
	 * The only non-obvious place this bit can be cleared is through
	 * watchdog_nmi_enable(), so a pr_info() is placed there.  Placing a
	 * pr_info here would be too noisy as it would result in a message
	 * every few seconds if the hardlockup was disabled but the softlockup
	 * enabled.
	 */
	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
		watchdog_nmi_disable(cpu);
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/*
 * People like the simple clean cpu node info on boot.
 * Reduce the watchdog noise by only printing messages
 * that are different from what cpu0 displayed.
 */
static unsigned long cpu0_err;

static int watchdog_nmi_enable(unsigned int cpu)
{
	struct perf_event_attr *wd_attr;
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

	/* nothing to do if the hard lockup detector is disabled */
	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
		goto out;

	/* is it already setup and enabled? */
	if (event && event->state > PERF_EVENT_STATE_OFF)
		goto out;

	/* it is setup but not enabled */
	if (event != NULL)
		goto out_enable;

	wd_attr = &wd_hw_attr;
	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);

	/* Try to register using hardware perf events */
	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);

	/* save cpu0 error for future comparison */
	if (cpu == 0 && IS_ERR(event))
		cpu0_err = PTR_ERR(event);

	if (!IS_ERR(event)) {
		/* only print for cpu0 or different than cpu0 */
		if (cpu == 0 || cpu0_err)
			pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
		goto out_save;
	}

	/*
	 * Disable the hard lockup detector if _any_ CPU fails to set up
	 * the hardware perf event. The watchdog() function checks
	 * the NMI_WATCHDOG_ENABLED bit periodically.
	 *
	 * The barriers are for syncing up watchdog_enabled across all the
	 * cpus, as clear_bit() does not use barriers.
	 */
	smp_mb__before_atomic();
	clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
	smp_mb__after_atomic();

	/* skip displaying the same error again */
	if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
		return PTR_ERR(event);

	/* vary the KERN level based on the returned errno */
	if (PTR_ERR(event) == -EOPNOTSUPP)
		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
	else if (PTR_ERR(event) == -ENOENT)
		pr_warn("disabled (cpu%i): hardware events not enabled\n",
			 cpu);
	else
		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
			cpu, PTR_ERR(event));

	pr_info("Shutting down hard lockup detector on all cpus\n");

	return PTR_ERR(event);

	/* success path */
out_save:
	per_cpu(watchdog_ev, cpu) = event;
out_enable:
	perf_event_enable(per_cpu(watchdog_ev, cpu));
out:
	return 0;
}
static void watchdog_nmi_disable(unsigned int cpu)
{
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

	if (event) {
		perf_event_disable(event);
		per_cpu(watchdog_ev, cpu) = NULL;

		/* should be in cleanup, but blocks oprofile */
		perf_event_release_kernel(event);
	}
	if (cpu == 0) {
		/* watchdog_nmi_enable() expects this to be zero initially. */
		cpu0_err = 0;
	}
}
void watchdog_nmi_enable_all(void)
{
	int cpu;

	mutex_lock(&watchdog_proc_mutex);
	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
		goto unlock;

	get_online_cpus();
	for_each_watchdog_cpu(cpu)
		watchdog_nmi_enable(cpu);
	put_online_cpus();

unlock:
	mutex_unlock(&watchdog_proc_mutex);
}

void watchdog_nmi_disable_all(void)
{
	int cpu;

	mutex_lock(&watchdog_proc_mutex);
	if (!watchdog_running)
		goto unlock;

	get_online_cpus();
	for_each_watchdog_cpu(cpu)
		watchdog_nmi_disable(cpu);
	put_online_cpus();

unlock:
	mutex_unlock(&watchdog_proc_mutex);
}
#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
void watchdog_nmi_enable_all(void) {}
void watchdog_nmi_disable_all(void) {}
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
static struct smp_hotplug_thread watchdog_threads = {
	.store			= &softlockup_watchdog,
	.thread_should_run	= watchdog_should_run,
	.thread_fn		= watchdog,
	.thread_comm		= "watchdog/%u",
	.setup			= watchdog_enable,
	.cleanup		= watchdog_cleanup,
	.park			= watchdog_disable,
	.unpark			= watchdog_enable,
};

static void restart_watchdog_hrtimer(void *info)
{
	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
	int ret;

	/*
	 * No need to cancel and restart hrtimer if it is currently executing
	 * because it will reprogram itself with the new period now.
	 * We should never see it unqueued here because we are running per-cpu
	 * with interrupts disabled.
	 */
	ret = hrtimer_try_to_cancel(hrtimer);
	if (ret == 1)
		hrtimer_start(hrtimer, ns_to_ktime(sample_period),
				HRTIMER_MODE_REL_PINNED);
}
static void update_watchdog(int cpu)
{
	/*
	 * Make sure that the perf event counter will adapt to a new
	 * sampling period. Updating the sampling period directly would
	 * be much nicer but we do not have an API for that now so
	 * let's use a big hammer.
	 * Hrtimer will adopt the new period on the next tick but this
	 * might be late already so we have to restart the timer as well.
	 */
	watchdog_nmi_disable(cpu);
	smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
	watchdog_nmi_enable(cpu);
}

static void update_watchdog_all_cpus(void)
{
	int cpu;

	get_online_cpus();
	for_each_watchdog_cpu(cpu)
		update_watchdog(cpu);
	put_online_cpus();
}
static int watchdog_enable_all_cpus(void)
{
	int err = 0;

	if (!watchdog_running) {
		err = smpboot_register_percpu_thread(&watchdog_threads);
		if (err)
			pr_err("Failed to create watchdog threads, disabled\n");
		else {
			if (smpboot_update_cpumask_percpu_thread(
				    &watchdog_threads, &watchdog_cpumask))
				pr_err("Failed to set cpumask for watchdog threads\n");
			watchdog_running = 1;
		}
	} else {
		/*
		 * Enable/disable the lockup detectors or
		 * change the sample period 'on the fly'.
		 */
		update_watchdog_all_cpus();
	}

	return err;
}
/* prepare/enable/disable routines */
/* sysctl functions */
#ifdef CONFIG_SYSCTL
static void watchdog_disable_all_cpus(void)
{
	if (watchdog_running) {
		watchdog_running = 0;
		smpboot_unregister_percpu_thread(&watchdog_threads);
	}
}
/*
 * Update the run state of the lockup detectors.
 */
static int proc_watchdog_update(void)
{
	int err = 0;

	/*
	 * Watchdog threads won't be started if they are already active.
	 * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
	 * care of this. If those threads are already active, the sample
	 * period will be updated and the lockup detectors will be enabled
	 * or disabled 'on the fly'.
	 */
	if (watchdog_enabled && watchdog_thresh)
		err = watchdog_enable_all_cpus();
	else
		watchdog_disable_all_cpus();

	return err;
}
/*
 * common function for watchdog, nmi_watchdog and soft_watchdog parameter
 *
 * caller             | table->data points to | 'which' contains the flag(s)
 * -------------------|-----------------------|-----------------------------
 * proc_watchdog      | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
 *                    |                       | with SOFT_WATCHDOG_ENABLED
 * -------------------|-----------------------|-----------------------------
 * proc_nmi_watchdog  | nmi_watchdog_enabled  | NMI_WATCHDOG_ENABLED
 * -------------------|-----------------------|-----------------------------
 * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
 */
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int err, old, new;
	int *watchdog_param = (int *)table->data;

	mutex_lock(&watchdog_proc_mutex);

	/*
	 * If the parameter is being read return the state of the corresponding
	 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
	 * run state of the lockup detectors.
	 */
	if (!write) {
		*watchdog_param = (watchdog_enabled & which) != 0;
		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	} else {
		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
		if (err)
			goto out;

		/*
		 * There is a race window between fetching the current value
		 * from 'watchdog_enabled' and storing the new value. During
		 * this race window, watchdog_nmi_enable() can sneak in and
		 * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
		 * The 'cmpxchg' detects this race and the loop retries.
		 */
		do {
			old = watchdog_enabled;
			/*
			 * If the parameter value is not zero set the
			 * corresponding bit(s), else clear it(them).
			 */
			if (*watchdog_param)
				new = old | which;
			else
				new = old & ~which;
		} while (cmpxchg(&watchdog_enabled, old, new) != old);

		/*
		 * Update the run state of the lockup detectors.
		 * Restore 'watchdog_enabled' on failure.
		 */
		err = proc_watchdog_update();
		if (err)
			watchdog_enabled = old;
	}
out:
	mutex_unlock(&watchdog_proc_mutex);
	return err;
}
/*
 * /proc/sys/kernel/watchdog
 */
int proc_watchdog(struct ctl_table *table, int write,
		  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
				    table, write, buffer, lenp, ppos);
}

/*
 * /proc/sys/kernel/nmi_watchdog
 */
int proc_nmi_watchdog(struct ctl_table *table, int write,
		      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
				    table, write, buffer, lenp, ppos);
}

/*
 * /proc/sys/kernel/soft_watchdog
 */
int proc_soft_watchdog(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
				    table, write, buffer, lenp, ppos);
}
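/*
 * Note: /proc/sys/kernel/watchdog flips both detector bits at once, whereas
 * nmi_watchdog and soft_watchdog each control a single bit; e.g. writing 0
 * to /proc/sys/kernel/nmi_watchdog clears only NMI_WATCHDOG_ENABLED and
 * leaves the soft lockup detector running.
 */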
/*
 * /proc/sys/kernel/watchdog_thresh
 */
int proc_watchdog_thresh(struct ctl_table *table, int write,
			 void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int err, old;

	mutex_lock(&watchdog_proc_mutex);

	old = ACCESS_ONCE(watchdog_thresh);
	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (err || !write)
		goto out;

	/*
	 * Update the sample period.
	 * Restore 'watchdog_thresh' on failure.
	 */
	set_sample_period();
	err = proc_watchdog_update();
	if (err)
		watchdog_thresh = old;
out:
	mutex_unlock(&watchdog_proc_mutex);
	return err;
}
/*
 * The cpumask is the mask of possible cpus that the watchdog can run
 * on, not the mask of cpus it is actually running on.  This allows the
 * user to specify a mask that will include cpus that have not yet
 * been brought online, if desired.
 */
int proc_watchdog_cpumask(struct ctl_table *table, int write,
			  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int err;

	mutex_lock(&watchdog_proc_mutex);
	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
	if (!err && write) {
		/* Remove impossible cpus to keep sysctl output cleaner. */
		cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
			    cpu_possible_mask);

		if (watchdog_running) {
			/*
			 * Failure would be due to being unable to allocate
			 * a temporary cpumask, so we are likely not in a
			 * position to do much else to make things better.
			 */
			if (smpboot_update_cpumask_percpu_thread(
				    &watchdog_threads, &watchdog_cpumask) != 0)
				pr_err("cpumask update failed\n");
		}
	}
	mutex_unlock(&watchdog_proc_mutex);
	return err;
}

#endif /* CONFIG_SYSCTL */
void __init lockup_detector_init(void)
{
	set_sample_period();

#ifdef CONFIG_NO_HZ_FULL
	if (tick_nohz_full_enabled()) {
		if (!cpumask_empty(tick_nohz_full_mask))
			pr_info("Disabling watchdog on nohz_full cores by default\n");
		cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
			       tick_nohz_full_mask);
	} else
		cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#else
	cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#endif

	if (watchdog_enabled)
		watchdog_enable_all_cpus();
}