Commit 1142d810298e694754498dbb4983fcb6cb7fd884
1 parent: 99bd5e2f24. Exists in master and in 7 other branches.
cpu_stop: implement stop_cpu[s]()
Implement a simplistic per-cpu maximum priority cpu monopolization mechanism. A non-sleeping callback can be scheduled to run on one or multiple cpus with maximum priority, monopolizing those cpus. This is primarily to replace and unify RT workqueue usage in stop_machine and the scheduler migration_thread, which currently serves multiple purposes.

Four functions are provided: stop_one_cpu(), stop_one_cpu_nowait(), stop_cpus() and try_stop_cpus(). This allows clean sharing of resources among stop_cpu and all the migration thread users. One stopper thread per cpu is created, currently named "stopper/CPU". It will eventually replace the migration thread and take on its name.

* This facility was originally named cpuhog and lived in separate files, but Peter Zijlstra nacked the name, so it was renamed to cpu_stop and moved into stop_machine.c.

* Better reporting of preemption leaks, as per Peter's suggestion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
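As a rough caller-side sketch of the new interface (the example_* names below are hypothetical and not part of this commit; only stop_one_cpu() and the cpu_stop_fn_t callback type come from it), a non-sleeping callback can be run on one cpu and its result collected:

#include <linux/stop_machine.h>

static unsigned int example_counter;

static int example_bump_counter(void *arg)
{
        unsigned int *counter = arg;

        /* runs on the target cpu with preemption disabled; must not sleep */
        (*counter)++;
        return 0;
}

static int example_count_on_cpu(unsigned int cpu)
{
        /* -ENOENT if @cpu was offline, otherwise the callback's return value */
        return stop_one_cpu(cpu, example_bump_counter, &example_counter);
}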
Showing 2 changed files with 402 additions and 9 deletions
include/linux/stop_machine.h
1 | 1 | #ifndef _LINUX_STOP_MACHINE |
2 | 2 | #define _LINUX_STOP_MACHINE |
3 | -/* "Bogolock": stop the entire machine, disable interrupts. This is a | |
4 | - very heavy lock, which is equivalent to grabbing every spinlock | |
5 | - (and more). So the "read" side to such a lock is anything which | |
6 | - disables preeempt. */ | |
3 | + | |
7 | 4 | #include <linux/cpu.h> |
8 | 5 | #include <linux/cpumask.h> |
6 | +#include <linux/list.h> | |
9 | 7 | #include <asm/system.h> |
10 | 8 | |
11 | 9 | #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) |
10 | + | |
11 | +/* | |
12 | + * stop_cpu[s]() is simplistic per-cpu maximum priority cpu | |
13 | + * monopolization mechanism. The caller can specify a non-sleeping | |
14 | + * function to be executed on a single or multiple cpus preempting all | |
15 | + * other processes and monopolizing those cpus until it finishes. | |
16 | + * | |
17 | + * Resources for this mechanism are preallocated when a cpu is brought | |
18 | + * up and requests are guaranteed to be served as long as the target | |
19 | + * cpus are online. | |
20 | + */ | |
21 | + | |
22 | +typedef int (*cpu_stop_fn_t)(void *arg); | |
23 | + | |
24 | +struct cpu_stop_work { | |
25 | + struct list_head list; /* cpu_stopper->works */ | |
26 | + cpu_stop_fn_t fn; | |
27 | + void *arg; | |
28 | + struct cpu_stop_done *done; | |
29 | +}; | |
30 | + | |
31 | +int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); | |
32 | +void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | |
33 | + struct cpu_stop_work *work_buf); | |
34 | +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); | |
35 | +int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); | |
36 | + | |
37 | +/* | |
38 | + * stop_machine "Bogolock": stop the entire machine, disable | |
39 | + * interrupts. This is a very heavy lock, which is equivalent to | |
40 | + * grabbing every spinlock (and more). So the "read" side to such a | |
41 | + * lock is anything which disables preeempt. | |
42 | + */ | |
12 | 43 | |
13 | 44 | /** |
14 | 45 | * stop_machine: freeze the machine on all CPUs and run this function |
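A minimal sketch of the two non-blocking usage patterns declared above (hypothetical example_* names, not part of this commit): stop_one_cpu_nowait() needs a caller-owned cpu_stop_work that stays untouched until the stopper thread consumes it, so a per-cpu buffer is assumed here, and try_stop_cpus() simply reports -EAGAIN when the facility is busy.

#include <linux/percpu.h>
#include <linux/stop_machine.h>

static DEFINE_PER_CPU(struct cpu_stop_work, example_stop_work);

static int example_noop_fn(void *arg)
{
        /* non-sleeping; the stopper thread runs it with preemption disabled */
        return 0;
}

static void example_kick_cpu(unsigned int cpu)
{
        /*
         * Fire and forget: the caller-owned work buffer must stay
         * untouched until the stopper starts executing example_noop_fn().
         */
        stop_one_cpu_nowait(cpu, example_noop_fn, NULL,
                            &per_cpu(example_stop_work, cpu));
}

static int example_try_stop_all(void)
{
        /* -EAGAIN if another stop_cpus() user already holds the facility */
        return try_stop_cpus(cpu_online_mask, example_noop_fn, NULL);
}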
kernel/stop_machine.c
1 | -/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. | |
2 | - * GPL v2 and any later version. | |
1 | +/* | |
2 | + * kernel/stop_machine.c | |
3 | + * | |
4 | + * Copyright (C) 2008, 2005 IBM Corporation. | |
5 | + * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au | |
6 | + * Copyright (C) 2010 SUSE Linux Products GmbH | |
7 | + * Copyright (C) 2010 Tejun Heo <tj@kernel.org> | |
8 | + * | |
9 | + * This file is released under the GPLv2 and any later version. | |
3 | 10 | */ |
11 | +#include <linux/completion.h> | |
4 | 12 | #include <linux/cpu.h> |
5 | -#include <linux/err.h> | |
13 | +#include <linux/init.h> | |
6 | 14 | #include <linux/kthread.h> |
7 | 15 | #include <linux/module.h> |
16 | +#include <linux/percpu.h> | |
8 | 17 | #include <linux/sched.h> |
9 | 18 | #include <linux/stop_machine.h> |
10 | -#include <linux/syscalls.h> | |
11 | 19 | #include <linux/interrupt.h> |
20 | +#include <linux/kallsyms.h> | |
12 | 21 | |
13 | 22 | #include <asm/atomic.h> |
14 | -#include <asm/uaccess.h> | |
23 | + | |
24 | +/* | |
25 | + * Structure to determine completion condition and record errors. May | |
26 | + * be shared by works on different cpus. | |
27 | + */ | |
28 | +struct cpu_stop_done { | |
29 | + atomic_t nr_todo; /* nr left to execute */ | |
30 | + bool executed; /* actually executed? */ | |
31 | + int ret; /* collected return value */ | |
32 | + struct completion completion; /* fired if nr_todo reaches 0 */ | |
33 | +}; | |
34 | + | |
35 | +/* the actual stopper, one per every possible cpu, enabled on online cpus */ | |
36 | +struct cpu_stopper { | |
37 | + spinlock_t lock; | |
38 | + struct list_head works; /* list of pending works */ | |
39 | + struct task_struct *thread; /* stopper thread */ | |
40 | + bool enabled; /* is this stopper enabled? */ | |
41 | +}; | |
42 | + | |
43 | +static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | |
44 | + | |
45 | +static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) | |
46 | +{ | |
47 | + memset(done, 0, sizeof(*done)); | |
48 | + atomic_set(&done->nr_todo, nr_todo); | |
49 | + init_completion(&done->completion); | |
50 | +} | |
51 | + | |
52 | +/* signal completion unless @done is NULL */ | |
53 | +static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) | |
54 | +{ | |
55 | + if (done) { | |
56 | + if (executed) | |
57 | + done->executed = true; | |
58 | + if (atomic_dec_and_test(&done->nr_todo)) | |
59 | + complete(&done->completion); | |
60 | + } | |
61 | +} | |
62 | + | |
63 | +/* queue @work to @stopper. if offline, @work is completed immediately */ | |
64 | +static void cpu_stop_queue_work(struct cpu_stopper *stopper, | |
65 | + struct cpu_stop_work *work) | |
66 | +{ | |
67 | + unsigned long flags; | |
68 | + | |
69 | + spin_lock_irqsave(&stopper->lock, flags); | |
70 | + | |
71 | + if (stopper->enabled) { | |
72 | + list_add_tail(&work->list, &stopper->works); | |
73 | + wake_up_process(stopper->thread); | |
74 | + } else | |
75 | + cpu_stop_signal_done(work->done, false); | |
76 | + | |
77 | + spin_unlock_irqrestore(&stopper->lock, flags); | |
78 | +} | |
79 | + | |
80 | +/** | |
81 | + * stop_one_cpu - stop a cpu | |
82 | + * @cpu: cpu to stop | |
83 | + * @fn: function to execute | |
84 | + * @arg: argument to @fn | |
85 | + * | |
86 | + * Execute @fn(@arg) on @cpu. @fn is run in a process context with | |
87 | + * the highest priority preempting any task on the cpu and | |
88 | + * monopolizing it. This function returns after the execution is | |
89 | + * complete. | |
90 | + * | |
91 | + * This function doesn't guarantee @cpu stays online till @fn | |
92 | + * completes. If @cpu goes down in the middle, execution may happen | |
93 | + * partially or fully on different cpus. @fn should either be ready | |
94 | + * for that or the caller should ensure that @cpu stays online until | |
95 | + * this function completes. | |
96 | + * | |
97 | + * CONTEXT: | |
98 | + * Might sleep. | |
99 | + * | |
100 | + * RETURNS: | |
101 | + * -ENOENT if @fn(@arg) was not executed because @cpu was offline; | |
102 | + * otherwise, the return value of @fn. | |
103 | + */ | |
104 | +int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | |
105 | +{ | |
106 | + struct cpu_stop_done done; | |
107 | + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; | |
108 | + | |
109 | + cpu_stop_init_done(&done, 1); | |
110 | + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); | |
111 | + wait_for_completion(&done.completion); | |
112 | + return done.executed ? done.ret : -ENOENT; | |
113 | +} | |
114 | + | |
115 | +/** | |
116 | + * stop_one_cpu_nowait - stop a cpu but don't wait for completion | |
117 | + * @cpu: cpu to stop | |
118 | + * @fn: function to execute | |
119 | + * @arg: argument to @fn | |
120 | + * | |
121 | + * Similar to stop_one_cpu() but doesn't wait for completion. The | |
122 | + * caller is responsible for ensuring @work_buf is currently unused | |
123 | + * and will remain untouched until stopper starts executing @fn. | |
124 | + * | |
125 | + * CONTEXT: | |
126 | + * Don't care. | |
127 | + */ | |
128 | +void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | |
129 | + struct cpu_stop_work *work_buf) | |
130 | +{ | |
131 | + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; | |
132 | + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); | |
133 | +} | |
134 | + | |
135 | +/* static data for stop_cpus */ | |
136 | +static DEFINE_MUTEX(stop_cpus_mutex); | |
137 | +static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); | |
138 | + | |
139 | +int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) | |
140 | +{ | |
141 | + struct cpu_stop_work *work; | |
142 | + struct cpu_stop_done done; | |
143 | + unsigned int cpu; | |
144 | + | |
145 | + /* initialize works and done */ | |
146 | + for_each_cpu(cpu, cpumask) { | |
147 | + work = &per_cpu(stop_cpus_work, cpu); | |
148 | + work->fn = fn; | |
149 | + work->arg = arg; | |
150 | + work->done = &done; | |
151 | + } | |
152 | + cpu_stop_init_done(&done, cpumask_weight(cpumask)); | |
153 | + | |
154 | + /* | |
155 | + * Disable preemption while queueing to avoid getting | |
156 | + * preempted by a stopper which might wait for other stoppers | |
157 | + * to enter @fn which can lead to deadlock. | |
158 | + */ | |
159 | + preempt_disable(); | |
160 | + for_each_cpu(cpu, cpumask) | |
161 | + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), | |
162 | + &per_cpu(stop_cpus_work, cpu)); | |
163 | + preempt_enable(); | |
164 | + | |
165 | + wait_for_completion(&done.completion); | |
166 | + return done.executed ? done.ret : -ENOENT; | |
167 | +} | |
168 | + | |
169 | +/** | |
170 | + * stop_cpus - stop multiple cpus | |
171 | + * @cpumask: cpus to stop | |
172 | + * @fn: function to execute | |
173 | + * @arg: argument to @fn | |
174 | + * | |
175 | + * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu, | |
176 | + * @fn is run in a process context with the highest priority | |
177 | + * preempting any task on the cpu and monopolizing it. This function | |
178 | + * returns after all executions are complete. | |
179 | + * | |
180 | + * This function doesn't guarantee the cpus in @cpumask stay online | |
181 | + * till @fn completes. If some cpus go down in the middle, execution | |
182 | + * on the cpu may happen partially or fully on different cpus. @fn | |
183 | + * should either be ready for that or the caller should ensure that | |
184 | + * the cpus stay online until this function completes. | |
185 | + * | |
186 | + * All stop_cpus() calls are serialized making it safe for @fn to wait | |
187 | + * for all cpus to start executing it. | |
188 | + * | |
189 | + * CONTEXT: | |
190 | + * Might sleep. | |
191 | + * | |
192 | + * RETURNS: | |
193 | + * -ENOENT if @fn(@arg) was not executed at all because all cpus in | |
194 | + * @cpumask were offline; otherwise, 0 if all executions of @fn | |
195 | + * returned 0, any non zero return value if any returned non zero. | |
196 | + */ | |
197 | +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) | |
198 | +{ | |
199 | + int ret; | |
200 | + | |
201 | + /* static works are used, process one request at a time */ | |
202 | + mutex_lock(&stop_cpus_mutex); | |
203 | + ret = __stop_cpus(cpumask, fn, arg); | |
204 | + mutex_unlock(&stop_cpus_mutex); | |
205 | + return ret; | |
206 | +} | |
207 | + | |
208 | +/** | |
209 | + * try_stop_cpus - try to stop multiple cpus | |
210 | + * @cpumask: cpus to stop | |
211 | + * @fn: function to execute | |
212 | + * @arg: argument to @fn | |
213 | + * | |
214 | + * Identical to stop_cpus() except that it fails with -EAGAIN if | |
215 | + * someone else is already using the facility. | |
216 | + * | |
217 | + * CONTEXT: | |
218 | + * Might sleep. | |
219 | + * | |
220 | + * RETURNS: | |
221 | + * -EAGAIN if someone else is already stopping cpus, -ENOENT if | |
222 | + * @fn(@arg) was not executed at all because all cpus in @cpumask were | |
223 | + * offline; otherwise, 0 if all executions of @fn returned 0, any non | |
224 | + * zero return value if any returned non zero. | |
225 | + */ | |
226 | +int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) | |
227 | +{ | |
228 | + int ret; | |
229 | + | |
230 | + /* static works are used, process one request at a time */ | |
231 | + if (!mutex_trylock(&stop_cpus_mutex)) | |
232 | + return -EAGAIN; | |
233 | + ret = __stop_cpus(cpumask, fn, arg); | |
234 | + mutex_unlock(&stop_cpus_mutex); | |
235 | + return ret; | |
236 | +} | |
237 | + | |
238 | +static int cpu_stopper_thread(void *data) | |
239 | +{ | |
240 | + struct cpu_stopper *stopper = data; | |
241 | + struct cpu_stop_work *work; | |
242 | + int ret; | |
243 | + | |
244 | +repeat: | |
245 | + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ | |
246 | + | |
247 | + if (kthread_should_stop()) { | |
248 | + __set_current_state(TASK_RUNNING); | |
249 | + return 0; | |
250 | + } | |
251 | + | |
252 | + work = NULL; | |
253 | + spin_lock_irq(&stopper->lock); | |
254 | + if (!list_empty(&stopper->works)) { | |
255 | + work = list_first_entry(&stopper->works, | |
256 | + struct cpu_stop_work, list); | |
257 | + list_del_init(&work->list); | |
258 | + } | |
259 | + spin_unlock_irq(&stopper->lock); | |
260 | + | |
261 | + if (work) { | |
262 | + cpu_stop_fn_t fn = work->fn; | |
263 | + void *arg = work->arg; | |
264 | + struct cpu_stop_done *done = work->done; | |
265 | + char ksym_buf[KSYM_NAME_LEN]; | |
266 | + | |
267 | + __set_current_state(TASK_RUNNING); | |
268 | + | |
269 | + /* cpu stop callbacks are not allowed to sleep */ | |
270 | + preempt_disable(); | |
271 | + | |
272 | + ret = fn(arg); | |
273 | + if (ret) | |
274 | + done->ret = ret; | |
275 | + | |
276 | + /* restore preemption and check it's still balanced */ | |
277 | + preempt_enable(); | |
278 | + WARN_ONCE(preempt_count(), | |
279 | + "cpu_stop: %s(%p) leaked preempt count\n", | |
280 | + kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, | |
281 | + ksym_buf), arg); | |
282 | + | |
283 | + cpu_stop_signal_done(done, true); | |
284 | + } else | |
285 | + schedule(); | |
286 | + | |
287 | + goto repeat; | |
288 | +} | |
289 | + | |
290 | +/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ | |
291 | +static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | |
292 | + unsigned long action, void *hcpu) | |
293 | +{ | |
294 | + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | |
295 | + unsigned int cpu = (unsigned long)hcpu; | |
296 | + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | |
297 | + struct cpu_stop_work *work; | |
298 | + struct task_struct *p; | |
299 | + | |
300 | + switch (action & ~CPU_TASKS_FROZEN) { | |
301 | + case CPU_UP_PREPARE: | |
302 | + BUG_ON(stopper->thread || stopper->enabled || | |
303 | + !list_empty(&stopper->works)); | |
304 | + p = kthread_create(cpu_stopper_thread, stopper, "stopper/%d", | |
305 | + cpu); | |
306 | + if (IS_ERR(p)) | |
307 | + return NOTIFY_BAD; | |
308 | + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | |
309 | + get_task_struct(p); | |
310 | + stopper->thread = p; | |
311 | + break; | |
312 | + | |
313 | + case CPU_ONLINE: | |
314 | + kthread_bind(stopper->thread, cpu); | |
315 | + /* strictly unnecessary, as first user will wake it */ | |
316 | + wake_up_process(stopper->thread); | |
317 | + /* mark enabled */ | |
318 | + spin_lock_irq(&stopper->lock); | |
319 | + stopper->enabled = true; | |
320 | + spin_unlock_irq(&stopper->lock); | |
321 | + break; | |
322 | + | |
323 | +#ifdef CONFIG_HOTPLUG_CPU | |
324 | + case CPU_UP_CANCELED: | |
325 | + case CPU_DEAD: | |
326 | + /* kill the stopper */ | |
327 | + kthread_stop(stopper->thread); | |
328 | + /* drain remaining works */ | |
329 | + spin_lock_irq(&stopper->lock); | |
330 | + list_for_each_entry(work, &stopper->works, list) | |
331 | + cpu_stop_signal_done(work->done, false); | |
332 | + stopper->enabled = false; | |
333 | + spin_unlock_irq(&stopper->lock); | |
334 | + /* release the stopper */ | |
335 | + put_task_struct(stopper->thread); | |
336 | + stopper->thread = NULL; | |
337 | + break; | |
338 | +#endif | |
339 | + } | |
340 | + | |
341 | + return NOTIFY_OK; | |
342 | +} | |
343 | + | |
344 | +/* | |
345 | + * Give it a higher priority so that cpu stopper is available to other | |
346 | + * cpu notifiers. It currently shares the same priority as sched | |
347 | + * migration_notifier. | |
348 | + */ | |
349 | +static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { | |
350 | + .notifier_call = cpu_stop_cpu_callback, | |
351 | + .priority = 10, | |
352 | +}; | |
353 | + | |
354 | +static int __init cpu_stop_init(void) | |
355 | +{ | |
356 | + void *bcpu = (void *)(long)smp_processor_id(); | |
357 | + unsigned int cpu; | |
358 | + int err; | |
359 | + | |
360 | + for_each_possible_cpu(cpu) { | |
361 | + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | |
362 | + | |
363 | + spin_lock_init(&stopper->lock); | |
364 | + INIT_LIST_HEAD(&stopper->works); | |
365 | + } | |
366 | + | |
367 | + /* start one for the boot cpu */ | |
368 | + err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, | |
369 | + bcpu); | |
370 | + BUG_ON(err == NOTIFY_BAD); | |
371 | + cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); | |
372 | + register_cpu_notifier(&cpu_stop_cpu_notifier); | |
373 | + | |
374 | + return 0; | |
375 | +} | |
376 | +early_initcall(cpu_stop_init); | |
15 | 377 | |
16 | 378 | /* This controls the threads on each CPU. */ |
17 | 379 | enum stopmachine_state { |