Commit 1142d810298e694754498dbb4983fcb6cb7fd884

Authored by Tejun Heo
1 parent 99bd5e2f24

cpu_stop: implement stop_cpu[s]()

Implement a simplistic per-cpu maximum priority cpu monopolization
mechanism.  A non-sleeping callback can be scheduled to run on one or
multiple cpus with maximum priority, monopolizing those cpus.  This is
primarily to replace and unify RT workqueue usage in stop_machine and
the scheduler migration_thread, which currently serves multiple
purposes.

Four functions are provided - stop_one_cpu(), stop_one_cpu_nowait(),
stop_cpus() and try_stop_cpus().
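
For illustration only (not part of this commit), a minimal usage sketch
of the synchronous variant might look like the following; the callback
and its argument are hypothetical:

    #include <linux/stop_machine.h>

    /* hypothetical non-sleeping callback; runs with preemption disabled */
    static int bump_counter(void *arg)
    {
            int *counter = arg;

            (*counter)++;
            return 0;
    }

    static int example(void)
    {
            int counter = 0;

            /* returns bump_counter()'s result, or -ENOENT if cpu 1 is offline */
            return stop_one_cpu(1, bump_counter, &counter);
    }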

This is to allow clean sharing of resources among stop_cpu and all the
migration thread users.  One stopper thread is created per cpu and is
currently named "stopper/CPU".  This will eventually replace the
migration thread and take on its name.

* This facility was originally named cpuhog and lived in separate
  files, but Peter Zijlstra nacked the name, so it was renamed to
  cpu_stop and moved into stop_machine.c.

* Better reporting of preemption leak as per Peter's suggestion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>

Showing 2 changed files with 402 additions and 9 deletions

include/linux/stop_machine.h
1 1 #ifndef _LINUX_STOP_MACHINE
2 2 #define _LINUX_STOP_MACHINE
3   -/* "Bogolock": stop the entire machine, disable interrupts. This is a
4   - very heavy lock, which is equivalent to grabbing every spinlock
5   - (and more). So the "read" side to such a lock is anything which
6   - disables preeempt. */
  3 +
7 4 #include <linux/cpu.h>
8 5 #include <linux/cpumask.h>
  6 +#include <linux/list.h>
9 7 #include <asm/system.h>
10 8  
11 9 #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
  10 +
  11 +/*
  12 + * stop_cpu[s]() is a simplistic per-cpu maximum priority cpu
  13 + * monopolization mechanism. The caller can specify a non-sleeping
  14 + * function to be executed on a single or multiple cpus preempting all
  15 + * other processes and monopolizing those cpus until it finishes.
  16 + *
  17 + * Resources for this mechanism are preallocated when a cpu is brought
  18 + * up and requests are guaranteed to be served as long as the target
  19 + * cpus are online.
  20 + */
  21 +
  22 +typedef int (*cpu_stop_fn_t)(void *arg);
  23 +
  24 +struct cpu_stop_work {
  25 + struct list_head list; /* cpu_stopper->works */
  26 + cpu_stop_fn_t fn;
  27 + void *arg;
  28 + struct cpu_stop_done *done;
  29 +};
  30 +
  31 +int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
  32 +void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
  33 + struct cpu_stop_work *work_buf);
  34 +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
  35 +int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
  36 +
  37 +/*
  38 + * stop_machine "Bogolock": stop the entire machine, disable
  39 + * interrupts. This is a very heavy lock, which is equivalent to
  40 + * grabbing every spinlock (and more). So the "read" side to such a
  41 + * lock is anything which disables preempt.
  42 + */
12 43  
13 44 /**
14 45 * stop_machine: freeze the machine on all CPUs and run this function
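
(Illustrative sketch, not part of the commit: a caller of the
fire-and-forget variant declared above owns the cpu_stop_work buffer and
must leave it untouched until the stopper thread picks it up; the names
below are hypothetical.)

    #include <linux/stop_machine.h>

    static struct cpu_stop_work kick_work;  /* must outlive the request */

    static int kick_fn(void *arg)
    {
            /* non-sleeping work, runs with preemption disabled */
            return 0;
    }

    static void kick_cpu(unsigned int cpu)
    {
            /* queues the work and returns immediately; no one waits for completion */
            stop_one_cpu_nowait(cpu, kick_fn, NULL, &kick_work);
    }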
kernel/stop_machine.c
1   -/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
2   - * GPL v2 and any later version.
  1 +/*
  2 + * kernel/stop_machine.c
  3 + *
  4 + * Copyright (C) 2008, 2005 IBM Corporation.
  5 + * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
  6 + * Copyright (C) 2010 SUSE Linux Products GmbH
  7 + * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
  8 + *
  9 + * This file is released under the GPLv2 and any later version.
3 10 */
  11 +#include <linux/completion.h>
4 12 #include <linux/cpu.h>
5   -#include <linux/err.h>
  13 +#include <linux/init.h>
6 14 #include <linux/kthread.h>
7 15 #include <linux/module.h>
  16 +#include <linux/percpu.h>
8 17 #include <linux/sched.h>
9 18 #include <linux/stop_machine.h>
10   -#include <linux/syscalls.h>
11 19 #include <linux/interrupt.h>
  20 +#include <linux/kallsyms.h>
12 21  
13 22 #include <asm/atomic.h>
14   -#include <asm/uaccess.h>
  23 +
  24 +/*
  25 + * Structure to determine completion condition and record errors. May
  26 + * be shared by works on different cpus.
  27 + */
  28 +struct cpu_stop_done {
  29 + atomic_t nr_todo; /* nr left to execute */
  30 + bool executed; /* actually executed? */
  31 + int ret; /* collected return value */
  32 + struct completion completion; /* fired if nr_todo reaches 0 */
  33 +};
  34 +
  35 +/* the actual stopper, one per every possible cpu, enabled on online cpus */
  36 +struct cpu_stopper {
  37 + spinlock_t lock;
  38 + struct list_head works; /* list of pending works */
  39 + struct task_struct *thread; /* stopper thread */
  40 + bool enabled; /* is this stopper enabled? */
  41 +};
  42 +
  43 +static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
  44 +
  45 +static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
  46 +{
  47 + memset(done, 0, sizeof(*done));
  48 + atomic_set(&done->nr_todo, nr_todo);
  49 + init_completion(&done->completion);
  50 +}
  51 +
  52 +/* signal completion unless @done is NULL */
  53 +static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
  54 +{
  55 + if (done) {
  56 + if (executed)
  57 + done->executed = true;
  58 + if (atomic_dec_and_test(&done->nr_todo))
  59 + complete(&done->completion);
  60 + }
  61 +}
  62 +
  63 +/* queue @work to @stopper. if offline, @work is completed immediately */
  64 +static void cpu_stop_queue_work(struct cpu_stopper *stopper,
  65 + struct cpu_stop_work *work)
  66 +{
  67 + unsigned long flags;
  68 +
  69 + spin_lock_irqsave(&stopper->lock, flags);
  70 +
  71 + if (stopper->enabled) {
  72 + list_add_tail(&work->list, &stopper->works);
  73 + wake_up_process(stopper->thread);
  74 + } else
  75 + cpu_stop_signal_done(work->done, false);
  76 +
  77 + spin_unlock_irqrestore(&stopper->lock, flags);
  78 +}
  79 +
  80 +/**
  81 + * stop_one_cpu - stop a cpu
  82 + * @cpu: cpu to stop
  83 + * @fn: function to execute
  84 + * @arg: argument to @fn
  85 + *
  86 + * Execute @fn(@arg) on @cpu. @fn is run in a process context with
  87 + * the highest priority preempting any task on the cpu and
  88 + * monopolizing it. This function returns after the execution is
  89 + * complete.
  90 + *
  91 + * This function doesn't guarantee @cpu stays online till @fn
  92 + * completes. If @cpu goes down in the middle, execution may happen
  93 + * partially or fully on different cpus. @fn should either be ready
  94 + * for that or the caller should ensure that @cpu stays online until
  95 + * this function completes.
  96 + *
  97 + * CONTEXT:
  98 + * Might sleep.
  99 + *
  100 + * RETURNS:
  101 + * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
  102 + * otherwise, the return value of @fn.
  103 + */
  104 +int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
  105 +{
  106 + struct cpu_stop_done done;
  107 + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
  108 +
  109 + cpu_stop_init_done(&done, 1);
  110 + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
  111 + wait_for_completion(&done.completion);
  112 + return done.executed ? done.ret : -ENOENT;
  113 +}
  114 +
  115 +/**
  116 + * stop_one_cpu_nowait - stop a cpu but don't wait for completion
  117 + * @cpu: cpu to stop
  118 + * @fn: function to execute
  119 + * @arg: argument to @fn
  120 + *
  121 + * Similar to stop_one_cpu() but doesn't wait for completion. The
  122 + * caller is responsible for ensuring @work_buf is currently unused
  123 + * and will remain untouched until stopper starts executing @fn.
  124 + *
  125 + * CONTEXT:
  126 + * Don't care.
  127 + */
  128 +void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
  129 + struct cpu_stop_work *work_buf)
  130 +{
  131 + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
  132 + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
  133 +}
  134 +
  135 +/* static data for stop_cpus */
  136 +static DEFINE_MUTEX(stop_cpus_mutex);
  137 +static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
  138 +
  139 +int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
  140 +{
  141 + struct cpu_stop_work *work;
  142 + struct cpu_stop_done done;
  143 + unsigned int cpu;
  144 +
  145 + /* initialize works and done */
  146 + for_each_cpu(cpu, cpumask) {
  147 + work = &per_cpu(stop_cpus_work, cpu);
  148 + work->fn = fn;
  149 + work->arg = arg;
  150 + work->done = &done;
  151 + }
  152 + cpu_stop_init_done(&done, cpumask_weight(cpumask));
  153 +
  154 + /*
  155 + * Disable preemption while queueing to avoid getting
  156 + * preempted by a stopper which might wait for other stoppers
  157 + * to enter @fn which can lead to deadlock.
  158 + */
  159 + preempt_disable();
  160 + for_each_cpu(cpu, cpumask)
  161 + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
  162 + &per_cpu(stop_cpus_work, cpu));
  163 + preempt_enable();
  164 +
  165 + wait_for_completion(&done.completion);
  166 + return done.executed ? done.ret : -ENOENT;
  167 +}
  168 +
  169 +/**
  170 + * stop_cpus - stop multiple cpus
  171 + * @cpumask: cpus to stop
  172 + * @fn: function to execute
  173 + * @arg: argument to @fn
  174 + *
  175 + * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
  176 + * @fn is run in a process context with the highest priority
  177 + * preempting any task on the cpu and monopolizing it. This function
  178 + * returns after all executions are complete.
  179 + *
  180 + * This function doesn't guarantee the cpus in @cpumask stay online
  181 + * till @fn completes. If some cpus go down in the middle, execution
  182 + * on the cpu may happen partially or fully on different cpus. @fn
  183 + * should either be ready for that or the caller should ensure that
  184 + * the cpus stay online until this function completes.
  185 + *
  186 + * All stop_cpus() calls are serialized making it safe for @fn to wait
  187 + * for all cpus to start executing it.
  188 + *
  189 + * CONTEXT:
  190 + * Might sleep.
  191 + *
  192 + * RETURNS:
  193 + * -ENOENT if @fn(@arg) was not executed at all because all cpus in
  194 + * @cpumask were offline; otherwise, 0 if all executions of @fn
  195 + * returned 0, any non zero return value if any returned non zero.
  196 + */
  197 +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
  198 +{
  199 + int ret;
  200 +
  201 + /* static works are used, process one request at a time */
  202 + mutex_lock(&stop_cpus_mutex);
  203 + ret = __stop_cpus(cpumask, fn, arg);
  204 + mutex_unlock(&stop_cpus_mutex);
  205 + return ret;
  206 +}
  207 +
  208 +/**
  209 + * try_stop_cpus - try to stop multiple cpus
  210 + * @cpumask: cpus to stop
  211 + * @fn: function to execute
  212 + * @arg: argument to @fn
  213 + *
  214 + * Identical to stop_cpus() except that it fails with -EAGAIN if
  215 + * someone else is already using the facility.
  216 + *
  217 + * CONTEXT:
  218 + * Might sleep.
  219 + *
  220 + * RETURNS:
  221 + * -EAGAIN if someone else is already stopping cpus, -ENOENT if
  222 + * @fn(@arg) was not executed at all because all cpus in @cpumask were
  223 + * offline; otherwise, 0 if all executions of @fn returned 0, any non
  224 + * zero return value if any returned non zero.
  225 + */
  226 +int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
  227 +{
  228 + int ret;
  229 +
  230 + /* static works are used, process one request at a time */
  231 + if (!mutex_trylock(&stop_cpus_mutex))
  232 + return -EAGAIN;
  233 + ret = __stop_cpus(cpumask, fn, arg);
  234 + mutex_unlock(&stop_cpus_mutex);
  235 + return ret;
  236 +}
  237 +
  238 +static int cpu_stopper_thread(void *data)
  239 +{
  240 + struct cpu_stopper *stopper = data;
  241 + struct cpu_stop_work *work;
  242 + int ret;
  243 +
  244 +repeat:
  245 + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
  246 +
  247 + if (kthread_should_stop()) {
  248 + __set_current_state(TASK_RUNNING);
  249 + return 0;
  250 + }
  251 +
  252 + work = NULL;
  253 + spin_lock_irq(&stopper->lock);
  254 + if (!list_empty(&stopper->works)) {
  255 + work = list_first_entry(&stopper->works,
  256 + struct cpu_stop_work, list);
  257 + list_del_init(&work->list);
  258 + }
  259 + spin_unlock_irq(&stopper->lock);
  260 +
  261 + if (work) {
  262 + cpu_stop_fn_t fn = work->fn;
  263 + void *arg = work->arg;
  264 + struct cpu_stop_done *done = work->done;
  265 + char ksym_buf[KSYM_NAME_LEN];
  266 +
  267 + __set_current_state(TASK_RUNNING);
  268 +
  269 + /* cpu stop callbacks are not allowed to sleep */
  270 + preempt_disable();
  271 +
  272 + ret = fn(arg);
  273 + if (ret)
  274 + done->ret = ret;
  275 +
  276 + /* restore preemption and check it's still balanced */
  277 + preempt_enable();
  278 + WARN_ONCE(preempt_count(),
  279 + "cpu_stop: %s(%p) leaked preempt count\n",
  280 + kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
  281 + ksym_buf), arg);
  282 +
  283 + cpu_stop_signal_done(done, true);
  284 + } else
  285 + schedule();
  286 +
  287 + goto repeat;
  288 +}
  289 +
  290 +/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
  291 +static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
  292 + unsigned long action, void *hcpu)
  293 +{
  294 + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
  295 + unsigned int cpu = (unsigned long)hcpu;
  296 + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
  297 + struct cpu_stop_work *work;
  298 + struct task_struct *p;
  299 +
  300 + switch (action & ~CPU_TASKS_FROZEN) {
  301 + case CPU_UP_PREPARE:
  302 + BUG_ON(stopper->thread || stopper->enabled ||
  303 + !list_empty(&stopper->works));
  304 + p = kthread_create(cpu_stopper_thread, stopper, "stopper/%d",
  305 + cpu);
  306 + if (IS_ERR(p))
  307 + return NOTIFY_BAD;
  308 + sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
  309 + get_task_struct(p);
  310 + stopper->thread = p;
  311 + break;
  312 +
  313 + case CPU_ONLINE:
  314 + kthread_bind(stopper->thread, cpu);
  315 + /* strictly unnecessary, as first user will wake it */
  316 + wake_up_process(stopper->thread);
  317 + /* mark enabled */
  318 + spin_lock_irq(&stopper->lock);
  319 + stopper->enabled = true;
  320 + spin_unlock_irq(&stopper->lock);
  321 + break;
  322 +
  323 +#ifdef CONFIG_HOTPLUG_CPU
  324 + case CPU_UP_CANCELED:
  325 + case CPU_DEAD:
  326 + /* kill the stopper */
  327 + kthread_stop(stopper->thread);
  328 + /* drain remaining works */
  329 + spin_lock_irq(&stopper->lock);
  330 + list_for_each_entry(work, &stopper->works, list)
  331 + cpu_stop_signal_done(work->done, false);
  332 + stopper->enabled = false;
  333 + spin_unlock_irq(&stopper->lock);
  334 + /* release the stopper */
  335 + put_task_struct(stopper->thread);
  336 + stopper->thread = NULL;
  337 + break;
  338 +#endif
  339 + }
  340 +
  341 + return NOTIFY_OK;
  342 +}
  343 +
  344 +/*
  345 + * Give it a higher priority so that cpu stopper is available to other
  346 + * cpu notifiers. It currently shares the same priority as sched
  347 + * migration_notifier.
  348 + */
  349 +static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
  350 + .notifier_call = cpu_stop_cpu_callback,
  351 + .priority = 10,
  352 +};
  353 +
  354 +static int __init cpu_stop_init(void)
  355 +{
  356 + void *bcpu = (void *)(long)smp_processor_id();
  357 + unsigned int cpu;
  358 + int err;
  359 +
  360 + for_each_possible_cpu(cpu) {
  361 + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
  362 +
  363 + spin_lock_init(&stopper->lock);
  364 + INIT_LIST_HEAD(&stopper->works);
  365 + }
  366 +
  367 + /* start one for the boot cpu */
  368 + err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
  369 + bcpu);
  370 + BUG_ON(err == NOTIFY_BAD);
  371 + cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
  372 + register_cpu_notifier(&cpu_stop_cpu_notifier);
  373 +
  374 + return 0;
  375 +}
  376 +early_initcall(cpu_stop_init);
15 377  
16 378 /* This controls the threads on each CPU. */
17 379 enum stopmachine_state {