Commit 3a101d0548e925ab16ca6aaa8cf4f767d322ddb0

Authored by Tejun Heo
1 parent 50a323b730

sched: adjust when cpu_active and cpuset configurations are updated during cpu on/offlining

Currently, when a cpu goes down, cpu_active is cleared before
CPU_DOWN_PREPARE starts and the cpuset configuration is updated from a
default priority cpu notifier.  When a cpu is coming up, cpu_active is
set before CPU_ONLINE, but the cpuset configuration is again updated
from the same default priority cpu notifier.
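
For reference, this is the pre-patch ordering in _cpu_down() that the
above describes (visible as removed lines in the kernel/cpu.c hunk
below): cpu_active is cleared before any CPU_DOWN_PREPARE notifier gets
to run.

	cpu_hotplug_begin();
	set_cpu_active(cpu, false);
	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);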

For cpu notifiers, this presents an inconsistent state.  Threads which
a CPU_DOWN_PREPARE notifier expects to be bound to the CPU can be
migrated to other cpus because the cpu is no longer active.

Fix it by updating cpu_active in the highest priority cpu notifier and
the cpuset configuration in the second highest when a cpu is coming up.
The down path is updated similarly.  This guarantees that all other cpu
notifiers see a consistent cpu_active mask and cpuset configuration.
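
Concretely, the new ordering comes from the priorities and
registrations added by this patch (see the include/linux/cpu.h and
kernel/sched.c hunks below):

	/* in migration_init() */
	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);

	/* in sched_init_smp() */
	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);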

The cpuset_track_online_cpus() notifier is converted to
cpuset_update_active_cpus(), which just updates the configuration and
is now called from the cpuset_cpu_[in]active() notifiers registered
from sched_init_smp().  If cpuset is disabled,
cpuset_update_active_cpus() degenerates into partition_sched_domains(),
making a separate notifier for !CONFIG_CPUSETS unnecessary.
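
For !CONFIG_CPUSETS, the stub added to include/linux/cpuset.h is
simply:

	static inline void cpuset_update_active_cpus(void)
	{
		partition_sched_domains(1, NULL, NULL);
	}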

This problem is triggered by cmwq.  During CPU_DOWN_PREPARE, its
hotplug callback creates a kthread and kthread_bind()s it to the target
cpu, and the thread is expected to run on that cpu.
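
A minimal sketch of that pattern (hypothetical names, not cmwq's actual
callback) looks like the following; with the old ordering, the bound
kthread could be placed on another cpu because cpu_active was already
clear by the time CPU_DOWN_PREPARE ran.

	#include <linux/cpu.h>
	#include <linux/err.h>
	#include <linux/kthread.h>
	#include <linux/notifier.h>
	#include <linux/sched.h>

	static int example_thread_fn(void *data)
	{
		/* per-cpu work that must run on the cpu it was bound to */
		return 0;
	}

	static int example_cpu_callback(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
	{
		unsigned int cpu = (unsigned long)hcpu;
		struct task_struct *tsk;

		switch (action & ~CPU_TASKS_FROZEN) {
		case CPU_DOWN_PREPARE:
			/* create a helper and pin it to the cpu going down */
			tsk = kthread_create(example_thread_fn, hcpu,
					     "example/%u", cpu);
			if (IS_ERR(tsk))
				return notifier_from_errno(PTR_ERR(tsk));
			kthread_bind(tsk, cpu);
			wake_up_process(tsk);	/* expected to run on @cpu */
			break;
		}
		return NOTIFY_OK;
	}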

* Ingo's testing discovered that the __cpuinit/__cpuexit markups were
  incorrect.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Menage <menage@google.com>

Showing 5 changed files with 74 additions and 42 deletions

include/linux/cpu.h
@@ -52,6 +52,22 @@
  * CPU notifier priorities.
  */
 enum {
+	/*
+	 * SCHED_ACTIVE marks a cpu which is coming up active during
+	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first
+	 * notifier.  CPUSET_ACTIVE adjusts cpuset according to
+	 * cpu_active mask right after SCHED_ACTIVE.  During
+	 * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are
+	 * ordered in the similar way.
+	 *
+	 * This ordering guarantees consistent cpu_active mask and
+	 * migration behavior to all cpu notifiers.
+	 */
+	CPU_PRI_SCHED_ACTIVE	= INT_MAX,
+	CPU_PRI_CPUSET_ACTIVE	= INT_MAX - 1,
+	CPU_PRI_SCHED_INACTIVE	= INT_MIN + 1,
+	CPU_PRI_CPUSET_INACTIVE	= INT_MIN,
+
 	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
 	CPU_PRI_MIGRATION	= 10,
include/linux/cpuset.h
@@ -20,6 +20,7 @@
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
+extern void cpuset_update_active_cpus(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -131,6 +132,11 @@
 
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
+
+static inline void cpuset_update_active_cpus(void)
+{
+	partition_sched_domains(1, NULL, NULL);
+}
 
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
kernel/cpu.c
@@ -235,11 +235,8 @@
 		return -EINVAL;
 
 	cpu_hotplug_begin();
-	set_cpu_active(cpu, false);
 	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (err) {
-		set_cpu_active(cpu, true);
-
 		nr_calls--;
 		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
 		printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@
 
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
-		set_cpu_active(cpu, true);
 		/* CPU didn't die: tell everyone. Can't complain. */
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
 
@@ -320,8 +316,6 @@
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
-
-	set_cpu_active(cpu, true);
 
 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
kernel/cpuset.c
@@ -2113,31 +2113,17 @@
  * but making no active use of cpusets.
  *
  * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus(). Needs to call cgroup_lock()
 * before calling generate_sched_domains().
 */
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
-				    unsigned long phase, void *unused_cpu)
+void __cpuexit cpuset_update_active_cpus(void)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
 	int ndoms;
 
-	switch (phase) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
 	cgroup_lock();
 	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-
-	return NOTIFY_OK;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
-	hotcpu_notifier(cpuset_track_online_cpus, 0);
 	hotplug_memory_notifier(cpuset_track_online_nodes, 10);
 
 	cpuset_wq = create_singlethread_workqueue("cpuset");
kernel/sched.c
@@ -5804,17 +5804,46 @@
 	.priority = CPU_PRI_MIGRATION,
 };
 
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+				      unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		set_cpu_active((long)hcpu, true);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		set_cpu_active((long)hcpu, false);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
 static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
 
-	/* Start one for the boot CPU: */
+	/* Initialize migration for the boot CPU */
 	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
+	/* Register cpu active notifiers */
+	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
 	return 0;
 }
 early_initcall(migration_init);
@@ -7273,29 +7302,35 @@
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-#ifndef CONFIG_CPUSETS
 /*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
 */
-static int update_sched_domains(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
+static int __cpuexit cpuset_cpu_active(struct notifier_block *nfb,
+				       unsigned long action, void *hcpu)
 {
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		partition_sched_domains(1, NULL, NULL);
+		cpuset_update_active_cpus();
 		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
 
+static int __cpuexit cpuset_cpu_inactive(struct notifier_block *nfb,
+					 unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		cpuset_update_active_cpus();
+		return NOTIFY_OK;
 	default:
 		return NOTIFY_DONE;
 	}
 }
-#endif
 
 static int update_runtime(struct notifier_block *nfb,
 			  unsigned long action, void *hcpu)
@@ -7341,10 +7376,8 @@
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
 
-#ifndef CONFIG_CPUSETS
-	/* XXX: Theoretical race here - CPU may be hotplugged now */
-	hotcpu_notifier(update_sched_domains, 0);
-#endif
+	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 
 	/* RT runtime code needs to handle some hotplug events */
 	hotcpu_notifier(update_runtime, 0);