Commit 3a101d0548e925ab16ca6aaa8cf4f767d322ddb0

Authored by Tejun Heo
1 parent 50a323b730

sched: adjust when cpu_active and cpuset configurations are updated during cpu on/offlining

Currently, when a cpu goes down, cpu_active is cleared before
CPU_DOWN_PREPARE starts and the cpuset configuration is updated from a
default priority cpu notifier.  When a cpu is coming up, cpu_active is
set before CPU_ONLINE, but the cpuset configuration is again updated
from the same default priority cpu notifier.
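
For reference, this is the pre-patch ordering in _cpu_down() that the
above describes (visible as removed lines in the kernel/cpu.c hunk
below): cpu_active is cleared before any CPU_DOWN_PREPARE notifier gets
to run.

	cpu_hotplug_begin();
	set_cpu_active(cpu, false);
	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);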

For cpu notifiers, this presents an inconsistent state.  Threads which
a CPU_DOWN_PREPARE notifier expects to be bound to the CPU can be
migrated to other cpus because the cpu is no longer active.

Fix it by updating cpu_active in the highest priority cpu notifier and
the cpuset configuration in the second highest when a cpu is coming up.
The down path is updated similarly.  This guarantees that all other cpu
notifiers see a consistent cpu_active mask and cpuset configuration.
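
Concretely, the new ordering comes from the priorities and
registrations added by this patch (see the include/linux/cpu.h and
kernel/sched.c hunks below):

	/* in migration_init() */
	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);

	/* in sched_init_smp() */
	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);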

The cpuset_track_online_cpus() notifier is converted to
cpuset_update_active_cpus(), which just updates the configuration and
is now called from the cpuset_cpu_[in]active() notifiers registered
from sched_init_smp().  If cpuset is disabled,
cpuset_update_active_cpus() degenerates into partition_sched_domains(),
making a separate notifier for !CONFIG_CPUSETS unnecessary.
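
For !CONFIG_CPUSETS, the stub added to include/linux/cpuset.h is
simply:

	static inline void cpuset_update_active_cpus(void)
	{
		partition_sched_domains(1, NULL, NULL);
	}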

This problem is triggered by cmwq.  During CPU_DOWN_PREPARE, its
hotplug callback creates a kthread and kthread_bind()s it to the target
cpu, and the thread is expected to run on that cpu.
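
A minimal sketch of that pattern (hypothetical names, not cmwq's actual
callback) looks like the following; with the old ordering, the bound
kthread could be placed on another cpu because cpu_active was already
clear by the time CPU_DOWN_PREPARE ran.

	#include <linux/cpu.h>
	#include <linux/err.h>
	#include <linux/kthread.h>
	#include <linux/notifier.h>
	#include <linux/sched.h>

	static int example_thread_fn(void *data)
	{
		/* per-cpu work that must run on the cpu it was bound to */
		return 0;
	}

	static int example_cpu_callback(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
	{
		unsigned int cpu = (unsigned long)hcpu;
		struct task_struct *tsk;

		switch (action & ~CPU_TASKS_FROZEN) {
		case CPU_DOWN_PREPARE:
			/* create a helper and pin it to the cpu going down */
			tsk = kthread_create(example_thread_fn, hcpu,
					     "example/%u", cpu);
			if (IS_ERR(tsk))
				return notifier_from_errno(PTR_ERR(tsk));
			kthread_bind(tsk, cpu);
			wake_up_process(tsk);	/* expected to run on @cpu */
			break;
		}
		return NOTIFY_OK;
	}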

* Ingo's testing discovered that the __cpuinit/__cpuexit markups were
  incorrect.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Menage <menage@google.com>

Showing 5 changed files with 74 additions and 42 deletions

include/linux/cpu.h
@@ -52,6 +52,22 @@
  * CPU notifier priorities.
  */
 enum {
+	/*
+	 * SCHED_ACTIVE marks a cpu which is coming up active during
+	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first
+	 * notifier.  CPUSET_ACTIVE adjusts cpuset according to
+	 * cpu_active mask right after SCHED_ACTIVE.  During
+	 * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are
+	 * ordered in the similar way.
+	 *
+	 * This ordering guarantees consistent cpu_active mask and
+	 * migration behavior to all cpu notifiers.
+	 */
+	CPU_PRI_SCHED_ACTIVE	= INT_MAX,
+	CPU_PRI_CPUSET_ACTIVE	= INT_MAX - 1,
+	CPU_PRI_SCHED_INACTIVE	= INT_MIN + 1,
+	CPU_PRI_CPUSET_INACTIVE	= INT_MIN,
+
 	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
 	CPU_PRI_MIGRATION	= 10,
include/linux/cpuset.h
@@ -20,6 +20,7 @@
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
+extern void cpuset_update_active_cpus(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -131,6 +132,11 @@
 
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
+
+static inline void cpuset_update_active_cpus(void)
+{
+	partition_sched_domains(1, NULL, NULL);
+}
 
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
kernel/cpu.c
@@ -235,11 +235,8 @@
 		return -EINVAL;
 
 	cpu_hotplug_begin();
-	set_cpu_active(cpu, false);
 	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (err) {
-		set_cpu_active(cpu, true);
-
 		nr_calls--;
 		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
 		printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@
 
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
-		set_cpu_active(cpu, true);
 		/* CPU didn't die: tell everyone. Can't complain. */
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
 
@@ -320,8 +316,6 @@
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
-
-	set_cpu_active(cpu, true);
 
 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
kernel/cpuset.c
@@ -2113,31 +2113,17 @@
  * but making no active use of cpusets.
  *
  * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus(). Needs to call cgroup_lock()
 * before calling generate_sched_domains().
 */
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
-				    unsigned long phase, void *unused_cpu)
+void __cpuexit cpuset_update_active_cpus(void)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
 	int ndoms;
 
-	switch (phase) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
 	cgroup_lock();
 	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-
-	return NOTIFY_OK;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
-	hotcpu_notifier(cpuset_track_online_cpus, 0);
 	hotplug_memory_notifier(cpuset_track_online_nodes, 10);
 
 	cpuset_wq = create_singlethread_workqueue("cpuset");
kernel/sched.c
@@ -5804,17 +5804,46 @@
 	.priority = CPU_PRI_MIGRATION,
 };
 
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+				      unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		set_cpu_active((long)hcpu, true);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		set_cpu_active((long)hcpu, false);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
 static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
 
-	/* Start one for the boot CPU: */
+	/* Initialize migration for the boot CPU */
 	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
+	/* Register cpu active notifiers */
+	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
 	return 0;
 }
 early_initcall(migration_init);
@@ -7273,29 +7302,35 @@
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-#ifndef CONFIG_CPUSETS
 /*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
 */
-static int update_sched_domains(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
+static int __cpuexit cpuset_cpu_active(struct notifier_block *nfb,
+				       unsigned long action, void *hcpu)
 {
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		partition_sched_domains(1, NULL, NULL);
+		cpuset_update_active_cpus();
 		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
 
+static int __cpuexit cpuset_cpu_inactive(struct notifier_block *nfb,
+					 unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		cpuset_update_active_cpus();
+		return NOTIFY_OK;
 	default:
 		return NOTIFY_DONE;
 	}
 }
-#endif
 
 static int update_runtime(struct notifier_block *nfb,
 			  unsigned long action, void *hcpu)
@@ -7341,10 +7376,8 @@
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
 
-#ifndef CONFIG_CPUSETS
-	/* XXX: Theoretical race here - CPU may be hotplugged now */
-	hotcpu_notifier(update_sched_domains, 0);
-#endif
+	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 
 	/* RT runtime code needs to handle some hotplug events */
 	hotcpu_notifier(update_runtime, 0);