Commit 3a5a6d0c2b0391e159fa5bf1dddb9bf1f35178a0

Authored by Tejun Heo
1 parent deb7aa308e

cpuset: don't nest cgroup_mutex inside get_online_cpus()

CPU / memory hotplug path currently grabs cgroup_mutex from hotplug
event notifications.  We want to separate cpuset locking from cgroup
core and make cgroup_mutex outer to hotplug synchronization so that,
among other things, mechanisms which depend on get_online_cpus() can
be used from cgroup callbacks.  In general, we want to keep
cgroup_mutex the outermost lock to minimize locking interactions among
different controllers.

Convert cpuset_handle_hotplug() to cpuset_hotplug_workfn() and
schedule it from the hotplug notifications.  As the function can
already handle multiple mixed events without any input, converting it
to a work function is mostly trivial; however, one complication is
that cpuset_update_active_cpus() needs to update sched domains
synchronously to reflect an offlined CPU; otherwise, the scheduler can
get confused and put tasks on the dead CPU.  This is worked around by
falling back to the default single sched domain synchronously before
scheduling the actual hotplug work.  The sched domains thus get rebuilt
twice per CPU hotplug event, but the operation isn't that heavy, and
much of the second rebuild is a noop on systems with a single sched
domain, which is the common case.
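
Condensed from the diff below, the new notification path declares a
work item and bounces the heavy lifting to it, doing only the fallback
sched domain rebuild synchronously (comments paraphrased):

  /* CPU / memory hotplug is handled asynchronously via this work item */
  static void cpuset_hotplug_workfn(struct work_struct *work);
  static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

  void cpuset_update_active_cpus(bool cpu_online)
  {
          /* rebuild the fallback single sched domain synchronously so
           * the scheduler never keeps tasks on a dead CPU ... */
          partition_sched_domains(1, NULL, NULL);
          /* ... and defer the full cpuset update to process context */
          schedule_work(&cpuset_hotplug_work);
  }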

This decouples cpuset hotplug handling from the notification callbacks,
so there can be an arbitrary delay between the actual event and the
resulting cpuset updates.  The scheduler and mm can handle that fine,
but moving tasks out of an empty cpuset may race against a write to the
cpuset that restores its execution resources, which can lead to
confusing behavior.  Flush the hotplug work item from
cpuset_write_resmask() to avoid such confusion.
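
On the write side, cpuset_write_resmask() simply waits for any pending
hotplug work before taking the cgroup lock (again condensed from the
diff below):

  /* wait for in-flight hotplug handling before touching the cpuset */
  flush_work(&cpuset_hotplug_work);

  if (!cgroup_lock_live_group(cgrp))
          return -ENODEV;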

v2: Added synchronous sched domain rebuilding using the fallback sched
    domain.  This fixes various issues caused by the confused scheduler
    putting tasks on a dead CPU, including the one reported by Li
    Zefan.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>

Showing 1 changed file with 35 additions and 4 deletions

@@ -260,6 +260,13 @@
 static DEFINE_SPINLOCK(cpuset_buffer_lock);
 
 /*
+ * CPU / memory hotplug is handled asynchronously.
+ */
+static void cpuset_hotplug_workfn(struct work_struct *work);
+
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+
+/*
  * This is ugly, but preserves the userspace API for existing cpuset
  * users. If someone tries to mount the "cpuset" filesystem, we
  * silently switch it to mount "cgroup" instead
@@ -1565,6 +1572,19 @@
 	struct cpuset *cs = cgroup_cs(cgrp);
 	struct cpuset *trialcs;
 
+	/*
+	 * CPU or memory hotunplug may leave @cs w/o any execution
+	 * resources, in which case the hotplug code asynchronously updates
+	 * configuration and transfers all tasks to the nearest ancestor
+	 * which can execute.
+	 *
+	 * As writes to "cpus" or "mems" may restore @cs's execution
+	 * resources, wait for the previously scheduled operations before
+	 * proceeding, so that we don't end up keep removing tasks added
+	 * after execution capability is restored.
+	 */
+	flush_work(&cpuset_hotplug_work);
+
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 
@@ -2095,7 +2115,7 @@
 }
 
 /**
- * cpuset_handle_hotplug - handle CPU/memory hot[un]plug
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
  *
  * This function is called after either CPU or memory configuration has
  * changed and updates cpuset accordingly. The top_cpuset is always
@@ -2110,7 +2130,7 @@
  * Note that CPU offlining during suspend is ignored. We don't modify
  * cpusets across suspend/resume cycles at all.
  */
-static void cpuset_handle_hotplug(void)
+static void cpuset_hotplug_workfn(struct work_struct *work)
 {
 	static cpumask_t new_cpus, tmp_cpus;
 	static nodemask_t new_mems, tmp_mems;
@@ -2177,7 +2197,18 @@
 
 void cpuset_update_active_cpus(bool cpu_online)
 {
-	cpuset_handle_hotplug();
+	/*
+	 * We're inside cpu hotplug critical region which usually nests
+	 * inside cgroup synchronization. Bounce actual hotplug processing
+	 * to a work item to avoid reverse locking order.
+	 *
+	 * We still need to do partition_sched_domains() synchronously;
+	 * otherwise, the scheduler will get confused and put tasks to the
+	 * dead CPU. Fall back to the default single domain.
+	 * cpuset_hotplug_workfn() will rebuild it as necessary.
+	 */
+	partition_sched_domains(1, NULL, NULL);
+	schedule_work(&cpuset_hotplug_work);
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -2189,7 +2220,7 @@
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				     unsigned long action, void *arg)
 {
-	cpuset_handle_hotplug();
+	schedule_work(&cpuset_hotplug_work);
 	return NOTIFY_OK;
 }
 #endif