Commit 9f50fad65b87a8776ae989ca059ad6c17925dfc3

Authored by Michal Hocko
Committed by Linus Torvalds
1 parent 47e180d652

Revert "memcg: get rid of percpu_charge_mutex lock"

This reverts commit 8521fc50d433507a7cdc96bec280f9e5888a54cc.

The patch incorrectly assumes that using atomic FLUSHING_CACHED_CHARGE
bit operations is sufficient, but it is not.  Johannes Weiner has
reported a crash during parallel memory cgroup removal:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
  IP: [<ffffffff81083b70>] css_is_ancestor+0x20/0x70
  Oops: 0000 [#1] PREEMPT SMP
  Pid: 19677, comm: rmdir Tainted: G        W   3.0.0-mm1-00188-gf38d32b #35 ECS MCP61M-M3/MCP61M-M3
  RIP: 0010:[<ffffffff81083b70>]  css_is_ancestor+0x20/0x70
  RSP: 0018:ffff880077b09c88  EFLAGS: 00010202
  Process rmdir (pid: 19677, threadinfo ffff880077b08000, task ffff8800781bb310)
  Call Trace:
   [<ffffffff810feba3>] mem_cgroup_same_or_subtree+0x33/0x40
   [<ffffffff810feccf>] drain_all_stock+0x11f/0x170
   [<ffffffff81103211>] mem_cgroup_force_empty+0x231/0x6d0
   [<ffffffff811036c4>] mem_cgroup_pre_destroy+0x14/0x20
   [<ffffffff81080559>] cgroup_rmdir+0xb9/0x500
   [<ffffffff81114d26>] vfs_rmdir+0x86/0xe0
   [<ffffffff81114e7b>] do_rmdir+0xfb/0x110
   [<ffffffff81114ea6>] sys_rmdir+0x16/0x20
   [<ffffffff8154d76b>] system_call_fastpath+0x16/0x1b

We are crashing because we try to dereference the cached memcg while
checking whether we should wait for draining on the cache, but the
cache has already been cleaned up.
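
For reference, the check in question, as removed by this revert (the
exact lines are visible in the diff below), dereferences stock->cached
with nothing pinning it:

  for_each_online_cpu(cpu) {
          struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
          /* stock->cached is read here without any serialization
           * against the drainer that clears it and drops its
           * reference, so this can chase a stale pointer. */
          if (mem_cgroup_same_or_subtree(root_mem, stock->cached) &&
              test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
                  flush_work(&stock->work);
  }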

There is also a theoretical chance that the cached memcg gets freed
between the time we test for the FLUSHING_CACHED_CHARGE bit and the
time we dereference it in mem_cgroup_same_or_subtree:

        CPU0                    CPU1                         CPU2
  mem=stock->cached
  stock->cached=NULL
                              clear_bit
                                                        test_and_set_bit
  test_bit()                    ...
  <preempted>             mem_cgroup_destroy
  use after free

The percpu_charge_mutex protected against this race because sync
draining was exclusive.
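
The revert restores that exclusivity; a minimal sketch of the restored
synchronous path (the full change is in the diff below):

  /* With the mutex back, the synchronous drain is exclusive, so no
   * concurrent drainer can clear and free stock->cached while
   * drain_all_stock() is still inspecting it. */
  static void drain_all_stock_sync(struct mem_cgroup *root_mem)
  {
          mutex_lock(&percpu_charge_mutex);
          drain_all_stock(root_mem, true);
          mutex_unlock(&percpu_charge_mutex);
  }

The async path takes the same mutex with mutex_trylock() and simply
returns when a drain is already in flight, so it never blocks and
never queues extra kworker runs.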

It is safer to revert now and come up with a more parallel
implementation later.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Reported-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 10 additions and 2 deletions

@@ -2091,6 +2091,7 @@
 #define FLUSHING_CACHED_CHARGE (0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -2197,8 +2198,7 @@
 
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-		if (mem_cgroup_same_or_subtree(root_mem, stock->cached) &&
-		    test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+		if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
 			flush_work(&stock->work);
 	}
 out:
@@ -2213,14 +2213,22 @@
  */
 static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
+	/*
+	 * If someone calls draining, avoid adding more kworker runs.
+	 */
+	if (!mutex_trylock(&percpu_charge_mutex))
+		return;
 	drain_all_stock(root_mem, false);
+	mutex_unlock(&percpu_charge_mutex);
 }
 
 /* This is a synchronous drain interface. */
 static void drain_all_stock_sync(struct mem_cgroup *root_mem)
 {
 	/* called when force_empty is called */
+	mutex_lock(&percpu_charge_mutex);
 	drain_all_stock(root_mem, true);
+	mutex_unlock(&percpu_charge_mutex);
 }
 
 /*