Commit dfe076b0971a783469bc2066e85d46e23c8acb1c

Authored by Daisuke Nishimura
Committed by Linus Torvalds
1 parent 043d18b1e5

memcg: fix deadlock between cpuset and memcg

Commit b1dd693e ("memcg: avoid deadlock between move charge and
try_charge()") can cause another deadlock on mmap_sem during task migration
if cpuset and memcg are mounted onto the same mount point.

After the commit, cgroup_attach_task() has a call sequence like:

cgroup_attach_task()
  ss->can_attach()
    cpuset_can_attach()
    mem_cgroup_can_attach()
      down_read(&mmap_sem)        (1)
  ss->attach()
    cpuset_attach()
      mpol_rebind_mm()
        down_write(&mmap_sem)     (2)
        up_write(&mmap_sem)
      cpuset_migrate_mm()
        do_migrate_pages()
          down_read(&mmap_sem)
          up_read(&mmap_sem)
    mem_cgroup_move_task()
      mem_cgroup_clear_mc()
        up_read(&mmap_sem)

We can deadlock at (2) because we have already acquired mmap_sem at (1).
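
To see why, note that both acquisitions happen in the same task on the same
mm->mmap_sem. A minimal illustrative sketch (not the actual kernel code, the
comments only reference the call sites above):

  down_read(&mm->mmap_sem);    /* (1) taken in mem_cgroup_can_attach() */
  ...
  down_write(&mm->mmap_sem);   /* (2) in cpuset_attach() -> mpol_rebind_mm():
                                * the writer must wait for every reader to
                                * drop the lock, but one of those readers is
                                * this very task, so it waits forever.
                                */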

But the commit itself is necessary to fix deadlocks that existed before it,
for example:

Ex.1)
                move charge             |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |  down_write(&mmap_sem)
      mc.moving_task = current          |    ..
      mem_cgroup_precharge_mc()         |  __mem_cgroup_try_charge()
        mem_cgroup_count_precharge()    |    prepare_to_wait()
          down_read(&mmap_sem)          |    if (mc.moving_task)
          -> cannot acquire the lock    |    -> true
                                        |      schedule()
                                        |      -> move charge should wake it up

Ex.2)
                move charge             |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |
      mc.moving_task = current          |
      mem_cgroup_precharge_mc()         |
        mem_cgroup_count_precharge()    |
          down_read(&mmap_sem)          |
          ..                            |
          up_read(&mmap_sem)            |
                                        |  down_write(&mmap_sem)
    mem_cgroup_move_task()              |    ..
      mem_cgroup_move_charge()          |  __mem_cgroup_try_charge()
        down_read(&mmap_sem)            |    prepare_to_wait()
        -> cannot acquire the lock      |    if (mc.moving_task)
                                        |    -> true
                                        |      schedule()
                                        |      -> move charge should wake it up
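
In both examples the try-charge side sleeps while still holding mmap_sem for
write, waiting to be woken by the move-charge side, which in turn cannot make
progress because it needs mmap_sem. A condensed sketch of that waiter in
__mem_cgroup_try_charge(), simplified from the paths shown above:

  /* try charge side: called with mmap_sem held for write */
  DEFINE_WAIT(wait);
  prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
  if (mc.moving_task)        /* a charge move is in progress */
          schedule();        /* sleep until the mover calls
                              * wake_up_all(&mc.waitq), but the mover is
                              * itself blocked on mmap_sem */
  finish_wait(&mc.waitq, &wait);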

This patch fixes all of these problems by:
1. Reverting the commit.
2. To fix Ex.1, setting mc.moving_task only after mem_cgroup_count_precharge()
   has released mmap_sem.
3. To fix Ex.2, using down_read_trylock() instead of down_read() in
   mem_cgroup_move_charge() and, if the lock cannot be acquired, cancelling
   all extra charges, waking up all waiters, and retrying the trylock (see
   the condensed sketch below).
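
Condensed from the diff below, the combination of fixes 2 and 3 looks like:

  /* fix 2: mc.moving_task is set only after mmap_sem has been released */
  static int mem_cgroup_precharge_mc(struct mm_struct *mm)
  {
          unsigned long precharge = mem_cgroup_count_precharge(mm);

          VM_BUG_ON(mc.moving_task);
          mc.moving_task = current;
          return mem_cgroup_do_precharge(precharge);
  }

  /* fix 3: in mem_cgroup_move_charge() */
  retry:
          if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                  __mem_cgroup_clear_mc();   /* cancel extra charges, wake up waiters */
                  cond_resched();
                  goto retry;
          }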

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Ben Blum <bblum@andrew.cmu.edu>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Paul Menage <menage@google.com>
Cc: Hiroyuki Kamezawa <kamezawa.hiroyuki@gmail.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 49 additions and 35 deletions

@@ -292,7 +292,6 @@
 	unsigned long moved_charge;
 	unsigned long moved_swap;
 	struct task_struct *moving_task;	/* a task moving charges */
-	struct mm_struct *mm;
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -4681,7 +4680,7 @@
 	unsigned long precharge;
 	struct vm_area_struct *vma;
 
-	/* We've already held the mmap_sem */
+	down_read(&mm->mmap_sem);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		struct mm_walk mem_cgroup_count_precharge_walk = {
 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4693,6 +4692,7 @@
 		walk_page_range(vma->vm_start, vma->vm_end,
 					&mem_cgroup_count_precharge_walk);
 	}
+	up_read(&mm->mmap_sem);
 
 	precharge = mc.precharge;
 	mc.precharge = 0;
@@ -4702,10 +4702,15 @@
 
 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 {
-	return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+	unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+	VM_BUG_ON(mc.moving_task);
+	mc.moving_task = current;
+	return mem_cgroup_do_precharge(precharge);
 }
 
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
 {
 	struct mem_cgroup *from = mc.from;
 	struct mem_cgroup *to = mc.to;
@@ -4740,23 +4745,28 @@
 						PAGE_SIZE * mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */
-
 		mc.moved_swap = 0;
 	}
-	if (mc.mm) {
-		up_read(&mc.mm->mmap_sem);
-		mmput(mc.mm);
-	}
+	memcg_oom_recover(from);
+	memcg_oom_recover(to);
+	wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+	struct mem_cgroup *from = mc.from;
+
+	/*
+	 * we must clear moving_task before waking up waiters at the end of
+	 * task migration.
+	 */
+	mc.moving_task = NULL;
+	__mem_cgroup_clear_mc();
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
 	spin_unlock(&mc.lock);
-	mc.moving_task = NULL;
-	mc.mm = NULL;
 	mem_cgroup_end_move(from);
-	memcg_oom_recover(from);
-	memcg_oom_recover(to);
-	wake_up_all(&mc.waitq);
 }
 
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4778,38 +4788,23 @@
 			return 0;
 		/* We move charges only when we move a owner of the mm */
 		if (mm->owner == p) {
-			/*
-			 * We do all the move charge works under one mmap_sem to
-			 * avoid deadlock with down_write(&mmap_sem)
-			 * -> try_charge() -> if (mc.moving_task) -> sleep.
-			 */
-			down_read(&mm->mmap_sem);
-
 			VM_BUG_ON(mc.from);
 			VM_BUG_ON(mc.to);
 			VM_BUG_ON(mc.precharge);
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
-			VM_BUG_ON(mc.moving_task);
-			VM_BUG_ON(mc.mm);
-
 			mem_cgroup_start_move(from);
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;
-			mc.precharge = 0;
-			mc.moved_charge = 0;
-			mc.moved_swap = 0;
 			spin_unlock(&mc.lock);
-			mc.moving_task = current;
-			mc.mm = mm;
+			/* We set mc.moving_task later */
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)
 				mem_cgroup_clear_mc();
-			/* We call up_read() and mmput() in clear_mc(). */
-		} else
-			mmput(mm);
+		}
+		mmput(mm);
 	}
 	return ret;
 }
@@ -4898,7 +4893,19 @@
 	struct vm_area_struct *vma;
 
 	lru_add_drain_all();
-	/* We've already held the mmap_sem */
+retry:
+	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+		/*
+		 * Someone who are holding the mmap_sem might be waiting in
+		 * waitq. So we cancel all extra charges, wake up all waiters,
+		 * and retry. Because we cancel precharges, we might not be able
+		 * to move enough charges, but moving charge is a best-effort
+		 * feature anyway, so it wouldn't be a big problem.
+		 */
+		__mem_cgroup_clear_mc();
+		cond_resched();
+		goto retry;
+	}
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4917,6 +4924,7 @@
 			 */
 			break;
 	}
+	up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4925,11 +4933,17 @@
 				struct task_struct *p,
 				bool threadgroup)
 {
-	if (!mc.mm)
+	struct mm_struct *mm;
+
+	if (!mc.to)
 		/* no need to move charge */
 		return;
 
-	mem_cgroup_move_charge(mc.mm);
+	mm = get_task_mm(p);
+	if (mm) {
+		mem_cgroup_move_charge(mm);
+		mmput(mm);
+	}
 	mem_cgroup_clear_mc();
 }
 #else /* !CONFIG_MMU */