Commit dfe076b0971a783469bc2066e85d46e23c8acb1c

Authored by Daisuke Nishimura
Committed by Linus Torvalds
1 parent 043d18b1e5

memcg: fix deadlock between cpuset and memcg

Commit b1dd693e ("memcg: avoid deadlock between move charge and
try_charge()") can cause another deadlock on mmap_sem during task migration
if cpuset and memcg are mounted onto the same mount point.

After the commit, cgroup_attach_task() has a call sequence like:

cgroup_attach_task()
  ss->can_attach()
    cpuset_can_attach()
    mem_cgroup_can_attach()
      down_read(&mmap_sem)        (1)
  ss->attach()
    cpuset_attach()
      mpol_rebind_mm()
        down_write(&mmap_sem)     (2)
        up_write(&mmap_sem)
      cpuset_migrate_mm()
        do_migrate_pages()
          down_read(&mmap_sem)
          up_read(&mmap_sem)
    mem_cgroup_move_task()
      mem_cgroup_clear_mc()
        up_read(&mmap_sem)

We can deadlock at (2) because we have already acquired mmap_sem at (1).
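
To see why, note that both acquisitions happen in the same task on the same
mm->mmap_sem. A minimal illustrative sketch (not the actual kernel code, the
comments only reference the call sites above):

  down_read(&mm->mmap_sem);    /* (1) taken in mem_cgroup_can_attach() */
  ...
  down_write(&mm->mmap_sem);   /* (2) in cpuset_attach() -> mpol_rebind_mm():
                                * the writer must wait for every reader to
                                * drop the lock, but one of those readers is
                                * this very task, so it waits forever.
                                */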

But the commit itself is necessary to fix deadlocks that existed before it,
for example:

Ex.1)
                move charge             |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |  down_write(&mmap_sem)
      mc.moving_task = current          |    ..
      mem_cgroup_precharge_mc()         |  __mem_cgroup_try_charge()
        mem_cgroup_count_precharge()    |    prepare_to_wait()
          down_read(&mmap_sem)          |    if (mc.moving_task)
          -> cannot acquire the lock    |    -> true
                                        |      schedule()
                                        |      -> move charge should wake it up

Ex.2)
                move charge             |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |
      mc.moving_task = current          |
      mem_cgroup_precharge_mc()         |
        mem_cgroup_count_precharge()    |
          down_read(&mmap_sem)          |
          ..                            |
          up_read(&mmap_sem)            |
                                        |  down_write(&mmap_sem)
    mem_cgroup_move_task()              |    ..
      mem_cgroup_move_charge()          |  __mem_cgroup_try_charge()
        down_read(&mmap_sem)            |    prepare_to_wait()
        -> cannot acquire the lock      |    if (mc.moving_task)
                                        |    -> true
                                        |      schedule()
                                        |      -> move charge should wake it up
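
In both examples the try-charge side sleeps while still holding mmap_sem for
write, waiting to be woken by the move-charge side, which in turn cannot make
progress because it needs mmap_sem. A condensed sketch of that waiter in
__mem_cgroup_try_charge(), simplified from the paths shown above:

  /* try charge side: called with mmap_sem held for write */
  DEFINE_WAIT(wait);
  prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
  if (mc.moving_task)        /* a charge move is in progress */
          schedule();        /* sleep until the mover calls
                              * wake_up_all(&mc.waitq), but the mover is
                              * itself blocked on mmap_sem */
  finish_wait(&mc.waitq, &wait);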

This patch fixes all of these problems by:
1. Reverting the commit.
2. To fix Ex.1, setting mc.moving_task only after mem_cgroup_count_precharge()
   has released mmap_sem.
3. To fix Ex.2, using down_read_trylock() instead of down_read() in
   mem_cgroup_move_charge() and, if the lock cannot be acquired, cancelling
   all extra charges, waking up all waiters, and retrying the trylock (see
   the condensed sketch below).
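
Condensed from the diff below, the combination of fixes 2 and 3 looks like:

  /* fix 2: mc.moving_task is set only after mmap_sem has been released */
  static int mem_cgroup_precharge_mc(struct mm_struct *mm)
  {
          unsigned long precharge = mem_cgroup_count_precharge(mm);

          VM_BUG_ON(mc.moving_task);
          mc.moving_task = current;
          return mem_cgroup_do_precharge(precharge);
  }

  /* fix 3: in mem_cgroup_move_charge() */
  retry:
          if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                  __mem_cgroup_clear_mc();   /* cancel extra charges, wake up waiters */
                  cond_resched();
                  goto retry;
          }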

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Ben Blum <bblum@andrew.cmu.edu>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Paul Menage <menage@google.com>
Cc: Hiroyuki Kamezawa <kamezawa.hiroyuki@gmail.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 49 additions and 35 deletions

@@ -292,7 +292,6 @@
 	unsigned long moved_charge;
 	unsigned long moved_swap;
 	struct task_struct *moving_task;	/* a task moving charges */
-	struct mm_struct *mm;
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -4681,7 +4680,7 @@
 	unsigned long precharge;
 	struct vm_area_struct *vma;
 
-	/* We've already held the mmap_sem */
+	down_read(&mm->mmap_sem);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		struct mm_walk mem_cgroup_count_precharge_walk = {
 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4693,6 +4692,7 @@
 		walk_page_range(vma->vm_start, vma->vm_end,
 					&mem_cgroup_count_precharge_walk);
 	}
+	up_read(&mm->mmap_sem);
 
 	precharge = mc.precharge;
 	mc.precharge = 0;
@@ -4702,10 +4702,15 @@
 
 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 {
-	return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+	unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+	VM_BUG_ON(mc.moving_task);
+	mc.moving_task = current;
+	return mem_cgroup_do_precharge(precharge);
 }
 
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
 {
 	struct mem_cgroup *from = mc.from;
 	struct mem_cgroup *to = mc.to;
@@ -4740,23 +4745,28 @@
 						PAGE_SIZE * mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */
-
 		mc.moved_swap = 0;
 	}
-	if (mc.mm) {
-		up_read(&mc.mm->mmap_sem);
-		mmput(mc.mm);
-	}
+	memcg_oom_recover(from);
+	memcg_oom_recover(to);
+	wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+	struct mem_cgroup *from = mc.from;
+
+	/*
+	 * we must clear moving_task before waking up waiters at the end of
+	 * task migration.
+	 */
+	mc.moving_task = NULL;
+	__mem_cgroup_clear_mc();
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
 	spin_unlock(&mc.lock);
-	mc.moving_task = NULL;
-	mc.mm = NULL;
 	mem_cgroup_end_move(from);
-	memcg_oom_recover(from);
-	memcg_oom_recover(to);
-	wake_up_all(&mc.waitq);
 }
 
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4778,38 +4788,23 @@
 			return 0;
 		/* We move charges only when we move a owner of the mm */
 		if (mm->owner == p) {
-			/*
-			 * We do all the move charge works under one mmap_sem to
-			 * avoid deadlock with down_write(&mmap_sem)
-			 * -> try_charge() -> if (mc.moving_task) -> sleep.
-			 */
-			down_read(&mm->mmap_sem);
-
 			VM_BUG_ON(mc.from);
 			VM_BUG_ON(mc.to);
 			VM_BUG_ON(mc.precharge);
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
-			VM_BUG_ON(mc.moving_task);
-			VM_BUG_ON(mc.mm);
-
 			mem_cgroup_start_move(from);
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;
-			mc.precharge = 0;
-			mc.moved_charge = 0;
-			mc.moved_swap = 0;
 			spin_unlock(&mc.lock);
-			mc.moving_task = current;
-			mc.mm = mm;
+			/* We set mc.moving_task later */
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)
 				mem_cgroup_clear_mc();
-			/* We call up_read() and mmput() in clear_mc(). */
-		} else
-			mmput(mm);
+		}
+		mmput(mm);
 	}
 	return ret;
 }
@@ -4898,7 +4893,19 @@
 	struct vm_area_struct *vma;
 
 	lru_add_drain_all();
-	/* We've already held the mmap_sem */
+retry:
+	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+		/*
+		 * Someone who are holding the mmap_sem might be waiting in
+		 * waitq. So we cancel all extra charges, wake up all waiters,
+		 * and retry. Because we cancel precharges, we might not be able
+		 * to move enough charges, but moving charge is a best-effort
+		 * feature anyway, so it wouldn't be a big problem.
+		 */
+		__mem_cgroup_clear_mc();
+		cond_resched();
+		goto retry;
+	}
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4917,6 +4924,7 @@
 			 */
 			break;
 	}
+	up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4925,11 +4933,17 @@
 				struct task_struct *p,
 				bool threadgroup)
 {
-	if (!mc.mm)
+	struct mm_struct *mm;
+
+	if (!mc.to)
 		/* no need to move charge */
 		return;
 
-	mem_cgroup_move_charge(mc.mm);
+	mm = get_task_mm(p);
+	if (mm) {
+		mem_cgroup_move_charge(mm);
+		mmput(mm);
+	}
 	mem_cgroup_clear_mc();
 }
 #else /* !CONFIG_MMU */