Commit dfe076b0971a783469bc2066e85d46e23c8acb1c
Committed by
Linus Torvalds
1 parent
043d18b1e5
Exists in
master
and in
4 other branches
memcg: fix deadlock between cpuset and memcg
Commit b1dd693e ("memcg: avoid deadlock between move charge and try_charge()") can cause another deadlock about mmap_sem on task migration if cpuset and memcg are mounted onto the same mount point. After the commit, cgroup_attach_task() has sequence like: cgroup_attach_task() ss->can_attach() cpuset_can_attach() mem_cgroup_can_attach() down_read(&mmap_sem) (1) ss->attach() cpuset_attach() mpol_rebind_mm() down_write(&mmap_sem) (2) up_write(&mmap_sem) cpuset_migrate_mm() do_migrate_pages() down_read(&mmap_sem) up_read(&mmap_sem) mem_cgroup_move_task() mem_cgroup_clear_mc() up_read(&mmap_sem) We can cause deadlock at (2) because we've already acquired the mmap_sem at (1). But the commit itself is necessary to fix deadlocks which have existed before the commit like: Ex.1) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | down_write(&mmap_sem) mc.moving_task = current | .. mem_cgroup_precharge_mc() | __mem_cgroup_try_charge() mem_cgroup_count_precharge() | prepare_to_wait() down_read(&mmap_sem) | if (mc.moving_task) -> cannot acquire the lock | -> true | schedule() | -> move charge should wake it up Ex.2) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | mc.moving_task = current | mem_cgroup_precharge_mc() | mem_cgroup_count_precharge() | down_read(&mmap_sem) | .. | up_read(&mmap_sem) | | down_write(&mmap_sem) mem_cgroup_move_task() | .. mem_cgroup_move_charge() | __mem_cgroup_try_charge() down_read(&mmap_sem) | prepare_to_wait() -> cannot acquire the lock | if (mc.moving_task) | -> true | schedule() | -> move charge should wake it up This patch fixes all of these problems by: 1. revert the commit. 2. To fix the Ex.1, we set mc.moving_task after mem_cgroup_count_precharge() has released the mmap_sem. 3. 
To fix the Ex.2, we use down_read_trylock() instead of down_read() in mem_cgroup_move_charge() and, if it has failed to acquire the lock, cancel all extra charges, wake up all waiters, and retry trylock. Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Reported-by: Ben Blum <bblum@andrew.cmu.edu> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Paul Menage <menage@google.com> Cc: Hiroyuki Kamezawa <kamezawa.hiroyuki@gmail.com> Cc: Balbir Singh <balbir@in.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 49 additions and 35 deletions Side-by-side Diff
mm/memcontrol.c
... | ... | @@ -292,7 +292,6 @@ |
292 | 292 | unsigned long moved_charge; |
293 | 293 | unsigned long moved_swap; |
294 | 294 | struct task_struct *moving_task; /* a task moving charges */ |
295 | - struct mm_struct *mm; | |
296 | 295 | wait_queue_head_t waitq; /* a waitq for other context */ |
297 | 296 | } mc = { |
298 | 297 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), |
... | ... | @@ -4681,7 +4680,7 @@ |
4681 | 4680 | unsigned long precharge; |
4682 | 4681 | struct vm_area_struct *vma; |
4683 | 4682 | |
4684 | - /* We've already held the mmap_sem */ | |
4683 | + down_read(&mm->mmap_sem); | |
4685 | 4684 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
4686 | 4685 | struct mm_walk mem_cgroup_count_precharge_walk = { |
4687 | 4686 | .pmd_entry = mem_cgroup_count_precharge_pte_range, |
... | ... | @@ -4693,6 +4692,7 @@ |
4693 | 4692 | walk_page_range(vma->vm_start, vma->vm_end, |
4694 | 4693 | &mem_cgroup_count_precharge_walk); |
4695 | 4694 | } |
4695 | + up_read(&mm->mmap_sem); | |
4696 | 4696 | |
4697 | 4697 | precharge = mc.precharge; |
4698 | 4698 | mc.precharge = 0; |
4699 | 4699 | |
... | ... | @@ -4702,10 +4702,15 @@ |
4702 | 4702 | |
4703 | 4703 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) |
4704 | 4704 | { |
4705 | - return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); | |
4705 | + unsigned long precharge = mem_cgroup_count_precharge(mm); | |
4706 | + | |
4707 | + VM_BUG_ON(mc.moving_task); | |
4708 | + mc.moving_task = current; | |
4709 | + return mem_cgroup_do_precharge(precharge); | |
4706 | 4710 | } |
4707 | 4711 | |
4708 | -static void mem_cgroup_clear_mc(void) | |
4712 | +/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ | |
4713 | +static void __mem_cgroup_clear_mc(void) | |
4709 | 4714 | { |
4710 | 4715 | struct mem_cgroup *from = mc.from; |
4711 | 4716 | struct mem_cgroup *to = mc.to; |
4712 | 4717 | |
4713 | 4718 | |
4714 | 4719 | |
... | ... | @@ -4740,23 +4745,28 @@ |
4740 | 4745 | PAGE_SIZE * mc.moved_swap); |
4741 | 4746 | } |
4742 | 4747 | /* we've already done mem_cgroup_get(mc.to) */ |
4743 | - | |
4744 | 4748 | mc.moved_swap = 0; |
4745 | 4749 | } |
4746 | - if (mc.mm) { | |
4747 | - up_read(&mc.mm->mmap_sem); | |
4748 | - mmput(mc.mm); | |
4749 | - } | |
4750 | + memcg_oom_recover(from); | |
4751 | + memcg_oom_recover(to); | |
4752 | + wake_up_all(&mc.waitq); | |
4753 | +} | |
4754 | + | |
4755 | +static void mem_cgroup_clear_mc(void) | |
4756 | +{ | |
4757 | + struct mem_cgroup *from = mc.from; | |
4758 | + | |
4759 | + /* | |
4760 | + * we must clear moving_task before waking up waiters at the end of | |
4761 | + * task migration. | |
4762 | + */ | |
4763 | + mc.moving_task = NULL; | |
4764 | + __mem_cgroup_clear_mc(); | |
4750 | 4765 | spin_lock(&mc.lock); |
4751 | 4766 | mc.from = NULL; |
4752 | 4767 | mc.to = NULL; |
4753 | 4768 | spin_unlock(&mc.lock); |
4754 | - mc.moving_task = NULL; | |
4755 | - mc.mm = NULL; | |
4756 | 4769 | mem_cgroup_end_move(from); |
4757 | - memcg_oom_recover(from); | |
4758 | - memcg_oom_recover(to); | |
4759 | - wake_up_all(&mc.waitq); | |
4760 | 4770 | } |
4761 | 4771 | |
4762 | 4772 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, |
4763 | 4773 | |
4764 | 4774 | |
4765 | 4775 | |
4766 | 4776 | |
... | ... | @@ -4778,38 +4788,23 @@ |
4778 | 4788 | return 0; |
4779 | 4789 | /* We move charges only when we move a owner of the mm */ |
4780 | 4790 | if (mm->owner == p) { |
4781 | - /* | |
4782 | - * We do all the move charge works under one mmap_sem to | |
4783 | - * avoid deadlock with down_write(&mmap_sem) | |
4784 | - * -> try_charge() -> if (mc.moving_task) -> sleep. | |
4785 | - */ | |
4786 | - down_read(&mm->mmap_sem); | |
4787 | - | |
4788 | 4791 | VM_BUG_ON(mc.from); |
4789 | 4792 | VM_BUG_ON(mc.to); |
4790 | 4793 | VM_BUG_ON(mc.precharge); |
4791 | 4794 | VM_BUG_ON(mc.moved_charge); |
4792 | 4795 | VM_BUG_ON(mc.moved_swap); |
4793 | - VM_BUG_ON(mc.moving_task); | |
4794 | - VM_BUG_ON(mc.mm); | |
4795 | - | |
4796 | 4796 | mem_cgroup_start_move(from); |
4797 | 4797 | spin_lock(&mc.lock); |
4798 | 4798 | mc.from = from; |
4799 | 4799 | mc.to = mem; |
4800 | - mc.precharge = 0; | |
4801 | - mc.moved_charge = 0; | |
4802 | - mc.moved_swap = 0; | |
4803 | 4800 | spin_unlock(&mc.lock); |
4804 | - mc.moving_task = current; | |
4805 | - mc.mm = mm; | |
4801 | + /* We set mc.moving_task later */ | |
4806 | 4802 | |
4807 | 4803 | ret = mem_cgroup_precharge_mc(mm); |
4808 | 4804 | if (ret) |
4809 | 4805 | mem_cgroup_clear_mc(); |
4810 | - /* We call up_read() and mmput() in clear_mc(). */ | |
4811 | - } else | |
4812 | - mmput(mm); | |
4806 | + } | |
4807 | + mmput(mm); | |
4813 | 4808 | } |
4814 | 4809 | return ret; |
4815 | 4810 | } |
... | ... | @@ -4898,7 +4893,19 @@ |
4898 | 4893 | struct vm_area_struct *vma; |
4899 | 4894 | |
4900 | 4895 | lru_add_drain_all(); |
4901 | - /* We've already held the mmap_sem */ | |
4896 | +retry: | |
4897 | + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { | |
4898 | + /* | |
4899 | + * Someone who are holding the mmap_sem might be waiting in | |
4900 | + * waitq. So we cancel all extra charges, wake up all waiters, | |
4901 | + * and retry. Because we cancel precharges, we might not be able | |
4902 | + * to move enough charges, but moving charge is a best-effort | |
4903 | + * feature anyway, so it wouldn't be a big problem. | |
4904 | + */ | |
4905 | + __mem_cgroup_clear_mc(); | |
4906 | + cond_resched(); | |
4907 | + goto retry; | |
4908 | + } | |
4902 | 4909 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
4903 | 4910 | int ret; |
4904 | 4911 | struct mm_walk mem_cgroup_move_charge_walk = { |
... | ... | @@ -4917,6 +4924,7 @@ |
4917 | 4924 | */ |
4918 | 4925 | break; |
4919 | 4926 | } |
4927 | + up_read(&mm->mmap_sem); | |
4920 | 4928 | } |
4921 | 4929 | |
4922 | 4930 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
4923 | 4931 | |
... | ... | @@ -4925,11 +4933,17 @@ |
4925 | 4933 | struct task_struct *p, |
4926 | 4934 | bool threadgroup) |
4927 | 4935 | { |
4928 | - if (!mc.mm) | |
4936 | + struct mm_struct *mm; | |
4937 | + | |
4938 | + if (!mc.to) | |
4929 | 4939 | /* no need to move charge */ |
4930 | 4940 | return; |
4931 | 4941 | |
4932 | - mem_cgroup_move_charge(mc.mm); | |
4942 | + mm = get_task_mm(p); | |
4943 | + if (mm) { | |
4944 | + mem_cgroup_move_charge(mm); | |
4945 | + mmput(mm); | |
4946 | + } | |
4933 | 4947 | mem_cgroup_clear_mc(); |
4934 | 4948 | } |
4935 | 4949 | #else /* !CONFIG_MMU */ |