Commit f75ca962037ffd639a44fd88933cd9b84c4c4411
Committed by: Linus Torvalds
Parent: 158e0a2d1b
Exists in: master and 4 other branches
memcg: avoid css_get()
Now, memory cgroup increments the css (cgroup subsys state) reference count
for every charged page, and the reference is held until the page is
uncharged.  This has three bad effects:

 1. css_get()/css_put() use atomic_inc()/atomic_dec(), so heavy use of them
    on a large SMP machine does not scale well.
 2. Because the css refcnt can never reach a "ready-to-release" state,
    cgroup's notify_on_release handler cannot work with memcg.
 3. The css refcnt is an atomic_t, i.e. limited to 32 bits.  Maybe too small.

This has been a problem since the first merge of memcg.

This patch is an attempt to remove the per-page css refcnt.  Even without
the refcnt, pre_destroy() provides enough synchronization by
 - checking res->usage == 0, and
 - checking that no pages remain on the LRU.

Even after this patch, at first glance it seems css_get() is still called
in try_charge().  But the logic is:
 - If the memcg of mm->owner is the cached one, consume_stock() will work.
   On success, return immediately.
 - If consume_stock() returns false, css_get() is called and we go to the
   slow path, which may block.  At the end of the slow path, css_put() is
   called, and we restart from the beginning if necessary.

So, in the fast path, we don't call css_get() and avoid touching the shared
counter.  This makes the most common case fast.

Here is a result of a multi-threaded page fault benchmark:

[Before]
    25.32%  multi-fault-all  [kernel.kallsyms]  [k] clear_page_c
     9.30%  multi-fault-all  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
     8.02%  multi-fault-all  [kernel.kallsyms]  [k] try_get_mem_cgroup_from_mm <=====(*)
     7.83%  multi-fault-all  [kernel.kallsyms]  [k] down_read_trylock
     5.38%  multi-fault-all  [kernel.kallsyms]  [k] __css_put
     5.29%  multi-fault-all  [kernel.kallsyms]  [k] __alloc_pages_nodemask
     4.92%  multi-fault-all  [kernel.kallsyms]  [k] _raw_spin_lock_irq
     4.24%  multi-fault-all  [kernel.kallsyms]  [k] up_read
     3.53%  multi-fault-all  [kernel.kallsyms]  [k] css_put
     2.11%  multi-fault-all  [kernel.kallsyms]  [k] handle_mm_fault
     1.76%  multi-fault-all  [kernel.kallsyms]  [k] __rmqueue
     1.64%  multi-fault-all  [kernel.kallsyms]  [k] __mem_cgroup_commit_charge

[After]
    28.41%  multi-fault-all  [kernel.kallsyms]  [k] clear_page_c
    10.08%  multi-fault-all  [kernel.kallsyms]  [k] _raw_spin_lock_irq
     9.58%  multi-fault-all  [kernel.kallsyms]  [k] down_read_trylock
     9.38%  multi-fault-all  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
     5.86%  multi-fault-all  [kernel.kallsyms]  [k] __alloc_pages_nodemask
     5.65%  multi-fault-all  [kernel.kallsyms]  [k] up_read
     2.82%  multi-fault-all  [kernel.kallsyms]  [k] handle_mm_fault
     2.64%  multi-fault-all  [kernel.kallsyms]  [k] mem_cgroup_add_lru_list
     2.48%  multi-fault-all  [kernel.kallsyms]  [k] __mem_cgroup_commit_charge

The 8.02% spent in try_get_mem_cgroup_from_mm() (*) disappears because this
patch removes the css_tryget() in it.  (But yes, this is an extreme case.)

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
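The fast-path/slow-path split described above can be modeled in plain
user-space C.  The sketch below is only an illustration of the idea, not
kernel code: fake_css, fake_memcg, STOCK_BATCH and the single global stock
are invented stand-ins for the css refcount, struct mem_cgroup, the charge
batch size and the real per-cpu stock.  The point it demonstrates is that
consume_stock() lets the common case finish without ever touching the
shared atomic counter; only the (possibly blocking) slow path pins the
memcg.

/*
 * User-space model (not kernel code) of the fast path introduced by
 * this patch.  All identifiers are illustrative stand-ins.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define STOCK_BATCH 32				/* charges cached per "cpu" */

struct fake_css { atomic_int refcnt; };		/* stands in for cgroup_subsys_state */
struct fake_memcg { struct fake_css css; atomic_long usage; };

/* one entry per cpu in the real code; a single global entry here */
static struct { struct fake_memcg *cached; int nr_pages; } stock;

static bool consume_stock(struct fake_memcg *memcg)
{
	/* fast path: no atomic op on css.refcnt at all */
	if (stock.cached == memcg && stock.nr_pages > 0) {
		stock.nr_pages--;
		return true;
	}
	return false;
}

static void charge_slow(struct fake_memcg *memcg)
{
	/* the slow path may block, so pin the memcg while we work */
	atomic_fetch_add(&memcg->css.refcnt, 1);	/* css_tryget() in the patch */
	atomic_fetch_add(&memcg->usage, STOCK_BATCH);	/* res_counter charge */
	stock.cached = memcg;				/* refill_stock() */
	stock.nr_pages = STOCK_BATCH - 1;
	atomic_fetch_sub(&memcg->css.refcnt, 1);	/* css_put() before returning */
}

static void try_charge(struct fake_memcg *memcg)
{
	if (consume_stock(memcg))
		return;		/* common case: no shared-counter traffic */
	charge_slow(memcg);
}

int main(void)
{
	struct fake_memcg m = { .css = { .refcnt = 1 } };
	int i;

	for (i = 0; i < 100; i++)
		try_charge(&m);
	printf("usage=%ld refcnt=%d\n",
	       (long)atomic_load(&m.usage), atomic_load(&m.css.refcnt));
	return 0;
}

In this toy model only 4 of the 100 charges take the slow path and touch the
shared counters; the other 96 are served from the cached stock.  That is the
behaviour the perf profiles above reflect, with try_get_mem_cgroup_from_mm()
dropping out of the [After] listing.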
Showing 1 changed file with 76 additions and 43 deletions
mm/memcontrol.c
... | ... | @@ -1714,28 +1714,66 @@ |
1714 | 1714 | * thread group leader migrates. It's possible that mm is not |
1715 | 1715 | * set, if so charge the init_mm (happens for pagecache usage). |
1716 | 1716 | */ |
1717 | - if (*memcg) { | |
1717 | + if (!*memcg && !mm) | |
1718 | + goto bypass; | |
1719 | +again: | |
1720 | + if (*memcg) { /* css should be a valid one */ | |
1718 | 1721 | mem = *memcg; |
1722 | + VM_BUG_ON(css_is_removed(&mem->css)); | |
1723 | + if (mem_cgroup_is_root(mem)) | |
1724 | + goto done; | |
1725 | + if (consume_stock(mem)) | |
1726 | + goto done; | |
1719 | 1727 | css_get(&mem->css); |
1720 | 1728 | } else { |
1721 | - mem = try_get_mem_cgroup_from_mm(mm); | |
1722 | - if (unlikely(!mem)) | |
1723 | - return 0; | |
1724 | - *memcg = mem; | |
1729 | + struct task_struct *p; | |
1730 | + | |
1731 | + rcu_read_lock(); | |
1732 | + p = rcu_dereference(mm->owner); | |
1733 | + VM_BUG_ON(!p); | |
1734 | + /* | |
1735 | + * Because we don't have task_lock(), "p" can exit while we're | 
1736 | + * here. In that case, "mem" can point to the root cgroup but | 
1737 | + * is never NULL (task_struct itself is freed by RCU, and the | 
1738 | + * cgroup is RCU safe). So there is a small risk of charging | 
1739 | + * the wrong cgroup here, but such mis-accounting by race can | 
1740 | + * always happen because we don't hold cgroup_mutex(). Taking | 
1741 | + * it here would be overkill, so we allow that small race. | 
1742 | + */ | |
1743 | + mem = mem_cgroup_from_task(p); | |
1744 | + VM_BUG_ON(!mem); | |
1745 | + if (mem_cgroup_is_root(mem)) { | |
1746 | + rcu_read_unlock(); | |
1747 | + goto done; | |
1748 | + } | |
1749 | + if (consume_stock(mem)) { | |
1750 | + /* | |
1751 | + * It seems dangerous to access memcg without css_get(). | 
1752 | + * But considering how consume_stock works, it's not | 
1753 | + * necessary. If consume_stock succeeds, some charges | 
1754 | + * from this memcg are cached on this cpu. So, we | |
1755 | + * don't need to call css_get()/css_tryget() before | |
1756 | + * calling consume_stock(). | |
1757 | + */ | |
1758 | + rcu_read_unlock(); | |
1759 | + goto done; | |
1760 | + } | |
1761 | + /* after here, we may be blocked. we need to get refcnt */ | |
1762 | + if (!css_tryget(&mem->css)) { | |
1763 | + rcu_read_unlock(); | |
1764 | + goto again; | |
1765 | + } | |
1766 | + rcu_read_unlock(); | |
1725 | 1767 | } |
1726 | 1768 | |
1727 | - VM_BUG_ON(css_is_removed(&mem->css)); | |
1728 | - if (mem_cgroup_is_root(mem)) | |
1729 | - goto done; | |
1730 | - | |
1731 | 1769 | do { |
1732 | 1770 | bool oom_check; |
1733 | 1771 | |
1734 | - if (consume_stock(mem)) | |
1735 | - goto done; /* don't need to fill stock */ | |
1736 | 1772 | /* If killed, bypass charge */ |
1737 | - if (fatal_signal_pending(current)) | |
1773 | + if (fatal_signal_pending(current)) { | |
1774 | + css_put(&mem->css); | |
1738 | 1775 | goto bypass; |
1776 | + } | |
1739 | 1777 | |
1740 | 1778 | oom_check = false; |
1741 | 1779 | if (oom && !nr_oom_retries) { |
... | ... | @@ -1750,30 +1788,36 @@ |
1750 | 1788 | break; |
1751 | 1789 | case CHARGE_RETRY: /* not in OOM situation but retry */ |
1752 | 1790 | csize = PAGE_SIZE; |
1753 | - break; | |
1791 | + css_put(&mem->css); | |
1792 | + mem = NULL; | |
1793 | + goto again; | |
1754 | 1794 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ |
1795 | + css_put(&mem->css); | |
1755 | 1796 | goto nomem; |
1756 | 1797 | case CHARGE_NOMEM: /* OOM routine works */ |
1757 | - if (!oom) | |
1798 | + if (!oom) { | |
1799 | + css_put(&mem->css); | |
1758 | 1800 | goto nomem; |
1801 | + } | |
1759 | 1802 | /* If oom, we never return -ENOMEM */ |
1760 | 1803 | nr_oom_retries--; |
1761 | 1804 | break; |
1762 | 1805 | case CHARGE_OOM_DIE: /* Killed by OOM Killer */ |
1806 | + css_put(&mem->css); | |
1763 | 1807 | goto bypass; |
1764 | 1808 | } |
1765 | 1809 | } while (ret != CHARGE_OK); |
1766 | 1810 | |
1767 | 1811 | if (csize > PAGE_SIZE) |
1768 | 1812 | refill_stock(mem, csize - PAGE_SIZE); |
1813 | + css_put(&mem->css); | |
1769 | 1814 | done: |
1815 | + *memcg = mem; | |
1770 | 1816 | return 0; |
1771 | 1817 | nomem: |
1772 | - css_put(&mem->css); | |
1818 | + *memcg = NULL; | |
1773 | 1819 | return -ENOMEM; |
1774 | 1820 | bypass: |
1775 | - if (mem) | |
1776 | - css_put(&mem->css); | |
1777 | 1821 | *memcg = NULL; |
1778 | 1822 | return 0; |
1779 | 1823 | } |
1780 | 1824 | |
... | ... | @@ -1790,11 +1834,7 @@ |
1790 | 1834 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); |
1791 | 1835 | if (do_swap_account) |
1792 | 1836 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); |
1793 | - VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | |
1794 | - WARN_ON_ONCE(count > INT_MAX); | |
1795 | - __css_put(&mem->css, (int)count); | |
1796 | 1837 | } |
1797 | - /* we don't need css_put for root */ | |
1798 | 1838 | } |
1799 | 1839 | |
1800 | 1840 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) |
... | ... | @@ -2155,7 +2195,6 @@ |
2155 | 2195 | goto charge_cur_mm; |
2156 | 2196 | *ptr = mem; |
2157 | 2197 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); |
2158 | - /* drop extra refcnt from tryget */ | |
2159 | 2198 | css_put(&mem->css); |
2160 | 2199 | return ret; |
2161 | 2200 | charge_cur_mm: |
... | ... | @@ -2325,10 +2364,6 @@ |
2325 | 2364 | break; |
2326 | 2365 | } |
2327 | 2366 | |
2328 | - if (!mem_cgroup_is_root(mem)) | |
2329 | - __do_uncharge(mem, ctype); | |
2330 | - if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | |
2331 | - mem_cgroup_swap_statistics(mem, true); | |
2332 | 2367 | mem_cgroup_charge_statistics(mem, pc, false); |
2333 | 2368 | |
2334 | 2369 | ClearPageCgroupUsed(pc); |
2335 | 2370 | |
... | ... | @@ -2340,11 +2375,17 @@ |
2340 | 2375 | */ |
2341 | 2376 | |
2342 | 2377 | unlock_page_cgroup(pc); |
2343 | - | |
2378 | + /* | |
2379 | + * even after unlock, we still hold a charge on mem->res.usage, | 
2380 | + * so this memcg will never be freed. | 
2381 | + */ | |
2344 | 2382 | memcg_check_events(mem, page); |
2345 | - /* at swapout, this memcg will be accessed to record to swap */ | |
2346 | - if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | |
2347 | - css_put(&mem->css); | |
2383 | + if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { | |
2384 | + mem_cgroup_swap_statistics(mem, true); | |
2385 | + mem_cgroup_get(mem); | |
2386 | + } | |
2387 | + if (!mem_cgroup_is_root(mem)) | |
2388 | + __do_uncharge(mem, ctype); | |
2348 | 2389 | |
2349 | 2390 | return mem; |
... | ... | @@ -2431,13 +2472,12 @@ |
2431 | 2472 | |
2432 | 2473 | memcg = __mem_cgroup_uncharge_common(page, ctype); |
2433 | 2474 | |
2434 | - /* record memcg information */ | |
2435 | - if (do_swap_account && swapout && memcg) { | |
2475 | + /* | |
2476 | + * record memcg information; if swapout && memcg != NULL, | 
2477 | + * mem_cgroup_get() was called in uncharge(). | |
2478 | + */ | |
2479 | + if (do_swap_account && swapout && memcg) | |
2436 | 2480 | swap_cgroup_record(ent, css_id(&memcg->css)); |
2437 | - mem_cgroup_get(memcg); | |
2438 | - } | |
2439 | - if (swapout && memcg) | |
2440 | - css_put(&memcg->css); | |
2441 | 2481 | } |
2442 | 2482 | #endif |
2443 | 2483 | |
... | ... | @@ -2515,7 +2555,6 @@ |
2515 | 2555 | */ |
2516 | 2556 | if (!mem_cgroup_is_root(to)) |
2517 | 2557 | res_counter_uncharge(&to->res, PAGE_SIZE); |
2518 | - css_put(&to->css); | |
2519 | 2558 | } |
2520 | 2559 | return 0; |
2521 | 2560 | } |
... | ... | @@ -4214,9 +4253,6 @@ |
4214 | 4253 | goto one_by_one; |
4215 | 4254 | } |
4216 | 4255 | mc.precharge += count; |
4217 | - VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | |
4218 | - WARN_ON_ONCE(count > INT_MAX); | |
4219 | - __css_get(&mem->css, (int)count); | |
4220 | 4256 | return ret; |
4221 | 4257 | } |
4222 | 4258 | one_by_one: |
... | ... | @@ -4452,7 +4488,6 @@ |
4452 | 4488 | } |
4453 | 4489 | /* we must fixup refcnts and charges */ |
4454 | 4490 | if (mc.moved_swap) { |
4455 | - WARN_ON_ONCE(mc.moved_swap > INT_MAX); | |
4456 | 4491 | /* uncharge swap account from the old cgroup */ |
4457 | 4492 | if (!mem_cgroup_is_root(mc.from)) |
4458 | 4493 | res_counter_uncharge(&mc.from->memsw, |
... | ... | @@ -4466,8 +4501,6 @@ |
4466 | 4501 | */ |
4467 | 4502 | res_counter_uncharge(&mc.to->res, |
4468 | 4503 | PAGE_SIZE * mc.moved_swap); |
4469 | - VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); | |
4470 | - __css_put(&mc.to->css, mc.moved_swap); | |
4471 | 4504 | } |
4472 | 4505 | /* we've already done mem_cgroup_get(mc.to) */ |