Commit f75ca962037ffd639a44fd88933cd9b84c4c4411

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent 158e0a2d1b

memcg: avoid css_get()

Now, the memory cgroup increments the css (cgroup subsys state) reference
count once per charged page, and that reference is held until the page is
uncharged.  But this has three bad effects (a sketch of the old scheme
follows the list):

 1. Because css_get()/css_put() call atomic_inc()/atomic_dec(), heavy use
    of them on large SMP systems will not scale well.
 2. Because the css refcnt can never sit in a "ready-to-release" state,
    cgroup's notify_on_release handler can't work with memcg.
 3. The css refcnt is an atomic_t, i.e. at most 32 bits wide, which may be
    too small.
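
For illustration, a minimal sketch of the pre-patch scheme (not the exact
kernel code; the charge/uncharge wrappers are simplified stand-ins), where
every charged page bounces the shared css refcount:

  /* Sketch of the old behaviour: one css_get()/css_put() per page. */
  static int charge_one_page_old(struct mem_cgroup *mem)
  {
      css_get(&mem->css);     /* atomic_inc() on a shared counter */
      /* ... account the page against mem->res ... */
      return 0;
  }

  static void uncharge_one_page_old(struct mem_cgroup *mem)
  {
      /* ... drop the charge from mem->res ... */
      css_put(&mem->css);     /* atomic_dec(); refcnt never reaches 0 early */
  }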

This has been a problem since the 1st merge of memcg.

This is an attempt to remove the per-page css refcnt.  Even without that
refcnt, pre_destroy() does enough synchronization by (see the sketch below):
  - checking res->usage == 0.
  - checking that no pages remain on the LRU.
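
A hedged sketch of that synchronization, assuming a hypothetical helper
memcg_lru_is_empty() standing in for the real per-memcg LRU walk:

  /* Sketch only: the kind of check pre_destroy() relies on. */
  static bool memcg_safe_to_destroy(struct mem_cgroup *mem)
  {
      /* no charged pages left ... */
      if (res_counter_read_u64(&mem->res, RES_USAGE) != 0)
          return false;
      /* ... and no pages remain on the per-memcg LRU lists */
      return memcg_lru_is_empty(mem);     /* hypothetical helper */
  }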

This patch removes the per-page css refcnt.  Even after this patch, at
first glance it seems css_get() is still called in try_charge().

But the logic is as follows (see the sketch after the list):

  - If mm->owner's memcg is the one cached on this cpu, consume_stock()
    succeeds and we return immediately.
  - If consume_stock() returns false, css_get() is called and we enter the
    slow path, which may block.  At the end of the slow path, css_put() is
    called and we restart from the beginning if necessary.
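
Simplified from the patched __mem_cgroup_try_charge(), a sketch of that
fast/slow split (error handling and the OOM/retry loop are omitted):

  static int try_charge_sketch(struct mm_struct *mm, struct mem_cgroup **memcg)
  {
  again:
      rcu_read_lock();
      *memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
      if (consume_stock(*memcg)) {        /* fast path: per-cpu cached charges */
          rcu_read_unlock();
          return 0;                       /* no css_get(), no shared counter */
      }
      /* slow path may block, so pin the css first */
      if (!css_tryget(&(*memcg)->css)) {
          rcu_read_unlock();
          goto again;                     /* cgroup is being removed; retry */
      }
      rcu_read_unlock();
      /* ... charge mem->res, maybe reclaim; may sleep ... */
      css_put(&(*memcg)->css);
      return 0;
  }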

So, in the fast path we don't call css_get() and avoid touching the shared
counter.  This patch makes the most common case fast.

Here is a result of multi-threaded page fault benchmark.

[Before]
    25.32%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
     9.30%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
     8.02%  multi-fault-all  [kernel.kallsyms]      [k] try_get_mem_cgroup_from_mm <=====(*)
     7.83%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
     5.38%  multi-fault-all  [kernel.kallsyms]      [k] __css_put
     5.29%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
     4.92%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
     4.24%  multi-fault-all  [kernel.kallsyms]      [k] up_read
     3.53%  multi-fault-all  [kernel.kallsyms]      [k] css_put
     2.11%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
     1.76%  multi-fault-all  [kernel.kallsyms]      [k] __rmqueue
     1.64%  multi-fault-all  [kernel.kallsyms]      [k] __mem_cgroup_commit_charge

[After]
    28.41%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
    10.08%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
     9.58%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
     9.38%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
     5.86%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
     5.65%  multi-fault-all  [kernel.kallsyms]      [k] up_read
     2.82%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
     2.64%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
     2.48%  multi-fault-all  [kernel.kallsyms]      [k] __mem_cgroup_commit_charge

The 8.02% spent in try_get_mem_cgroup_from_mm() disappears because this
patch removes the css_tryget() call in it.  (But yes, this is an extreme
case.)

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 76 additions and 43 deletions

... ... @@ -1714,28 +1714,66 @@
1714 1714 * thread group leader migrates. It's possible that mm is not
1715 1715 * set, if so charge the init_mm (happens for pagecache usage).
1716 1716 */
1717   - if (*memcg) {
  1717 + if (!*memcg && !mm)
  1718 + goto bypass;
  1719 +again:
  1720 + if (*memcg) { /* css should be a valid one */
1718 1721 mem = *memcg;
  1722 + VM_BUG_ON(css_is_removed(&mem->css));
  1723 + if (mem_cgroup_is_root(mem))
  1724 + goto done;
  1725 + if (consume_stock(mem))
  1726 + goto done;
1719 1727 css_get(&mem->css);
1720 1728 } else {
1721   - mem = try_get_mem_cgroup_from_mm(mm);
1722   - if (unlikely(!mem))
1723   - return 0;
1724   - *memcg = mem;
  1729 + struct task_struct *p;
  1730 +
  1731 + rcu_read_lock();
  1732 + p = rcu_dereference(mm->owner);
  1733 + VM_BUG_ON(!p);
  1734 + /*
  1735 + * because we don't have task_lock(), "p" can exit while
  1736 + * we're here. In that case, "mem" can point to root
  1737 + * cgroup but never be NULL. (and task_struct itself is freed
  1738 + * by RCU, cgroup itself is RCU safe.) Then, we have small
  1739 + * risk here to get wrong cgroup. But such kind of mis-account
  1740 + * by race always happens because we don't have cgroup_mutex().
  1741 + * It's overkill and we allow that small race, here.
  1742 + */
  1743 + mem = mem_cgroup_from_task(p);
  1744 + VM_BUG_ON(!mem);
  1745 + if (mem_cgroup_is_root(mem)) {
  1746 + rcu_read_unlock();
  1747 + goto done;
  1748 + }
  1749 + if (consume_stock(mem)) {
  1750 + /*
  1751 + * It seems dangerous to access memcg without css_get().
  1752 + * But considering how consume_stock works, it's not
  1753 + * necessary. If consume_stock succeeds, some charges
  1754 + * from this memcg are cached on this cpu. So, we
  1755 + * don't need to call css_get()/css_tryget() before
  1756 + * calling consume_stock().
  1757 + */
  1758 + rcu_read_unlock();
  1759 + goto done;
  1760 + }
  1761 + /* after here, we may be blocked. we need to get refcnt */
  1762 + if (!css_tryget(&mem->css)) {
  1763 + rcu_read_unlock();
  1764 + goto again;
  1765 + }
  1766 + rcu_read_unlock();
1725 1767 }
1726 1768  
1727   - VM_BUG_ON(css_is_removed(&mem->css));
1728   - if (mem_cgroup_is_root(mem))
1729   - goto done;
1730   -
1731 1769 do {
1732 1770 bool oom_check;
1733 1771  
1734   - if (consume_stock(mem))
1735   - goto done; /* don't need to fill stock */
1736 1772 /* If killed, bypass charge */
1737   - if (fatal_signal_pending(current))
  1773 + if (fatal_signal_pending(current)) {
  1774 + css_put(&mem->css);
1738 1775 goto bypass;
  1776 + }
1739 1777  
1740 1778 oom_check = false;
1741 1779 if (oom && !nr_oom_retries) {
... ... @@ -1750,30 +1788,36 @@
1750 1788 break;
1751 1789 case CHARGE_RETRY: /* not in OOM situation but retry */
1752 1790 csize = PAGE_SIZE;
1753   - break;
  1791 + css_put(&mem->css);
  1792 + mem = NULL;
  1793 + goto again;
1754 1794 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
  1795 + css_put(&mem->css);
1755 1796 goto nomem;
1756 1797 case CHARGE_NOMEM: /* OOM routine works */
1757   - if (!oom)
  1798 + if (!oom) {
  1799 + css_put(&mem->css);
1758 1800 goto nomem;
  1801 + }
1759 1802 /* If oom, we never return -ENOMEM */
1760 1803 nr_oom_retries--;
1761 1804 break;
1762 1805 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
  1806 + css_put(&mem->css);
1763 1807 goto bypass;
1764 1808 }
1765 1809 } while (ret != CHARGE_OK);
1766 1810  
1767 1811 if (csize > PAGE_SIZE)
1768 1812 refill_stock(mem, csize - PAGE_SIZE);
  1813 + css_put(&mem->css);
1769 1814 done:
  1815 + *memcg = mem;
1770 1816 return 0;
1771 1817 nomem:
1772   - css_put(&mem->css);
  1818 + *memcg = NULL;
1773 1819 return -ENOMEM;
1774 1820 bypass:
1775   - if (mem)
1776   - css_put(&mem->css);
1777 1821 *memcg = NULL;
1778 1822 return 0;
1779 1823 }
1780 1824  
... ... @@ -1790,11 +1834,7 @@
1790 1834 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1791 1835 if (do_swap_account)
1792 1836 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1793   - VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1794   - WARN_ON_ONCE(count > INT_MAX);
1795   - __css_put(&mem->css, (int)count);
1796 1837 }
1797   - /* we don't need css_put for root */
1798 1838 }
1799 1839  
1800 1840 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
... ... @@ -2155,7 +2195,6 @@
2155 2195 goto charge_cur_mm;
2156 2196 *ptr = mem;
2157 2197 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
2158   - /* drop extra refcnt from tryget */
2159 2198 css_put(&mem->css);
2160 2199 return ret;
2161 2200 charge_cur_mm:
... ... @@ -2325,10 +2364,6 @@
2325 2364 break;
2326 2365 }
2327 2366  
2328   - if (!mem_cgroup_is_root(mem))
2329   - __do_uncharge(mem, ctype);
2330   - if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2331   - mem_cgroup_swap_statistics(mem, true);
2332 2367 mem_cgroup_charge_statistics(mem, pc, false);
2333 2368  
2334 2369 ClearPageCgroupUsed(pc);
2335 2370  
... ... @@ -2340,11 +2375,17 @@
2340 2375 */
2341 2376  
2342 2377 unlock_page_cgroup(pc);
2343   -
  2378 + /*
  2379 + * even after unlock, we have mem->res.usage here and this memcg
  2380 + * will never be freed.
  2381 + */
2344 2382 memcg_check_events(mem, page);
2345   - /* at swapout, this memcg will be accessed to record to swap */
2346   - if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2347   - css_put(&mem->css);
  2383 + if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
  2384 + mem_cgroup_swap_statistics(mem, true);
  2385 + mem_cgroup_get(mem);
  2386 + }
  2387 + if (!mem_cgroup_is_root(mem))
  2388 + __do_uncharge(mem, ctype);
2348 2389  
2349 2390 return mem;
2350 2391  
2351 2392  
... ... @@ -2431,13 +2472,12 @@
2431 2472  
2432 2473 memcg = __mem_cgroup_uncharge_common(page, ctype);
2433 2474  
2434   - /* record memcg information */
2435   - if (do_swap_account && swapout && memcg) {
  2475 + /*
  2476 + * record memcg information, if swapout && memcg != NULL,
  2477 + * mem_cgroup_get() was called in uncharge().
  2478 + */
  2479 + if (do_swap_account && swapout && memcg)
2436 2480 swap_cgroup_record(ent, css_id(&memcg->css));
2437   - mem_cgroup_get(memcg);
2438   - }
2439   - if (swapout && memcg)
2440   - css_put(&memcg->css);
2441 2481 }
2442 2482 #endif
2443 2483  
... ... @@ -2515,7 +2555,6 @@
2515 2555 */
2516 2556 if (!mem_cgroup_is_root(to))
2517 2557 res_counter_uncharge(&to->res, PAGE_SIZE);
2518   - css_put(&to->css);
2519 2558 }
2520 2559 return 0;
2521 2560 }
... ... @@ -4214,9 +4253,6 @@
4214 4253 goto one_by_one;
4215 4254 }
4216 4255 mc.precharge += count;
4217   - VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
4218   - WARN_ON_ONCE(count > INT_MAX);
4219   - __css_get(&mem->css, (int)count);
4220 4256 return ret;
4221 4257 }
4222 4258 one_by_one:
... ... @@ -4452,7 +4488,6 @@
4452 4488 }
4453 4489 /* we must fixup refcnts and charges */
4454 4490 if (mc.moved_swap) {
4455   - WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4456 4491 /* uncharge swap account from the old cgroup */
4457 4492 if (!mem_cgroup_is_root(mc.from))
4458 4493 res_counter_uncharge(&mc.from->memsw,
... ... @@ -4466,8 +4501,6 @@
4466 4501 */
4467 4502 res_counter_uncharge(&mc.to->res,
4468 4503 PAGE_SIZE * mc.moved_swap);
4469   - VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4470   - __css_put(&mc.to->css, mc.moved_swap);
4471 4504 }
4472 4505 /* we've already done mem_cgroup_get(mc.to) */
4473 4506