Commit 4e649152cbaa1aedd01821d200ab9d597fe469e4

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent 3dece8347d

memcg: some modification to softlimit under hierarchical memory reclaim.

This patch cleans up and fixes memcg's uncharge soft limit path.

Problems:
  Currently, res_counter_charge()/uncharge() handle soft limit information
  at charge/uncharge time, and the soft limit check is done when the event
  counter per memcg goes over the limit. However, the event counter per
  memcg is updated only when memory usage is over the soft limit. With
  hierarchical memcg management, ancestors should be taken care of as well.

  Currently, ancestors (the hierarchy) are handled in charge() but not in
  uncharge(). This is not good.
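
  For context, this is roughly the per-level soft limit bookkeeping that
  used to sit inside the charge loop and that this patch removes; it is a
  condensed excerpt of the kernel/res_counter.c hunk shown below, not new
  code:

	for (c = counter; c != NULL; c = c->parent) {
		spin_lock(&c->lock);
		ret = res_counter_charge_locked(c, val);
		/*
		 * With soft limits, we return the highest ancestor
		 * that exceeds its soft limit
		 */
		if (soft_limit_fail_at &&
		    !res_counter_soft_limit_check_locked(c))
			*soft_limit_fail_at = c;
		spin_unlock(&c->lock);
		/* error handling for ret < 0 elided */
	}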

  Problems:
  1. memcg's event counter is incremented only when the soft limit is
     exceeded. That's bad: it makes the event counter hard to reuse for
     other purposes.

  2. At uncharge, only the lowest level res_counter is handled. This is a
     bug: because the ancestors' event counters are not incremented, the
     children have to take care of them.

  3. res_counter_uncharge()'s 3rd argument is NULL in most cases.
     Operations under res_counter->lock should be small; having no "if"
     statement there is better (the before/after prototypes are quoted
     after this list).
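
  For reference, the res_counter API change amounts to dropping the
  soft-limit out-parameters; the prototypes below are quoted from the
  include/linux/res_counter.h diff further down:

	/* before */
	int __must_check res_counter_charge(struct res_counter *counter,
				unsigned long val,
				struct res_counter **limit_fail_at,
				struct res_counter **soft_limit_at);
	void res_counter_uncharge(struct res_counter *counter, unsigned long val,
				bool *was_soft_limit_excess);

	/* after */
	int __must_check res_counter_charge(struct res_counter *counter,
				unsigned long val,
				struct res_counter **limit_fail_at);
	void res_counter_uncharge(struct res_counter *counter, unsigned long val);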

Fixes:
  * Removed the soft_limit_xx pointers and checks in charge and uncharge.
    The check-only-when-necessary scheme works well enough without them.

  * Make the memcg's event counter be incremented at every charge/uncharge
    (the per-cpu area will be accessed soon anyway).

  * All ancestors are checked at the soft-limit check. This is necessary
    because an ancestor's event counter may never be modified on its own;
    therefore they have to be checked at the same time (see the condensed
    loop sketched after this list).
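
  A condensed sketch of the resulting ancestor walk in
  mem_cgroup_update_tree(), taken from the mm/memcontrol.c hunk below
  (locking and setup are as in the real code; only the comments are
  shortened):

	for (; mem; mem = parent_mem_cgroup(mem)) {
		mz = mem_cgroup_zoneinfo(mem, nid, zid);
		new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
		/* update the tree if mz is on the RB-tree or mem is over its soft limit */
		if (new_usage_in_excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mem, mz, mctz);
			if (new_usage_in_excess)
				__mem_cgroup_insert_exceeded(mem, mz, mctz);
			else
				mz->usage_in_excess = 0;
			spin_unlock(&mctz->lock);
		}
	}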

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 54 additions and 83 deletions

include/linux/res_counter.h
... ... @@ -114,8 +114,7 @@
114 114 int __must_check res_counter_charge_locked(struct res_counter *counter,
115 115 unsigned long val);
116 116 int __must_check res_counter_charge(struct res_counter *counter,
117   - unsigned long val, struct res_counter **limit_fail_at,
118   - struct res_counter **soft_limit_at);
  117 + unsigned long val, struct res_counter **limit_fail_at);
119 118  
120 119 /*
121 120 * uncharge - tell that some portion of the resource is released
... ... @@ -128,8 +127,7 @@
128 127 */
129 128  
130 129 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
131   -void res_counter_uncharge(struct res_counter *counter, unsigned long val,
132   - bool *was_soft_limit_excess);
  130 +void res_counter_uncharge(struct res_counter *counter, unsigned long val);
133 131  
134 132 static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
135 133 {
kernel/res_counter.c
... ... @@ -37,27 +37,17 @@
37 37 }
38 38  
39 39 int res_counter_charge(struct res_counter *counter, unsigned long val,
40   - struct res_counter **limit_fail_at,
41   - struct res_counter **soft_limit_fail_at)
  40 + struct res_counter **limit_fail_at)
42 41 {
43 42 int ret;
44 43 unsigned long flags;
45 44 struct res_counter *c, *u;
46 45  
47 46 *limit_fail_at = NULL;
48   - if (soft_limit_fail_at)
49   - *soft_limit_fail_at = NULL;
50 47 local_irq_save(flags);
51 48 for (c = counter; c != NULL; c = c->parent) {
52 49 spin_lock(&c->lock);
53 50 ret = res_counter_charge_locked(c, val);
54   - /*
55   - * With soft limits, we return the highest ancestor
56   - * that exceeds its soft limit
57   - */
58   - if (soft_limit_fail_at &&
59   - !res_counter_soft_limit_check_locked(c))
60   - *soft_limit_fail_at = c;
61 51 spin_unlock(&c->lock);
62 52 if (ret < 0) {
63 53 *limit_fail_at = c;
... ... @@ -85,8 +75,7 @@
85 75 counter->usage -= val;
86 76 }
87 77  
88   -void res_counter_uncharge(struct res_counter *counter, unsigned long val,
89   - bool *was_soft_limit_excess)
  78 +void res_counter_uncharge(struct res_counter *counter, unsigned long val)
90 79 {
91 80 unsigned long flags;
92 81 struct res_counter *c;
... ... @@ -94,9 +83,6 @@
94 83 local_irq_save(flags);
95 84 for (c = counter; c != NULL; c = c->parent) {
96 85 spin_lock(&c->lock);
97   - if (was_soft_limit_excess)
98   - *was_soft_limit_excess =
99   - !res_counter_soft_limit_check_locked(c);
100 86 res_counter_uncharge_locked(c, val);
101 87 spin_unlock(&c->lock);
102 88 }
mm/memcontrol.c
... ... @@ -353,16 +353,6 @@
353 353 }
354 354  
355 355 static void
356   -mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
357   - struct mem_cgroup_per_zone *mz,
358   - struct mem_cgroup_tree_per_zone *mctz)
359   -{
360   - spin_lock(&mctz->lock);
361   - __mem_cgroup_insert_exceeded(mem, mz, mctz);
362   - spin_unlock(&mctz->lock);
363   -}
364   -
365   -static void
366 356 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
367 357 struct mem_cgroup_per_zone *mz,
368 358 struct mem_cgroup_tree_per_zone *mctz)
... ... @@ -392,35 +382,41 @@
392 382  
393 383 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
394 384 {
395   - unsigned long long prev_usage_in_excess, new_usage_in_excess;
396   - bool updated_tree = false;
  385 + unsigned long long new_usage_in_excess;
397 386 struct mem_cgroup_per_zone *mz;
398 387 struct mem_cgroup_tree_per_zone *mctz;
399   -
400   - mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
  388 + int nid = page_to_nid(page);
  389 + int zid = page_zonenum(page);
401 390 mctz = soft_limit_tree_from_page(page);
402 391  
403 392 /*
404   - * We do updates in lazy mode, mem's are removed
405   - * lazily from the per-zone, per-node rb tree
  393 + * Necessary to update all ancestors when hierarchy is used.
  394 + * because their event counter is not touched.
406 395 */
407   - prev_usage_in_excess = mz->usage_in_excess;
408   -
409   - new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
410   - if (prev_usage_in_excess) {
411   - mem_cgroup_remove_exceeded(mem, mz, mctz);
412   - updated_tree = true;
  396 + for (; mem; mem = parent_mem_cgroup(mem)) {
  397 + mz = mem_cgroup_zoneinfo(mem, nid, zid);
  398 + new_usage_in_excess =
  399 + res_counter_soft_limit_excess(&mem->res);
  400 + /*
  401 + * We have to update the tree if mz is on RB-tree or
  402 + * mem is over its softlimit.
  403 + */
  404 + if (new_usage_in_excess || mz->on_tree) {
  405 + spin_lock(&mctz->lock);
  406 + /* if on-tree, remove it */
  407 + if (mz->on_tree)
  408 + __mem_cgroup_remove_exceeded(mem, mz, mctz);
  409 + /*
  410 + * if over soft limit, insert again. mz->usage_in_excess
  411 + * will be updated properly.
  412 + */
  413 + if (new_usage_in_excess)
  414 + __mem_cgroup_insert_exceeded(mem, mz, mctz);
  415 + else
  416 + mz->usage_in_excess = 0;
  417 + spin_unlock(&mctz->lock);
  418 + }
413 419 }
414   - if (!new_usage_in_excess)
415   - goto done;
416   - mem_cgroup_insert_exceeded(mem, mz, mctz);
417   -
418   -done:
419   - if (updated_tree) {
420   - spin_lock(&mctz->lock);
421   - mz->usage_in_excess = new_usage_in_excess;
422   - spin_unlock(&mctz->lock);
423   - }
424 420 }
425 421  
426 422 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
... ... @@ -1271,9 +1267,9 @@
1271 1267 gfp_t gfp_mask, struct mem_cgroup **memcg,
1272 1268 bool oom, struct page *page)
1273 1269 {
1274   - struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
  1270 + struct mem_cgroup *mem, *mem_over_limit;
1275 1271 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1276   - struct res_counter *fail_res, *soft_fail_res = NULL;
  1272 + struct res_counter *fail_res;
1277 1273  
1278 1274 if (unlikely(test_thread_flag(TIF_MEMDIE))) {
1279 1275 /* Don't account this! */
... ... @@ -1305,17 +1301,16 @@
1305 1301  
1306 1302 if (mem_cgroup_is_root(mem))
1307 1303 goto done;
1308   - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
1309   - &soft_fail_res);
  1304 + ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
1310 1305 if (likely(!ret)) {
1311 1306 if (!do_swap_account)
1312 1307 break;
1313 1308 ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
1314   - &fail_res, NULL);
  1309 + &fail_res);
1315 1310 if (likely(!ret))
1316 1311 break;
1317 1312 /* mem+swap counter fails */
1318   - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
  1313 + res_counter_uncharge(&mem->res, PAGE_SIZE);
1319 1314 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1320 1315 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1321 1316 memsw);
... ... @@ -1354,16 +1349,11 @@
1354 1349 }
1355 1350 }
1356 1351 /*
1357   - * Insert just the ancestor, we should trickle down to the correct
1358   - * cgroup for reclaim, since the other nodes will be below their
1359   - * soft limit
  1352 + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
  1353 + * if they exceeds softlimit.
1360 1354 */
1361   - if (soft_fail_res) {
1362   - mem_over_soft_limit =
1363   - mem_cgroup_from_res_counter(soft_fail_res, res);
1364   - if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
1365   - mem_cgroup_update_tree(mem_over_soft_limit, page);
1366   - }
  1355 + if (mem_cgroup_soft_limit_check(mem))
  1356 + mem_cgroup_update_tree(mem, page);
1367 1357 done:
1368 1358 return 0;
1369 1359 nomem:
... ... @@ -1438,10 +1428,9 @@
1438 1428 if (unlikely(PageCgroupUsed(pc))) {
1439 1429 unlock_page_cgroup(pc);
1440 1430 if (!mem_cgroup_is_root(mem)) {
1441   - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
  1431 + res_counter_uncharge(&mem->res, PAGE_SIZE);
1442 1432 if (do_swap_account)
1443   - res_counter_uncharge(&mem->memsw, PAGE_SIZE,
1444   - NULL);
  1433 + res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1445 1434 }
1446 1435 css_put(&mem->css);
1447 1436 return;
... ... @@ -1520,7 +1509,7 @@
1520 1509 goto out;
1521 1510  
1522 1511 if (!mem_cgroup_is_root(from))
1523   - res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
  1512 + res_counter_uncharge(&from->res, PAGE_SIZE);
1524 1513 mem_cgroup_charge_statistics(from, pc, false);
1525 1514  
1526 1515 page = pc->page;
... ... @@ -1540,7 +1529,7 @@
1540 1529 }
1541 1530  
1542 1531 if (do_swap_account && !mem_cgroup_is_root(from))
1543   - res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
  1532 + res_counter_uncharge(&from->memsw, PAGE_SIZE);
1544 1533 css_put(&from->css);
1545 1534  
1546 1535 css_get(&to->css);
... ... @@ -1611,9 +1600,9 @@
1611 1600 css_put(&parent->css);
1612 1601 /* uncharge if move fails */
1613 1602 if (!mem_cgroup_is_root(parent)) {
1614   - res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
  1603 + res_counter_uncharge(&parent->res, PAGE_SIZE);
1615 1604 if (do_swap_account)
1616   - res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
  1605 + res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1617 1606 }
1618 1607 return ret;
1619 1608 }
... ... @@ -1804,8 +1793,7 @@
1804 1793 * calling css_tryget
1805 1794 */
1806 1795 if (!mem_cgroup_is_root(memcg))
1807   - res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
1808   - NULL);
  1796 + res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1809 1797 mem_cgroup_swap_statistics(memcg, false);
1810 1798 mem_cgroup_put(memcg);
1811 1799 }
... ... @@ -1832,9 +1820,9 @@
1832 1820 if (!mem)
1833 1821 return;
1834 1822 if (!mem_cgroup_is_root(mem)) {
1835   - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
  1823 + res_counter_uncharge(&mem->res, PAGE_SIZE);
1836 1824 if (do_swap_account)
1837   - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
  1825 + res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1838 1826 }
1839 1827 css_put(&mem->css);
1840 1828 }
... ... @@ -1849,7 +1837,6 @@
1849 1837 struct page_cgroup *pc;
1850 1838 struct mem_cgroup *mem = NULL;
1851 1839 struct mem_cgroup_per_zone *mz;
1852   - bool soft_limit_excess = false;
1853 1840  
1854 1841 if (mem_cgroup_disabled())
1855 1842 return NULL;
... ... @@ -1889,10 +1876,10 @@
1889 1876 }
1890 1877  
1891 1878 if (!mem_cgroup_is_root(mem)) {
1892   - res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
  1879 + res_counter_uncharge(&mem->res, PAGE_SIZE);
1893 1880 if (do_swap_account &&
1894 1881 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1895   - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
  1882 + res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1896 1883 }
1897 1884 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1898 1885 mem_cgroup_swap_statistics(mem, true);
... ... @@ -1909,7 +1896,7 @@
1909 1896 mz = page_cgroup_zoneinfo(pc);
1910 1897 unlock_page_cgroup(pc);
1911 1898  
1912   - if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
  1899 + if (mem_cgroup_soft_limit_check(mem))
1913 1900 mem_cgroup_update_tree(mem, page);
1914 1901 /* at swapout, this memcg will be accessed to record to swap */
1915 1902 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
... ... @@ -1987,7 +1974,7 @@
1987 1974 * This memcg can be obsolete one. We avoid calling css_tryget
1988 1975 */
1989 1976 if (!mem_cgroup_is_root(memcg))
1990   - res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
  1977 + res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1991 1978 mem_cgroup_swap_statistics(memcg, false);
1992 1979 mem_cgroup_put(memcg);
1993 1980 }