Commit 0c3e73e84fe3f64cf1c2e8bb4e91e8901cbcdc38

Authored by Balbir Singh
Committed by Linus Torvalds
1 parent 4e41695356

memcg: improve resource counter scalability

Reduce the resource counter overhead (mostly spinlock) associated with the
root cgroup.  This is part of a series of patches to reduce mem cgroup
overhead.  I had posted other approaches earlier (including using percpu
counters); those patches are a natural addition and will be layered
iteratively on top of this one.

The patch stops resource counter accounting for the root cgroup.  The data
for display is derived from the statistics we maintain via
mem_cgroup_charge_statistics() (which is more scalable).  Today we do
double accounting: once using res_counter_charge() and once using
mem_cgroup_charge_statistics().  For the root, since we no longer implement
limits, we don't need to track every charge via res_counter_charge(),
check whether the limit is exceeded, and reclaim.
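
The charge-path change boils down to the early exit sketched below.  This is
a condensed illustration that folds the try_charge and commit_charge steps
from the diff into one hypothetical helper; the reclaim/retry loop, OOM
handling and css reference counting are left out, so it shows the shape of
the fast path rather than a drop-in function.

    /*
     * Condensed sketch: the root cgroup bypasses the res_counter (and
     * its spinlock) entirely, while the per-cpu statistics are still
     * updated for every cgroup, root included.
     */
    static int try_charge_sketch(struct mem_cgroup *mem,
                                 struct page_cgroup *pc)
    {
            struct res_counter *fail_res, *soft_fail_res = NULL;

            if (!mem_cgroup_is_root(mem)) {
                    int ret = res_counter_charge(&mem->res, PAGE_SIZE,
                                                 &fail_res, &soft_fail_res);
                    if (ret)
                            return ret;     /* over limit: caller must reclaim */
            }
            mem_cgroup_charge_statistics(mem, pc, true);
            return 0;
    }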

The main mem->res usage_in_bytes can be derived by summing the cache and
rss usage data from the memory statistics (MEM_CGROUP_STAT_RSS and
MEM_CGROUP_STAT_CACHE).  However, for memsw->res usage_in_bytes we need
additional data about swapped-out memory.  This patch adds a
MEM_CGROUP_STAT_SWAPOUT statistic and uses it along with
MEM_CGROUP_STAT_RSS and MEM_CGROUP_STAT_CACHE to derive the memsw data.
This data is computed recursively when hierarchy is enabled.
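
A minimal, self-contained sketch of that derivation follows (plain
user-space C, not kernel code): the stat indices and the page-to-byte shift
mirror the patch, while NR_CPUS, PAGE_SHIFT and the sample page counts are
illustrative assumptions, and the recursive walk over child cgroups done by
mem_cgroup_walk_tree() is omitted.

    #include <stdio.h>
    #include <stdint.h>

    #define NR_CPUS    4            /* illustrative */
    #define PAGE_SHIFT 12           /* 4K pages assumed */

    enum stat_index { STAT_CACHE, STAT_RSS, STAT_SWAPOUT, STAT_NSTATS };

    struct cpu_stat   { int64_t count[STAT_NSTATS]; };
    struct memcg_stat { struct cpu_stat cpustat[NR_CPUS]; };

    /* Sum one statistic across all CPUs, like mem_cgroup_read_stat(). */
    static int64_t read_stat(const struct memcg_stat *s, enum stat_index idx)
    {
            int64_t val = 0;
            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    val += s->cpustat[cpu].count[idx];
            return val;
    }

    int main(void)
    {
            struct memcg_stat root = { 0 };

            /* Pretend a few CPUs have charged some pages. */
            root.cpustat[0].count[STAT_RSS]     = 100;
            root.cpustat[1].count[STAT_CACHE]   =  50;
            root.cpustat[2].count[STAT_SWAPOUT] =  10;

            /* memory.usage_in_bytes for root: (cache + rss) pages, in bytes. */
            uint64_t mem_usage = (uint64_t)(read_stat(&root, STAT_CACHE) +
                                            read_stat(&root, STAT_RSS)) << PAGE_SHIFT;

            /* memory.memsw.usage_in_bytes additionally counts swapped-out pages. */
            uint64_t memsw_usage = (uint64_t)(read_stat(&root, STAT_CACHE) +
                                              read_stat(&root, STAT_RSS) +
                                              read_stat(&root, STAT_SWAPOUT)) << PAGE_SHIFT;

            printf("memory.usage_in_bytes:       %llu\n",
                   (unsigned long long)mem_usage);
            printf("memory.memsw.usage_in_bytes: %llu\n",
                   (unsigned long long)memsw_usage);
            return 0;
    }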

The test results I see on a 24-way system show that:

1. The lock contention disappears from /proc/lock_stats
2. The results of the test are comparable to running with
   cgroup_disable=memory.

Here is a sample of my program runs:

Without Patch

 Performance counter stats for '/home/balbir/parallel_pagefault':

 7192804.124144  task-clock-msecs         #     23.937 CPUs
         424691  context-switches         #      0.000 M/sec
            267  CPU-migrations           #      0.000 M/sec
       28498113  page-faults              #      0.004 M/sec
  5826093739340  cycles                   #    809.989 M/sec
   408883496292  instructions             #      0.070 IPC
     7057079452  cache-references         #      0.981 M/sec
     3036086243  cache-misses             #      0.422 M/sec

  300.485365680  seconds time elapsed

With cgroup_disable=memory

 Performance counter stats for '/home/balbir/parallel_pagefault':

 7182183.546587  task-clock-msecs         #     23.915 CPUs
         425458  context-switches         #      0.000 M/sec
            203  CPU-migrations           #      0.000 M/sec
       92545093  page-faults              #      0.013 M/sec
  6034363609986  cycles                   #    840.185 M/sec
   437204346785  instructions             #      0.072 IPC
     6636073192  cache-references         #      0.924 M/sec
     2358117732  cache-misses             #      0.328 M/sec

  300.320905827  seconds time elapsed

With this patch applied

 Performance counter stats for '/home/balbir/parallel_pagefault':

 7191619.223977  task-clock-msecs         #     23.955 CPUs
         422579  context-switches         #      0.000 M/sec
             88  CPU-migrations           #      0.000 M/sec
       91946060  page-faults              #      0.013 M/sec
  5957054385619  cycles                   #    828.333 M/sec
  1058117350365  instructions             #      0.178 IPC
     9161776218  cache-references         #      1.274 M/sec
     1920494280  cache-misses             #      0.267 M/sec

  300.218764862  seconds time elapsed

Data from Prarit (kernel compile with make -j64 on a 64-CPU/32GB machine):

For a single run

Without patch

real 27m8.988s
user 87m24.916s
sys 382m6.037s

With patch

real    4m18.607s
user    84m58.943s
sys     50m52.682s

With config turned off

real    4m54.972s
user    90m13.456s
sys     50m19.711s

NOTE: The data looks counterintuitive because performance with the patch
is better even than with the config turned off.  We probably need more
runs, but so far all testing has shown that the patches definitely help.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 100 additions and 21 deletions

... ... @@ -70,6 +70,7 @@
70 70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
71 71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
  73 + MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
73 74  
74 75 MEM_CGROUP_STAT_NSTATS,
75 76 };
76 77  
... ... @@ -478,11 +479,24 @@
478 479 return mz;
479 480 }
480 481  
  482 +static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
  483 + bool charge)
  484 +{
  485 + int val = (charge) ? 1 : -1;
  486 + struct mem_cgroup_stat *stat = &mem->stat;
  487 + struct mem_cgroup_stat_cpu *cpustat;
  488 + int cpu = get_cpu();
  489 +
  490 + cpustat = &stat->cpustat[cpu];
  491 + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
  492 + put_cpu();
  493 +}
  494 +
481 495 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
482 496 struct page_cgroup *pc,
483 497 bool charge)
484 498 {
485   - int val = (charge)? 1 : -1;
  499 + int val = (charge) ? 1 : -1;
486 500 struct mem_cgroup_stat *stat = &mem->stat;
487 501 struct mem_cgroup_stat_cpu *cpustat;
488 502 int cpu = get_cpu();
489 503  
... ... @@ -1285,9 +1299,11 @@
1285 1299 VM_BUG_ON(css_is_removed(&mem->css));
1286 1300  
1287 1301 while (1) {
1288   - int ret;
  1302 + int ret = 0;
1289 1303 unsigned long flags = 0;
1290 1304  
  1305 + if (mem_cgroup_is_root(mem))
  1306 + goto done;
1291 1307 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
1292 1308 &soft_fail_res);
1293 1309 if (likely(!ret)) {
... ... @@ -1347,6 +1363,7 @@
1347 1363 if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
1348 1364 mem_cgroup_update_tree(mem_over_soft_limit, page);
1349 1365 }
  1366 +done:
1350 1367 return 0;
1351 1368 nomem:
1352 1369 css_put(&mem->css);
... ... @@ -1419,9 +1436,12 @@
1419 1436 lock_page_cgroup(pc);
1420 1437 if (unlikely(PageCgroupUsed(pc))) {
1421 1438 unlock_page_cgroup(pc);
1422   - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1423   - if (do_swap_account)
1424   - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
  1439 + if (!mem_cgroup_is_root(mem)) {
  1440 + res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
  1441 + if (do_swap_account)
  1442 + res_counter_uncharge(&mem->memsw, PAGE_SIZE,
  1443 + NULL);
  1444 + }
1425 1445 css_put(&mem->css);
1426 1446 return;
1427 1447 }
... ... @@ -1498,7 +1518,8 @@
1498 1518 if (pc->mem_cgroup != from)
1499 1519 goto out;
1500 1520  
1501   - res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
  1521 + if (!mem_cgroup_is_root(from))
  1522 + res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
1502 1523 mem_cgroup_charge_statistics(from, pc, false);
1503 1524  
1504 1525 page = pc->page;
... ... @@ -1517,7 +1538,7 @@
1517 1538 1);
1518 1539 }
1519 1540  
1520   - if (do_swap_account)
  1541 + if (do_swap_account && !mem_cgroup_is_root(from))
1521 1542 res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
1522 1543 css_put(&from->css);
1523 1544  
... ... @@ -1588,9 +1609,11 @@
1588 1609 /* drop extra refcnt by try_charge() */
1589 1610 css_put(&parent->css);
1590 1611 /* uncharge if move fails */
1591   - res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
1592   - if (do_swap_account)
1593   - res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
  1612 + if (!mem_cgroup_is_root(parent)) {
  1613 + res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
  1614 + if (do_swap_account)
  1615 + res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
  1616 + }
1594 1617 return ret;
1595 1618 }
1596 1619  
... ... @@ -1779,7 +1802,10 @@
1779 1802 * This recorded memcg can be obsolete one. So, avoid
1780 1803 * calling css_tryget
1781 1804 */
1782   - res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
  1805 + if (!mem_cgroup_is_root(memcg))
  1806 + res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
  1807 + NULL);
  1808 + mem_cgroup_swap_statistics(memcg, false);
1783 1809 mem_cgroup_put(memcg);
1784 1810 }
1785 1811 rcu_read_unlock();
... ... @@ -1804,9 +1830,11 @@
1804 1830 return;
1805 1831 if (!mem)
1806 1832 return;
1807   - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1808   - if (do_swap_account)
1809   - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
  1833 + if (!mem_cgroup_is_root(mem)) {
  1834 + res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
  1835 + if (do_swap_account)
  1836 + res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
  1837 + }
1810 1838 css_put(&mem->css);
1811 1839 }
1812 1840  
... ... @@ -1859,9 +1887,14 @@
1859 1887 break;
1860 1888 }
1861 1889  
1862   - res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
1863   - if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1864   - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
  1890 + if (!mem_cgroup_is_root(mem)) {
  1891 + res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
  1892 + if (do_swap_account &&
  1893 + (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
  1894 + res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
  1895 + }
  1896 + if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
  1897 + mem_cgroup_swap_statistics(mem, true);
1865 1898 mem_cgroup_charge_statistics(mem, pc, false);
1866 1899  
1867 1900 ClearPageCgroupUsed(pc);
... ... @@ -1952,7 +1985,9 @@
1952 1985 * We uncharge this because swap is freed.
1953 1986 * This memcg can be obsolete one. We avoid calling css_tryget
1954 1987 */
1955   - res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
  1988 + if (!mem_cgroup_is_root(memcg))
  1989 + res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
  1990 + mem_cgroup_swap_statistics(memcg, false);
1956 1991 mem_cgroup_put(memcg);
1957 1992 }
1958 1993 rcu_read_unlock();
1959 1994  
1960 1995  
1961 1996  
... ... @@ -2464,20 +2499,64 @@
2464 2499 return retval;
2465 2500 }
2466 2501  
  2502 +struct mem_cgroup_idx_data {
  2503 + s64 val;
  2504 + enum mem_cgroup_stat_index idx;
  2505 +};
  2506 +
  2507 +static int
  2508 +mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
  2509 +{
  2510 + struct mem_cgroup_idx_data *d = data;
  2511 + d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
  2512 + return 0;
  2513 +}
  2514 +
  2515 +static void
  2516 +mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
  2517 + enum mem_cgroup_stat_index idx, s64 *val)
  2518 +{
  2519 + struct mem_cgroup_idx_data d;
  2520 + d.idx = idx;
  2521 + d.val = 0;
  2522 + mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
  2523 + *val = d.val;
  2524 +}
  2525 +
2467 2526 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2468 2527 {
2469 2528 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2470   - u64 val = 0;
  2529 + u64 idx_val, val;
2471 2530 int type, name;
2472 2531  
2473 2532 type = MEMFILE_TYPE(cft->private);
2474 2533 name = MEMFILE_ATTR(cft->private);
2475 2534 switch (type) {
2476 2535 case _MEM:
2477   - val = res_counter_read_u64(&mem->res, name);
  2536 + if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
  2537 + mem_cgroup_get_recursive_idx_stat(mem,
  2538 + MEM_CGROUP_STAT_CACHE, &idx_val);
  2539 + val = idx_val;
  2540 + mem_cgroup_get_recursive_idx_stat(mem,
  2541 + MEM_CGROUP_STAT_RSS, &idx_val);
  2542 + val += idx_val;
  2543 + val <<= PAGE_SHIFT;
  2544 + } else
  2545 + val = res_counter_read_u64(&mem->res, name);
2478 2546 break;
2479 2547 case _MEMSWAP:
2480   - val = res_counter_read_u64(&mem->memsw, name);
  2548 + if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
  2549 + mem_cgroup_get_recursive_idx_stat(mem,
  2550 + MEM_CGROUP_STAT_CACHE, &idx_val);
  2551 + val = idx_val;
  2552 + mem_cgroup_get_recursive_idx_stat(mem,
  2553 + MEM_CGROUP_STAT_RSS, &idx_val);
  2554 + val += idx_val;
  2555 + mem_cgroup_get_recursive_idx_stat(mem,
  2556 + MEM_CGROUP_STAT_SWAPOUT, &idx_val);
  2557 + val <<= PAGE_SHIFT;
  2558 + } else
  2559 + val = res_counter_read_u64(&mem->memsw, name);
2481 2560 break;
2482 2561 default:
2483 2562 BUG();