Commit cdec2e4265dfa09490601b00aeabd8a8d4af30f0
Committed by: Linus Torvalds
Parent: 569b846df5
Exists in: master and 4 other branches
memcg: coalesce charging via percpu storage
This is a patch for coalescing access to the res_counter at charge time by
percpu caching. At charge, memcg charges a batch of pages (CHARGE_SIZE, 32
pages here) and remembers the surplus in a percpu cache. Because it is a
cache, it is drained/flushed when necessary.

This version uses the public percpu area, which has two benefits:
 1. The sum of stocked charges in the system is limited to the number of
    cpus, not to the number of memcgs. This gives better synchronization.
 2. The drain code for flush/cpuhotplug is very easy (and quick).

The most important point of this patch is that we never touch the
res_counter in the fast path. The res_counter is a system-wide shared
counter which is modified very frequently, so we should avoid touching it
wherever we can to prevent false sharing.

On an x86-64 8-cpu server, I tested the overhead of memcg at page fault by
running a program which does map/fault/unmap in a loop, one task per cpu
via taskset, and summing the number of page faults over 60 seconds:

[without memcg config]
  40156968 page-faults  # 0.085 M/sec  ( +- 0.046% )
  27.67 cache-misses/fault
[root cgroup]
  36659599 page-faults  # 0.077 M/sec  ( +- 0.247% )
  31.58 cache-misses/fault
[in a child cgroup]
  18444157 page-faults  # 0.039 M/sec  ( +- 0.133% )
  69.96 cache-misses/fault
[ + coalescing uncharge patch]
  27133719 page-faults  # 0.057 M/sec  ( +- 0.155% )
  47.16 cache-misses/fault
[ + coalescing uncharge patch + this patch ]
  34224709 page-faults  # 0.072 M/sec  ( +- 0.173% )
  34.69 cache-misses/fault

Changelog (since Oct/2):
 - updated comments
 - replaced get_cpu_var() with __get_cpu_var() where possible
 - removed the mutex for the system-wide drain; added a counter instead
 - removed CONFIG_HOTPLUG_CPU

Changelog (old):
 - rebased onto the latest mmotm
 - moved the charge size check before the __GFP_WAIT check to avoid
   unnecessary charge failures
 - added an asynchronous flush routine
 - fixed bugs pointed out by Nishimura-san

[akpm@linux-foundation.org: tweak comments]
[nishimura@mxp.nes.nec.co.jp: don't do INIT_WORK() repeatedly against the same work_struct]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
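For illustration, the map/fault/unmap loop described above can be reproduced
with a small program along the lines of the sketch below; the mapping size,
the page stride, and the timeout(1)/perf(1) invocations are assumptions of
this sketch, not details taken from the original measurement.

/*
 * Sketch of the map/fault/unmap microbenchmark described above.
 * Run one instance per cpu for 60 seconds, e.g.:
 *   for c in $(seq 0 7); do timeout 60 taskset -c $c ./fault-loop & done
 * and collect counts with: perf stat -a -e page-faults,cache-misses sleep 60
 * MAP_LEN and the 4096-byte stride are arbitrary choices for this sketch.
 */
#include <stddef.h>
#include <sys/mman.h>

#define MAP_LEN	(8 * 1024 * 1024)

int main(void)
{
	for (;;) {
		char *p = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			return 1;
		/* touch each page once to take a minor fault */
		for (size_t off = 0; off < MAP_LEN; off += 4096)
			p[off] = 1;
		munmap(p, MAP_LEN);
	}
}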
1 changed file: mm/memcontrol.c, 156 additions and 6 deletions

--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -38,6 +38,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
+#include <linux/cpu.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -275,6 +276,7 @@
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
+static void drain_all_stock_async(void);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -1137,6 +1139,8 @@
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
 			loop++;
+			if (loop >= 1)
+				drain_all_stock_async();
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
@@ -1259,6 +1263,133 @@
 }
 
 /*
+ * Size of the first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: bigger values may be necessary on big iron.
+ */
+#define CHARGE_SIZE	(32 * PAGE_SIZE)
+struct memcg_stock_pcp {
+	struct mem_cgroup *cached; /* never the root cgroup */
+	int charge;
+	struct work_struct work;
+};
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static atomic_t memcg_drain_count;
+
+/*
+ * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is
+ * consumed from the local stock and true is returned. If the stock is 0 or
+ * holds charges from a cgroup other than the current target, false is
+ * returned; the stock will then be refilled.
+ */
+static bool consume_stock(struct mem_cgroup *mem)
+{
+	struct memcg_stock_pcp *stock;
+	bool ret = true;
+
+	stock = &get_cpu_var(memcg_stock);
+	if (mem == stock->cached && stock->charge)
+		stock->charge -= PAGE_SIZE;
+	else /* need to call res_counter_charge */
+		ret = false;
+	put_cpu_var(memcg_stock);
+	return ret;
+}
+
+/*
+ * Return stock cached in the percpu area to res_counter and reset cached info.
+ */
+static void drain_stock(struct memcg_stock_pcp *stock)
+{
+	struct mem_cgroup *old = stock->cached;
+
+	if (stock->charge) {
+		res_counter_uncharge(&old->res, stock->charge);
+		if (do_swap_account)
+			res_counter_uncharge(&old->memsw, stock->charge);
+	}
+	stock->cached = NULL;
+	stock->charge = 0;
+}
+
+/*
+ * This must be called with preemption disabled, or by a thread which is
+ * pinned to the local cpu.
+ */
+static void drain_local_stock(struct work_struct *dummy)
+{
+	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
+	drain_stock(stock);
+}
+
+/*
+ * Cache charges (val) obtained from res_counter in the local percpu area.
+ * They will be consumed later by consume_stock().
+ */
+static void refill_stock(struct mem_cgroup *mem, int val)
+{
+	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
+
+	if (stock->cached != mem) { /* reset if necessary */
+		drain_stock(stock);
+		stock->cached = mem;
+	}
+	stock->charge += val;
+	put_cpu_var(memcg_stock);
+}
+
+/*
+ * Try to drain stocked charges on other cpus. This function is asynchronous
+ * and just schedules one work item per cpu to drain locally on that cpu.
+ * Callers can expect some charges to be returned to res_counter later, but
+ * cannot wait for that to happen.
+ */
+static void drain_all_stock_async(void)
+{
+	int cpu;
+	/* This function schedules "drain" in an asynchronous way. The
+	 * result of "drain" is not directly handled by callers, so if a
+	 * drain is already in flight, we don't have to schedule another.
+	 * The WORK_STRUCT_PENDING check in queue_work_on() will catch any
+	 * race anyway; we just do a loose check here.
+	 */
+	if (atomic_read(&memcg_drain_count))
+		return;
+	/* Notify other cpus that a system-wide "drain" is running */
+	atomic_inc(&memcg_drain_count);
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+		schedule_work_on(cpu, &stock->work);
+	}
+	put_online_cpus();
+	atomic_dec(&memcg_drain_count);
+	/* We don't wait for flush_work */
+}
+
+/* This is a synchronous drain interface. */
+static void drain_all_stock_sync(void)
+{
+	/* called when force_empty is called */
+	atomic_inc(&memcg_drain_count);
+	schedule_on_each_cpu(drain_local_stock);
+	atomic_dec(&memcg_drain_count);
+}
+
+static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+					unsigned long action,
+					void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	struct memcg_stock_pcp *stock;
+
+	if (action != CPU_DEAD)
+		return NOTIFY_OK;
+	stock = &per_cpu(memcg_stock, cpu);
+	drain_stock(stock);
+	return NOTIFY_OK;
+}
+
+/*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
  */
@@ -1269,6 +1400,7 @@
 	struct mem_cgroup *mem, *mem_over_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct res_counter *fail_res;
+	int csize = CHARGE_SIZE;
 
 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
 		/* Don't account this! */
@@ -1293,23 +1425,25 @@
 		return 0;
 
 	VM_BUG_ON(css_is_removed(&mem->css));
+	if (mem_cgroup_is_root(mem))
+		goto done;
 
 	while (1) {
 		int ret = 0;
 		unsigned long flags = 0;
 
-		if (mem_cgroup_is_root(mem))
-			goto done;
-		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+		if (consume_stock(mem))
+			goto charged;
+
+		ret = res_counter_charge(&mem->res, csize, &fail_res);
 		if (likely(!ret)) {
 			if (!do_swap_account)
 				break;
-			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-							&fail_res);
+			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
 			if (likely(!ret))
 				break;
 			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
+			res_counter_uncharge(&mem->res, csize);
 			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									memsw);
@@ -1318,6 +1452,11 @@
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									res);
 
+		/* reduce request size and retry */
+		if (csize > PAGE_SIZE) {
+			csize = PAGE_SIZE;
+			continue;
+		}
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
@@ -1347,6 +1486,9 @@
 			goto nomem;
 		}
 	}
+	if (csize > PAGE_SIZE)
+		refill_stock(mem, csize - PAGE_SIZE);
+charged:
 	/*
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
 	 * if they exceeds softlimit.
@@ -2469,6 +2611,7 @@
 			goto out;
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
+		drain_all_stock_sync();
 		ret = 0;
 		for_each_node_state(node, N_HIGH_MEMORY) {
 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
@@ -3183,11 +3326,18 @@
 
 	/* root ? */
 	if (cont->parent == NULL) {
+		int cpu;
 		enable_swap_cgroup();
 		parent = NULL;
 		root_mem_cgroup = mem;
 		if (mem_cgroup_soft_limit_tree_init())
 			goto free_out;
+		for_each_possible_cpu(cpu) {
+			struct memcg_stock_pcp *stock =
+						&per_cpu(memcg_stock, cpu);
+			INIT_WORK(&stock->work, drain_local_stock);
+		}
+		hotcpu_notifier(memcg_stock_cpu_callback, 0);
 
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
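
For readers who want the technique without the kernel plumbing: the patch
replaces "touch a shared counter per page" with "charge a batch once, then
consume from a cpu-local stock". The user-space analogy below is a minimal
sketch of that fast-path/slow-path split; all names here (res_usage,
charge_one_page, the 1GB limit) are invented for the sketch, and it uses
thread-local rather than per-CPU storage.

/*
 * User-space analogy of the percpu-stock technique, not kernel code.
 * One shared atomic stands in for the res_counter; each thread charges
 * a CHARGE_SIZE batch once, then consumes PAGE_SIZE at a time from its
 * private stock, touching the shared cacheline only on refill.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE	4096L
#define CHARGE_SIZE	(32 * PAGE_SIZE)	/* same batch size as the patch */

static _Atomic long res_usage;			/* shared, contended counter */
static const long res_limit = 1L << 30;		/* arbitrary 1GB limit */
static _Thread_local long stock;		/* local cached charge */

/* Charge "val" bytes against the shared counter; fail over the limit. */
static bool res_counter_charge(long val)
{
	long old = atomic_fetch_add(&res_usage, val);
	if (old + val > res_limit) {
		atomic_fetch_sub(&res_usage, val);	/* roll back */
		return false;
	}
	return true;
}

/* Fast path: consume one page from the local stock when possible. */
static bool charge_one_page(void)
{
	if (stock >= PAGE_SIZE) {	/* no shared-counter access at all */
		stock -= PAGE_SIZE;
		return true;
	}
	/* Slow path: charge a whole batch and keep the surplus as stock. */
	if (res_counter_charge(CHARGE_SIZE)) {
		stock += CHARGE_SIZE - PAGE_SIZE;
		return true;
	}
	/* Near the limit: retry with a single page, as the patch does. */
	return res_counter_charge(PAGE_SIZE);
}

/* Return cached stock to the shared counter (cf. drain_stock() above). */
static void drain_local(void)
{
	atomic_fetch_sub(&res_usage, stock);
	stock = 0;
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		charge_one_page();
	drain_local();	/* usage is now exactly 100 pages */
	printf("usage: %ld bytes\n", atomic_load(&res_usage));
	return 0;
}

Unlike this sketch, the kernel version keys each stock to a memcg (a stock
cached for a different cgroup is drained before reuse) and drains stocks
from reclaim, force_empty, and cpu hotplug, as the diff above shows.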