Commit 81d39c20f5ee2437d71709beb82597e2a38efbbc
Committed by
Linus Torvalds
1 parent
14067bb3e2
Exists in
master
and in
4 other branches
memcg: fix shrinking memory to return -EBUSY by fixing retry algorithm
As pointed out, shrinking a memcg's limit should return -EBUSY after a reasonable number of retries. This patch tries to fix the current behavior of shrink_usage. Before looking into the "shrink should return -EBUSY" problem, we should fix the hierarchical reclaim code. It compares current usage and current limit, but that comparison only makes sense when the kernel is reclaiming memory because a limit was hit. This is also a problem. What this patch does: 1. Add a new argument "shrink" to hierarchical reclaim. If "shrink==true", hierarchical reclaim returns immediately and the caller checks whether the kernel should shrink more or not. (When shrinking memory, usage is always smaller than the limit, so the usage < limit check is useless.) 2. To adjust to the above change, make two changes in "shrink"'s retry path: 2-a. retry_count depends on the number of children, because the kernel visits the children under the hierarchy one by one. 2-b. Rather than checking the return value of hierarchical_reclaim for progress, compare usage-before-shrink and usage-after-shrink. If usage-before-shrink <= usage-after-shrink, retry_count is decremented. Reported-by: Li Zefan <lizf@cn.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 59 additions and 12 deletions Side-by-side Diff
mm/memcontrol.c
... | ... | @@ -702,7 +702,24 @@ |
702 | 702 | return swappiness; |
703 | 703 | } |
704 | 704 | |
705 | +static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) | |
706 | +{ | |
707 | + int *val = data; | |
708 | + (*val)++; | |
709 | + return 0; | |
710 | +} | |
705 | 711 | /* |
712 | + * This function returns the number of memcg under hierarchy tree. Returns | |
713 | + * 1(self count) if no children. | |
714 | + */ | |
715 | +static int mem_cgroup_count_children(struct mem_cgroup *mem) | |
716 | +{ | |
717 | + int num = 0; | |
718 | + mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); | |
719 | + return num; | |
720 | +} | |
721 | + | |
722 | +/* | |
706 | 723 | * Visit the first child (need not be the first child as per the ordering |
707 | 724 | * of the cgroup list, since we track last_scanned_child) of @mem and use |
708 | 725 | * that to reclaim free pages from. |
709 | 726 | |
... | ... | @@ -750,9 +767,11 @@ |
750 | 767 | * |
751 | 768 | * We give up and return to the caller when we visit root_mem twice. |
752 | 769 | * (other groups can be removed while we're walking....) |
770 | + * | |
771 | + * If shrink==true, for avoiding to free too much, this returns immedieately. | |
753 | 772 | */ |
754 | 773 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, |
755 | - gfp_t gfp_mask, bool noswap) | |
774 | + gfp_t gfp_mask, bool noswap, bool shrink) | |
756 | 775 | { |
757 | 776 | struct mem_cgroup *victim; |
758 | 777 | int ret, total = 0; |
... | ... | @@ -771,6 +790,13 @@ |
771 | 790 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, |
772 | 791 | get_swappiness(victim)); |
773 | 792 | css_put(&victim->css); |
793 | + /* | |
794 | + * At shrinking usage, we can't check we should stop here or | |
795 | + * reclaim more. It's depends on callers. last_scanned_child | |
796 | + * will work enough for keeping fairness under tree. | |
797 | + */ | |
798 | + if (shrink) | |
799 | + return ret; | |
774 | 800 | total += ret; |
775 | 801 | if (mem_cgroup_check_under_limit(root_mem)) |
776 | 802 | return 1 + total; |
... | ... | @@ -856,7 +882,7 @@ |
856 | 882 | goto nomem; |
857 | 883 | |
858 | 884 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, |
859 | - noswap); | |
885 | + noswap, false); | |
860 | 886 | if (ret) |
861 | 887 | continue; |
862 | 888 | |
... | ... | @@ -1489,7 +1515,8 @@ |
1489 | 1515 | return 0; |
1490 | 1516 | |
1491 | 1517 | do { |
1492 | - progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); | |
1518 | + progress = mem_cgroup_hierarchical_reclaim(mem, | |
1519 | + gfp_mask, true, false); | |
1493 | 1520 | progress += mem_cgroup_check_under_limit(mem); |
1494 | 1521 | } while (!progress && --retry); |
1495 | 1522 | |
1496 | 1523 | |
1497 | 1524 | |
... | ... | @@ -1504,12 +1531,22 @@ |
1504 | 1531 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
1505 | 1532 | unsigned long long val) |
1506 | 1533 | { |
1507 | - | |
1508 | - int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | |
1534 | + int retry_count; | |
1509 | 1535 | int progress; |
1510 | 1536 | u64 memswlimit; |
1511 | 1537 | int ret = 0; |
1538 | + int children = mem_cgroup_count_children(memcg); | |
1539 | + u64 curusage, oldusage; | |
1512 | 1540 | |
1541 | + /* | |
1542 | + * For keeping hierarchical_reclaim simple, how long we should retry | |
1543 | + * is depends on callers. We set our retry-count to be function | |
1544 | + * of # of children which we should visit in this loop. | |
1545 | + */ | |
1546 | + retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; | |
1547 | + | |
1548 | + oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); | |
1549 | + | |
1513 | 1550 | while (retry_count) { |
1514 | 1551 | if (signal_pending(current)) { |
1515 | 1552 | ret = -EINTR; |
... | ... | @@ -1534,8 +1571,13 @@ |
1534 | 1571 | break; |
1535 | 1572 | |
1536 | 1573 | progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, |
1537 | - false); | |
1538 | - if (!progress) retry_count--; | |
1574 | + false, true); | |
1575 | + curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | |
1576 | + /* Usage is reduced ? */ | |
1577 | + if (curusage >= oldusage) | |
1578 | + retry_count--; | |
1579 | + else | |
1580 | + oldusage = curusage; | |
1539 | 1581 | } |
1540 | 1582 | |
1541 | 1583 | return ret; |
1542 | 1584 | |
1543 | 1585 | |
... | ... | @@ -1544,13 +1586,16 @@ |
1544 | 1586 | int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, |
1545 | 1587 | unsigned long long val) |
1546 | 1588 | { |
1547 | - int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | |
1589 | + int retry_count; | |
1548 | 1590 | u64 memlimit, oldusage, curusage; |
1549 | - int ret; | |
1591 | + int children = mem_cgroup_count_children(memcg); | |
1592 | + int ret = -EBUSY; | |
1550 | 1593 | |
1551 | 1594 | if (!do_swap_account) |
1552 | 1595 | return -EINVAL; |
1553 | - | |
1596 | + /* see mem_cgroup_resize_res_limit */ | |
1597 | + retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | |
1598 | + oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | |
1554 | 1599 | while (retry_count) { |
1555 | 1600 | if (signal_pending(current)) { |
1556 | 1601 | ret = -EINTR; |
1557 | 1602 | |
1558 | 1603 | |
... | ... | @@ -1574,11 +1619,13 @@ |
1574 | 1619 | if (!ret) |
1575 | 1620 | break; |
1576 | 1621 | |
1577 | - oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | |
1578 | - mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true); | |
1622 | + mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); | |
1579 | 1623 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
1624 | + /* Usage is reduced ? */ | |
1580 | 1625 | if (curusage >= oldusage) |
1581 | 1626 | retry_count--; |
1627 | + else | |
1628 | + oldusage = curusage; | |
1582 | 1629 | } |
1583 | 1630 | return ret; |
1584 | 1631 | } |