Commit 04046e1a0a34286382e913f8fc461440c21d88e8
Committed by
Linus Torvalds
1 parent
b4046f00ee
Exists in
master
and in
4 other branches
memcg: use CSS ID
Assign a CSS ID to each memcg and use css_get_next() for scanning the hierarchy.

Assume the following tree:

  group_A (ID=3)
    /01 (ID=4)
      /0A (ID=7)
    /02 (ID=10)
  group_B (ID=5)

and a task in group_A/01/0A hits the limit at group_A. Reclaim will be done in the following order (round-robin):

  group_A (3) -> group_A/01 (4) -> group_A/01/0A (7) -> group_A/02 (10) -> group_A -> ...

This is round-robin by ID. The last visited cgroup is recorded, and the scan restarts from it when reclaim starts again. (A smarter algorithm can be implemented.)

No cgroup_mutex or hierarchy_mutex is required.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 82 additions and 138 deletions Side-by-side Diff
mm/memcontrol.c
... | ... | @@ -95,6 +95,15 @@ |
95 | 95 | return ret; |
96 | 96 | } |
97 | 97 | |
98 | +static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) | |
99 | +{ | |
100 | + s64 ret; | |
101 | + | |
102 | + ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); | |
103 | + ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); | |
104 | + return ret; | |
105 | +} | |
106 | + | |
98 | 107 | /* |
99 | 108 | * per-zone information in memory controller. |
100 | 109 | */ |
101 | 110 | |
... | ... | @@ -154,9 +163,9 @@ |
154 | 163 | |
155 | 164 | /* |
156 | 165 | * While reclaiming in a hiearchy, we cache the last child we |
157 | - * reclaimed from. Protected by hierarchy_mutex | |
166 | + * reclaimed from. | |
158 | 167 | */ |
159 | - struct mem_cgroup *last_scanned_child; | |
168 | + int last_scanned_child; | |
160 | 169 | /* |
161 | 170 | * Should the accounting and control be hierarchical, per subtree? |
162 | 171 | */ |
... | ... | @@ -629,103 +638,6 @@ |
629 | 638 | #define mem_cgroup_from_res_counter(counter, member) \ |
630 | 639 | container_of(counter, struct mem_cgroup, member) |
631 | 640 | |
632 | -/* | |
633 | - * This routine finds the DFS walk successor. This routine should be | |
634 | - * called with hierarchy_mutex held | |
635 | - */ | |
636 | -static struct mem_cgroup * | |
637 | -__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) | |
638 | -{ | |
639 | - struct cgroup *cgroup, *curr_cgroup, *root_cgroup; | |
640 | - | |
641 | - curr_cgroup = curr->css.cgroup; | |
642 | - root_cgroup = root_mem->css.cgroup; | |
643 | - | |
644 | - if (!list_empty(&curr_cgroup->children)) { | |
645 | - /* | |
646 | - * Walk down to children | |
647 | - */ | |
648 | - cgroup = list_entry(curr_cgroup->children.next, | |
649 | - struct cgroup, sibling); | |
650 | - curr = mem_cgroup_from_cont(cgroup); | |
651 | - goto done; | |
652 | - } | |
653 | - | |
654 | -visit_parent: | |
655 | - if (curr_cgroup == root_cgroup) { | |
656 | - /* caller handles NULL case */ | |
657 | - curr = NULL; | |
658 | - goto done; | |
659 | - } | |
660 | - | |
661 | - /* | |
662 | - * Goto next sibling | |
663 | - */ | |
664 | - if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { | |
665 | - cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, | |
666 | - sibling); | |
667 | - curr = mem_cgroup_from_cont(cgroup); | |
668 | - goto done; | |
669 | - } | |
670 | - | |
671 | - /* | |
672 | - * Go up to next parent and next parent's sibling if need be | |
673 | - */ | |
674 | - curr_cgroup = curr_cgroup->parent; | |
675 | - goto visit_parent; | |
676 | - | |
677 | -done: | |
678 | - return curr; | |
679 | -} | |
680 | - | |
681 | -/* | |
682 | - * Visit the first child (need not be the first child as per the ordering | |
683 | - * of the cgroup list, since we track last_scanned_child) of @mem and use | |
684 | - * that to reclaim free pages from. | |
685 | - */ | |
686 | -static struct mem_cgroup * | |
687 | -mem_cgroup_get_next_node(struct mem_cgroup *root_mem) | |
688 | -{ | |
689 | - struct cgroup *cgroup; | |
690 | - struct mem_cgroup *orig, *next; | |
691 | - bool obsolete; | |
692 | - | |
693 | - /* | |
694 | - * Scan all children under the mem_cgroup mem | |
695 | - */ | |
696 | - mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); | |
697 | - | |
698 | - orig = root_mem->last_scanned_child; | |
699 | - obsolete = mem_cgroup_is_obsolete(orig); | |
700 | - | |
701 | - if (list_empty(&root_mem->css.cgroup->children)) { | |
702 | - /* | |
703 | - * root_mem might have children before and last_scanned_child | |
704 | - * may point to one of them. We put it later. | |
705 | - */ | |
706 | - if (orig) | |
707 | - VM_BUG_ON(!obsolete); | |
708 | - next = NULL; | |
709 | - goto done; | |
710 | - } | |
711 | - | |
712 | - if (!orig || obsolete) { | |
713 | - cgroup = list_first_entry(&root_mem->css.cgroup->children, | |
714 | - struct cgroup, sibling); | |
715 | - next = mem_cgroup_from_cont(cgroup); | |
716 | - } else | |
717 | - next = __mem_cgroup_get_next_node(orig, root_mem); | |
718 | - | |
719 | -done: | |
720 | - if (next) | |
721 | - mem_cgroup_get(next); | |
722 | - root_mem->last_scanned_child = next; | |
723 | - if (orig) | |
724 | - mem_cgroup_put(orig); | |
725 | - mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); | |
726 | - return (next) ? next : root_mem; | |
727 | -} | |
728 | - | |
729 | 641 | static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) |
730 | 642 | { |
731 | 643 | if (do_swap_account) { |
732 | 644 | |
733 | 645 | |
734 | 646 | |
735 | 647 | |
736 | 648 | |
737 | 649 | |
... | ... | @@ -755,46 +667,79 @@ |
755 | 667 | } |
756 | 668 | |
757 | 669 | /* |
758 | - * Dance down the hierarchy if needed to reclaim memory. We remember the | |
759 | - * last child we reclaimed from, so that we don't end up penalizing | |
760 | - * one child extensively based on its position in the children list. | |
670 | + * Visit the first child (need not be the first child as per the ordering | |
671 | + * of the cgroup list, since we track last_scanned_child) of @mem and use | |
672 | + * that to reclaim free pages from. | |
673 | + */ | |
674 | +static struct mem_cgroup * | |
675 | +mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |
676 | +{ | |
677 | + struct mem_cgroup *ret = NULL; | |
678 | + struct cgroup_subsys_state *css; | |
679 | + int nextid, found; | |
680 | + | |
681 | + if (!root_mem->use_hierarchy) { | |
682 | + css_get(&root_mem->css); | |
683 | + ret = root_mem; | |
684 | + } | |
685 | + | |
686 | + while (!ret) { | |
687 | + rcu_read_lock(); | |
688 | + nextid = root_mem->last_scanned_child + 1; | |
689 | + css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, | |
690 | + &found); | |
691 | + if (css && css_tryget(css)) | |
692 | + ret = container_of(css, struct mem_cgroup, css); | |
693 | + | |
694 | + rcu_read_unlock(); | |
695 | + /* Updates scanning parameter */ | |
696 | + spin_lock(&root_mem->reclaim_param_lock); | |
697 | + if (!css) { | |
698 | + /* this means start scan from ID:1 */ | |
699 | + root_mem->last_scanned_child = 0; | |
700 | + } else | |
701 | + root_mem->last_scanned_child = found; | |
702 | + spin_unlock(&root_mem->reclaim_param_lock); | |
703 | + } | |
704 | + | |
705 | + return ret; | |
706 | +} | |
707 | + | |
708 | +/* | |
709 | + * Scan the hierarchy if needed to reclaim memory. We remember the last child | |
710 | + * we reclaimed from, so that we don't end up penalizing one child extensively | |
711 | + * based on its position in the children list. | |
761 | 712 | * |
762 | 713 | * root_mem is the original ancestor that we've been reclaim from. |
714 | + * | |
715 | + * We give up and return to the caller when we visit root_mem twice. | |
716 | + * (other groups can be removed while we're walking....) | |
763 | 717 | */ |
764 | 718 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, |
765 | 719 | gfp_t gfp_mask, bool noswap) |
766 | 720 | { |
767 | - struct mem_cgroup *next_mem; | |
768 | - int ret = 0; | |
721 | + struct mem_cgroup *victim; | |
722 | + int ret, total = 0; | |
723 | + int loop = 0; | |
769 | 724 | |
770 | - /* | |
771 | - * Reclaim unconditionally and don't check for return value. | |
772 | - * We need to reclaim in the current group and down the tree. | |
773 | - * One might think about checking for children before reclaiming, | |
774 | - * but there might be left over accounting, even after children | |
775 | - * have left. | |
776 | - */ | |
777 | - ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, | |
778 | - get_swappiness(root_mem)); | |
779 | - if (mem_cgroup_check_under_limit(root_mem)) | |
780 | - return 1; /* indicate reclaim has succeeded */ | |
781 | - if (!root_mem->use_hierarchy) | |
782 | - return ret; | |
783 | - | |
784 | - next_mem = mem_cgroup_get_next_node(root_mem); | |
785 | - | |
786 | - while (next_mem != root_mem) { | |
787 | - if (mem_cgroup_is_obsolete(next_mem)) { | |
788 | - next_mem = mem_cgroup_get_next_node(root_mem); | |
725 | + while (loop < 2) { | |
726 | + victim = mem_cgroup_select_victim(root_mem); | |
727 | + if (victim == root_mem) | |
728 | + loop++; | |
729 | + if (!mem_cgroup_local_usage(&victim->stat)) { | |
730 | + /* this cgroup's local usage == 0 */ | |
731 | + css_put(&victim->css); | |
789 | 732 | continue; |
790 | 733 | } |
791 | - ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, | |
792 | - get_swappiness(next_mem)); | |
734 | + /* we use swappiness of local cgroup */ | |
735 | + ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, | |
736 | + get_swappiness(victim)); | |
737 | + css_put(&victim->css); | |
738 | + total += ret; | |
793 | 739 | if (mem_cgroup_check_under_limit(root_mem)) |
794 | - return 1; /* indicate reclaim has succeeded */ | |
795 | - next_mem = mem_cgroup_get_next_node(root_mem); | |
740 | + return 1 + total; | |
796 | 741 | } |
797 | - return ret; | |
742 | + return total; | |
798 | 743 | } |
799 | 744 | |
800 | 745 | bool mem_cgroup_oom_called(struct task_struct *task) |
801 | 746 | |
... | ... | @@ -1324,8 +1269,8 @@ |
1324 | 1269 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
1325 | 1270 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) |
1326 | 1271 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); |
1327 | - | |
1328 | 1272 | mem_cgroup_charge_statistics(mem, pc, false); |
1273 | + | |
1329 | 1274 | ClearPageCgroupUsed(pc); |
1330 | 1275 | /* |
1331 | 1276 | * pc->mem_cgroup is not cleared here. It will be accessed when it's |
... | ... | @@ -2178,6 +2123,8 @@ |
2178 | 2123 | { |
2179 | 2124 | int node; |
2180 | 2125 | |
2126 | + free_css_id(&mem_cgroup_subsys, &mem->css); | |
2127 | + | |
2181 | 2128 | for_each_node_state(node, N_POSSIBLE) |
2182 | 2129 | free_mem_cgroup_per_zone_info(mem, node); |
2183 | 2130 | |
2184 | 2131 | |
... | ... | @@ -2228,11 +2175,12 @@ |
2228 | 2175 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
2229 | 2176 | { |
2230 | 2177 | struct mem_cgroup *mem, *parent; |
2178 | + long error = -ENOMEM; | |
2231 | 2179 | int node; |
2232 | 2180 | |
2233 | 2181 | mem = mem_cgroup_alloc(); |
2234 | 2182 | if (!mem) |
2235 | - return ERR_PTR(-ENOMEM); | |
2183 | + return ERR_PTR(error); | |
2236 | 2184 | |
2237 | 2185 | for_each_node_state(node, N_POSSIBLE) |
2238 | 2186 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
... | ... | @@ -2260,7 +2208,7 @@ |
2260 | 2208 | res_counter_init(&mem->res, NULL); |
2261 | 2209 | res_counter_init(&mem->memsw, NULL); |
2262 | 2210 | } |
2263 | - mem->last_scanned_child = NULL; | |
2211 | + mem->last_scanned_child = 0; | |
2264 | 2212 | spin_lock_init(&mem->reclaim_param_lock); |
2265 | 2213 | |
2266 | 2214 | if (parent) |
... | ... | @@ -2269,7 +2217,7 @@ |
2269 | 2217 | return &mem->css; |
2270 | 2218 | free_out: |
2271 | 2219 | __mem_cgroup_free(mem); |
2272 | - return ERR_PTR(-ENOMEM); | |
2220 | + return ERR_PTR(error); | |
2273 | 2221 | } |
2274 | 2222 | |
2275 | 2223 | static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, |
2276 | 2224 | |
... | ... | @@ -2284,12 +2232,7 @@ |
2284 | 2232 | struct cgroup *cont) |
2285 | 2233 | { |
2286 | 2234 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2287 | - struct mem_cgroup *last_scanned_child = mem->last_scanned_child; | |
2288 | 2235 | |
2289 | - if (last_scanned_child) { | |
2290 | - VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child)); | |
2291 | - mem_cgroup_put(last_scanned_child); | |
2292 | - } | |
2293 | 2236 | mem_cgroup_put(mem); |
2294 | 2237 | } |
2295 | 2238 | |
... | ... | @@ -2328,6 +2271,7 @@ |
2328 | 2271 | .populate = mem_cgroup_populate, |
2329 | 2272 | .attach = mem_cgroup_move_task, |
2330 | 2273 | .early_init = 0, |
2274 | + .use_id = 1, | |
2331 | 2275 | }; |
2332 | 2276 | |
2333 | 2277 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |