Commit 04046e1a0a34286382e913f8fc461440c21d88e8

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent b4046f00ee

memcg: use CSS ID

Assign a CSS ID to each memcg and use css_get_next() for scanning the hierarchy.

	Assume the following tree.

	group_A (ID=3)
		/01 (ID=4)
		   /0A (ID=7)
		/02 (ID=10)
	group_B (ID=5)
	and a task in group_A/01/0A hits the limit at group_A.

	Reclaim will be done in the following order (round-robin):
	group_A(3) -> group_A/01 (4) -> group_A/01/0A (7) -> group_A/02(10)
	-> group_A -> .....

	Round-robin by ID. The last visited cgroup is recorded, and the next
	reclaim pass restarts from it.
	(A smarter algorithm could be implemented.)

	No cgroup_mutex or hierarchy_mutex is required.
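
	For reference, a condensed sketch of the ID-based walk, adapted from the
	new mem_cgroup_select_victim() in the diff below. The helper name
	select_victim_sketch() is illustrative only; the css refcounting details
	and the reclaim_param_lock around last_scanned_child are trimmed:

	static struct mem_cgroup *select_victim_sketch(struct mem_cgroup *root_mem)
	{
		struct cgroup_subsys_state *css;
		struct mem_cgroup *ret = NULL;
		int nextid, found;

		while (!ret) {
			rcu_read_lock();
			/* find the css with the smallest ID >= nextid under root_mem */
			nextid = root_mem->last_scanned_child + 1;
			css = css_get_next(&mem_cgroup_subsys, nextid,
					   &root_mem->css, &found);
			if (css && css_tryget(css))
				ret = container_of(css, struct mem_cgroup, css);
			rcu_read_unlock();
			/* remember where to resume; 0 restarts the scan from ID 1 */
			root_mem->last_scanned_child = css ? found : 0;
		}
		return ret;
	}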

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 82 additions and 138 deletions

... ... @@ -95,6 +95,15 @@
95 95 return ret;
96 96 }
97 97  
  98 +static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
  99 +{
  100 + s64 ret;
  101 +
  102 + ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
  103 + ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
  104 + return ret;
  105 +}
  106 +
98 107 /*
99 108 * per-zone information in memory controller.
100 109 */
... ... @@ -154,9 +163,9 @@
154 163  
155 164 /*
156 165 * While reclaiming in a hiearchy, we cache the last child we
157   - * reclaimed from. Protected by hierarchy_mutex
  166 + * reclaimed from.
158 167 */
159   - struct mem_cgroup *last_scanned_child;
  168 + int last_scanned_child;
160 169 /*
161 170 * Should the accounting and control be hierarchical, per subtree?
162 171 */
... ... @@ -629,103 +638,6 @@
629 638 #define mem_cgroup_from_res_counter(counter, member) \
630 639 container_of(counter, struct mem_cgroup, member)
631 640  
632   -/*
633   - * This routine finds the DFS walk successor. This routine should be
634   - * called with hierarchy_mutex held
635   - */
636   -static struct mem_cgroup *
637   -__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
638   -{
639   - struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
640   -
641   - curr_cgroup = curr->css.cgroup;
642   - root_cgroup = root_mem->css.cgroup;
643   -
644   - if (!list_empty(&curr_cgroup->children)) {
645   - /*
646   - * Walk down to children
647   - */
648   - cgroup = list_entry(curr_cgroup->children.next,
649   - struct cgroup, sibling);
650   - curr = mem_cgroup_from_cont(cgroup);
651   - goto done;
652   - }
653   -
654   -visit_parent:
655   - if (curr_cgroup == root_cgroup) {
656   - /* caller handles NULL case */
657   - curr = NULL;
658   - goto done;
659   - }
660   -
661   - /*
662   - * Goto next sibling
663   - */
664   - if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
665   - cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
666   - sibling);
667   - curr = mem_cgroup_from_cont(cgroup);
668   - goto done;
669   - }
670   -
671   - /*
672   - * Go up to next parent and next parent's sibling if need be
673   - */
674   - curr_cgroup = curr_cgroup->parent;
675   - goto visit_parent;
676   -
677   -done:
678   - return curr;
679   -}
680   -
681   -/*
682   - * Visit the first child (need not be the first child as per the ordering
683   - * of the cgroup list, since we track last_scanned_child) of @mem and use
684   - * that to reclaim free pages from.
685   - */
686   -static struct mem_cgroup *
687   -mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
688   -{
689   - struct cgroup *cgroup;
690   - struct mem_cgroup *orig, *next;
691   - bool obsolete;
692   -
693   - /*
694   - * Scan all children under the mem_cgroup mem
695   - */
696   - mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
697   -
698   - orig = root_mem->last_scanned_child;
699   - obsolete = mem_cgroup_is_obsolete(orig);
700   -
701   - if (list_empty(&root_mem->css.cgroup->children)) {
702   - /*
703   - * root_mem might have children before and last_scanned_child
704   - * may point to one of them. We put it later.
705   - */
706   - if (orig)
707   - VM_BUG_ON(!obsolete);
708   - next = NULL;
709   - goto done;
710   - }
711   -
712   - if (!orig || obsolete) {
713   - cgroup = list_first_entry(&root_mem->css.cgroup->children,
714   - struct cgroup, sibling);
715   - next = mem_cgroup_from_cont(cgroup);
716   - } else
717   - next = __mem_cgroup_get_next_node(orig, root_mem);
718   -
719   -done:
720   - if (next)
721   - mem_cgroup_get(next);
722   - root_mem->last_scanned_child = next;
723   - if (orig)
724   - mem_cgroup_put(orig);
725   - mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
726   - return (next) ? next : root_mem;
727   -}
728   -
729 641 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
730 642 {
731 643 if (do_swap_account) {
... ... @@ -755,46 +667,79 @@
755 667 }
756 668  
757 669 /*
758   - * Dance down the hierarchy if needed to reclaim memory. We remember the
759   - * last child we reclaimed from, so that we don't end up penalizing
760   - * one child extensively based on its position in the children list.
  670 + * Visit the first child (need not be the first child as per the ordering
  671 + * of the cgroup list, since we track last_scanned_child) of @mem and use
  672 + * that to reclaim free pages from.
  673 + */
  674 +static struct mem_cgroup *
  675 +mem_cgroup_select_victim(struct mem_cgroup *root_mem)
  676 +{
  677 + struct mem_cgroup *ret = NULL;
  678 + struct cgroup_subsys_state *css;
  679 + int nextid, found;
  680 +
  681 + if (!root_mem->use_hierarchy) {
  682 + css_get(&root_mem->css);
  683 + ret = root_mem;
  684 + }
  685 +
  686 + while (!ret) {
  687 + rcu_read_lock();
  688 + nextid = root_mem->last_scanned_child + 1;
  689 + css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
  690 + &found);
  691 + if (css && css_tryget(css))
  692 + ret = container_of(css, struct mem_cgroup, css);
  693 +
  694 + rcu_read_unlock();
  695 + /* Updates scanning parameter */
  696 + spin_lock(&root_mem->reclaim_param_lock);
  697 + if (!css) {
  698 + /* this means start scan from ID:1 */
  699 + root_mem->last_scanned_child = 0;
  700 + } else
  701 + root_mem->last_scanned_child = found;
  702 + spin_unlock(&root_mem->reclaim_param_lock);
  703 + }
  704 +
  705 + return ret;
  706 +}
  707 +
  708 +/*
  709 + * Scan the hierarchy if needed to reclaim memory. We remember the last child
  710 + * we reclaimed from, so that we don't end up penalizing one child extensively
  711 + * based on its position in the children list.
761 712 *
762 713 * root_mem is the original ancestor that we've been reclaim from.
  714 + *
  715 + * We give up and return to the caller when we visit root_mem twice.
  716 + * (other groups can be removed while we're walking....)
763 717 */
764 718 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
765 719 gfp_t gfp_mask, bool noswap)
766 720 {
767   - struct mem_cgroup *next_mem;
768   - int ret = 0;
  721 + struct mem_cgroup *victim;
  722 + int ret, total = 0;
  723 + int loop = 0;
769 724  
770   - /*
771   - * Reclaim unconditionally and don't check for return value.
772   - * We need to reclaim in the current group and down the tree.
773   - * One might think about checking for children before reclaiming,
774   - * but there might be left over accounting, even after children
775   - * have left.
776   - */
777   - ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
778   - get_swappiness(root_mem));
779   - if (mem_cgroup_check_under_limit(root_mem))
780   - return 1; /* indicate reclaim has succeeded */
781   - if (!root_mem->use_hierarchy)
782   - return ret;
783   -
784   - next_mem = mem_cgroup_get_next_node(root_mem);
785   -
786   - while (next_mem != root_mem) {
787   - if (mem_cgroup_is_obsolete(next_mem)) {
788   - next_mem = mem_cgroup_get_next_node(root_mem);
  725 + while (loop < 2) {
  726 + victim = mem_cgroup_select_victim(root_mem);
  727 + if (victim == root_mem)
  728 + loop++;
  729 + if (!mem_cgroup_local_usage(&victim->stat)) {
  730 + /* this cgroup's local usage == 0 */
  731 + css_put(&victim->css);
789 732 continue;
790 733 }
791   - ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
792   - get_swappiness(next_mem));
  734 + /* we use swappiness of local cgroup */
  735 + ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
  736 + get_swappiness(victim));
  737 + css_put(&victim->css);
  738 + total += ret;
793 739 if (mem_cgroup_check_under_limit(root_mem))
794   - return 1; /* indicate reclaim has succeeded */
795   - next_mem = mem_cgroup_get_next_node(root_mem);
  740 + return 1 + total;
796 741 }
797   - return ret;
  742 + return total;
798 743 }
799 744  
800 745 bool mem_cgroup_oom_called(struct task_struct *task)
... ... @@ -1324,8 +1269,8 @@
1324 1269 res_counter_uncharge(&mem->res, PAGE_SIZE);
1325 1270 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1326 1271 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1327   -
1328 1272 mem_cgroup_charge_statistics(mem, pc, false);
  1273 +
1329 1274 ClearPageCgroupUsed(pc);
1330 1275 /*
1331 1276 * pc->mem_cgroup is not cleared here. It will be accessed when it's
... ... @@ -2178,6 +2123,8 @@
2178 2123 {
2179 2124 int node;
2180 2125  
  2126 + free_css_id(&mem_cgroup_subsys, &mem->css);
  2127 +
2181 2128 for_each_node_state(node, N_POSSIBLE)
2182 2129 free_mem_cgroup_per_zone_info(mem, node);
2183 2130  
... ... @@ -2228,11 +2175,12 @@
2228 2175 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2229 2176 {
2230 2177 struct mem_cgroup *mem, *parent;
  2178 + long error = -ENOMEM;
2231 2179 int node;
2232 2180  
2233 2181 mem = mem_cgroup_alloc();
2234 2182 if (!mem)
2235   - return ERR_PTR(-ENOMEM);
  2183 + return ERR_PTR(error);
2236 2184  
2237 2185 for_each_node_state(node, N_POSSIBLE)
2238 2186 if (alloc_mem_cgroup_per_zone_info(mem, node))
... ... @@ -2260,7 +2208,7 @@
2260 2208 res_counter_init(&mem->res, NULL);
2261 2209 res_counter_init(&mem->memsw, NULL);
2262 2210 }
2263   - mem->last_scanned_child = NULL;
  2211 + mem->last_scanned_child = 0;
2264 2212 spin_lock_init(&mem->reclaim_param_lock);
2265 2213  
2266 2214 if (parent)
... ... @@ -2269,7 +2217,7 @@
2269 2217 return &mem->css;
2270 2218 free_out:
2271 2219 __mem_cgroup_free(mem);
2272   - return ERR_PTR(-ENOMEM);
  2220 + return ERR_PTR(error);
2273 2221 }
2274 2222  
2275 2223 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
... ... @@ -2284,12 +2232,7 @@
2284 2232 struct cgroup *cont)
2285 2233 {
2286 2234 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2287   - struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
2288 2235  
2289   - if (last_scanned_child) {
2290   - VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
2291   - mem_cgroup_put(last_scanned_child);
2292   - }
2293 2236 mem_cgroup_put(mem);
2294 2237 }
2295 2238  
... ... @@ -2328,6 +2271,7 @@
2328 2271 .populate = mem_cgroup_populate,
2329 2272 .attach = mem_cgroup_move_task,
2330 2273 .early_init = 0,
  2274 + .use_id = 1,
2331 2275 };
2332 2276  
2333 2277 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP