Commit 4e41695356fb4e0b153be1440ad027e46e0a7ea2

Authored by Balbir Singh
Committed by Linus Torvalds
1 parent 75822b4495

memory controller: soft limit reclaim on contention

Implement reclaim from groups over their soft limit

Permit reclaim from memory cgroups on contention (via the direct reclaim
path).
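
Concretely, the entry point added here is invoked once per zone from the
reclaim loop in mm/vmscan.c (see the last hunk of the diff below).  A
minimal sketch of that call site, with the surrounding balance_pgdat()
bookkeeping elided:

    int nid = pgdat->node_id;
    int zid = zone_idx(zone);

    /*
     * Push groups over their soft limit back down before applying
     * general pressure to the zone; the return value is ignored for now.
     */
    mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, nid, zid);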

Memory cgroup soft limit reclaim finds the group that exceeds its soft
limit by the largest number of pages, reclaims pages from it, and then
reinserts the cgroup into its correct place in the rbtree.
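
In outline, each per-(node, zone) rbtree is keyed by usage_in_excess, so
the rightmost node is always the worst offender.  The following is a
condensed restatement of the flow in the mm/memcontrol.c hunks below,
using only names introduced by this patch; locking and css refcounting
are trimmed for brevity:

    /* Worst offender: rightmost node of the tree ordered by excess. */
    mz = mem_cgroup_largest_soft_limit_node(mctz);

    /* Reclaim from that group only. */
    reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, gfp_mask,
                                                MEM_CGROUP_RECLAIM_SOFT);

    /* Recompute the excess and reinsert the group at its new position. */
    mz->usage_in_excess = res_counter_soft_limit_excess(&mz->mem->res);
    __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
    if (mz->usage_in_excess)
        __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);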

Add additional checks to mem_cgroup_hierarchical_reclaim() to detect long
loops in case all swap is turned off.  The code has been refactored and
the loop check (loop < 2) has been enhanced for soft limits.  For soft
limits, we try to do more targeted reclaim.  Instead of bailing out after
two loops, the routine now reclaims memory proportional to the amount by
which the soft limit is exceeded.  The proportion has been determined
empirically.
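
As a rough, self-contained model of the new exit condition (not the kernel
code itself: reclaim_some_pages() below is a stand-in for the per-cgroup
shrinkers, and the numbers are illustrative), the loop keeps going until
roughly a quarter of the excess has been reclaimed or a safety cap is hit:

    #include <stdio.h>

    #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100        /* mirrors the patch */

    /* Stand-in for the real shrinker; pretend each pass frees 8 pages. */
    static unsigned long reclaim_some_pages(void)
    {
            return 8;
    }

    int main(void)
    {
            unsigned long excess = 1000;    /* pages over the soft limit */
            unsigned long total = 0;
            int loop = 0;

            /* Stop once ~excess/4 pages are freed or the cap triggers. */
            while (total < (excess >> 2) &&
                   loop <= MEM_CGROUP_MAX_RECLAIM_LOOPS) {
                    total += reclaim_some_pages();
                    loop++;
            }
            printf("reclaimed %lu of %lu excess pages in %d passes\n",
                   total, excess, loop);
            return 0;
    }

In the real routine this check only fires after the whole hierarchy has
been walked at least twice, and a soft limit pass that frees nothing at
all bails out immediately.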

[akpm@linux-foundation.org: build fix]
[kamezawa.hiroyu@jp.fujitsu.com: fix softlimit css refcnt handling]
[nishimura@mxp.nes.nec.co.jp: refcount of the "victim" should be decremented before exiting the loop]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 271 additions and 15 deletions

include/linux/memcontrol.h
... ... @@ -118,6 +118,9 @@
118 118  
119 119 extern bool mem_cgroup_oom_called(struct task_struct *task);
120 120 void mem_cgroup_update_mapped_file_stat(struct page *page, int val);
  121 +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
  122 + gfp_t gfp_mask, int nid,
  123 + int zid);
121 124 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
122 125 struct mem_cgroup;
123 126  
... ... @@ -274,6 +277,13 @@
274 277 static inline void mem_cgroup_update_mapped_file_stat(struct page *page,
275 278 int val)
276 279 {
  280 +}
  281 +
  282 +static inline
  283 +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
  284 + gfp_t gfp_mask, int nid, int zid)
  285 +{
  286 + return 0;
277 287 }
278 288  
279 289 #endif /* CONFIG_CGROUP_MEM_CONT */
include/linux/swap.h
... ... @@ -217,6 +217,11 @@
217 217 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
218 218 gfp_t gfp_mask, bool noswap,
219 219 unsigned int swappiness);
  220 +extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
  221 + gfp_t gfp_mask, bool noswap,
  222 + unsigned int swappiness,
  223 + struct zone *zone,
  224 + int nid);
220 225 extern int __isolate_lru_page(struct page *page, int mode, int file);
221 226 extern unsigned long shrink_all_memory(unsigned long nr_pages);
222 227 extern int vm_swappiness;
mm/memcontrol.c
... ... @@ -139,6 +139,8 @@
139 139 unsigned long long usage_in_excess;/* Set to the value by which */
140 140 /* the soft limit is exceeded*/
141 141 bool on_tree;
  142 + struct mem_cgroup *mem; /* Back pointer, we cannot */
  143 + /* use container_of */
142 144 };
143 145 /* Macro for accessing counter */
144 146 #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
... ... @@ -228,6 +230,13 @@
228 230 struct mem_cgroup_stat stat;
229 231 };
230 232  
  233 +/*
  234 + * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
  235 + * limit reclaim to prevent infinite loops, if they ever occur.
  236 + */
  237 +#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
  238 +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
  239 +
231 240 enum charge_type {
232 241 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
233 242 MEM_CGROUP_CHARGE_TYPE_MAPPED,
... ... @@ -259,6 +268,8 @@
259 268 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
260 269 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
261 270 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
  271 +#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
  272 +#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
262 273  
263 274 static void mem_cgroup_get(struct mem_cgroup *mem);
264 275 static void mem_cgroup_put(struct mem_cgroup *mem);
... ... @@ -299,7 +310,7 @@
299 310 }
300 311  
301 312 static void
302   -mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
  313 +__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
303 314 struct mem_cgroup_per_zone *mz,
304 315 struct mem_cgroup_tree_per_zone *mctz)
305 316 {
... ... @@ -311,7 +322,6 @@
311 322 return;
312 323  
313 324 mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
314   - spin_lock(&mctz->lock);
315 325 while (*p) {
316 326 parent = *p;
317 327 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
... ... @@ -328,6 +338,26 @@
328 338 rb_link_node(&mz->tree_node, parent, p);
329 339 rb_insert_color(&mz->tree_node, &mctz->rb_root);
330 340 mz->on_tree = true;
  341 +}
  342 +
  343 +static void
  344 +__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
  345 + struct mem_cgroup_per_zone *mz,
  346 + struct mem_cgroup_tree_per_zone *mctz)
  347 +{
  348 + if (!mz->on_tree)
  349 + return;
  350 + rb_erase(&mz->tree_node, &mctz->rb_root);
  351 + mz->on_tree = false;
  352 +}
  353 +
  354 +static void
  355 +mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
  356 + struct mem_cgroup_per_zone *mz,
  357 + struct mem_cgroup_tree_per_zone *mctz)
  358 +{
  359 + spin_lock(&mctz->lock);
  360 + __mem_cgroup_insert_exceeded(mem, mz, mctz);
331 361 spin_unlock(&mctz->lock);
332 362 }
333 363  
... ... @@ -337,8 +367,7 @@
337 367 struct mem_cgroup_tree_per_zone *mctz)
338 368 {
339 369 spin_lock(&mctz->lock);
340   - rb_erase(&mz->tree_node, &mctz->rb_root);
341   - mz->on_tree = false;
  370 + __mem_cgroup_remove_exceeded(mem, mz, mctz);
342 371 spin_unlock(&mctz->lock);
343 372 }
344 373  
... ... @@ -408,6 +437,47 @@
408 437 }
409 438 }
410 439  
  440 +static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
  441 +{
  442 + return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
  443 +}
  444 +
  445 +static struct mem_cgroup_per_zone *
  446 +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  447 +{
  448 + struct rb_node *rightmost = NULL;
  449 + struct mem_cgroup_per_zone *mz = NULL;
  450 +
  451 +retry:
  452 + rightmost = rb_last(&mctz->rb_root);
  453 + if (!rightmost)
  454 + goto done; /* Nothing to reclaim from */
  455 +
  456 + mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
  457 + /*
  458 + * Remove the node now but someone else can add it back,
  459 + * we will add it back at the end of reclaim to its correct
  460 + * position in the tree.
  461 + */
  462 + __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
  463 + if (!res_counter_soft_limit_excess(&mz->mem->res) ||
  464 + !css_tryget(&mz->mem->css))
  465 + goto retry;
  466 +done:
  467 + return mz;
  468 +}
  469 +
  470 +static struct mem_cgroup_per_zone *
  471 +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  472 +{
  473 + struct mem_cgroup_per_zone *mz;
  474 +
  475 + spin_lock(&mctz->lock);
  476 + mz = __mem_cgroup_largest_soft_limit_node(mctz);
  477 + spin_unlock(&mctz->lock);
  478 + return mz;
  479 +}
  480 +
411 481 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
412 482 struct page_cgroup *pc,
413 483 bool charge)
... ... @@ -1037,6 +1107,7 @@
1037 1107 * If shrink==true, to avoid freeing too much, this returns immediately.
1038 1108 */
1039 1109 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
  1110 + struct zone *zone,
1040 1111 gfp_t gfp_mask,
1041 1112 unsigned long reclaim_options)
1042 1113 {
... ... @@ -1045,23 +1116,53 @@
1045 1116 int loop = 0;
1046 1117 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1047 1118 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
  1119 + bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
  1120 + unsigned long excess = mem_cgroup_get_excess(root_mem);
1048 1121  
1049 1122 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1050 1123 if (root_mem->memsw_is_minimum)
1051 1124 noswap = true;
1052 1125  
1053   - while (loop < 2) {
  1126 + while (1) {
1054 1127 victim = mem_cgroup_select_victim(root_mem);
1055   - if (victim == root_mem)
  1128 + if (victim == root_mem) {
1056 1129 loop++;
  1130 + if (loop >= 2) {
  1131 + /*
  1132 + * If we have not been able to reclaim
  1133 + * anything, it might be because there are
  1134 + * no reclaimable pages under this hierarchy
  1135 + */
  1136 + if (!check_soft || !total) {
  1137 + css_put(&victim->css);
  1138 + break;
  1139 + }
  1140 + /*
  1141 + * We want to do more targeted reclaim.
  1142 + * excess >> 2 is not too excessive, so we do not
  1143 + * reclaim too much, nor too little, so we do not
  1144 + * keep coming back to reclaim from this cgroup
  1145 + */
  1146 + if (total >= (excess >> 2) ||
  1147 + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
  1148 + css_put(&victim->css);
  1149 + break;
  1150 + }
  1151 + }
  1152 + }
1057 1153 if (!mem_cgroup_local_usage(&victim->stat)) {
1058 1154 /* this cgroup's local usage == 0 */
1059 1155 css_put(&victim->css);
1060 1156 continue;
1061 1157 }
1062 1158 /* we use swappiness of local cgroup */
1063   - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
1064   - get_swappiness(victim));
  1159 + if (check_soft)
  1160 + ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
  1161 + noswap, get_swappiness(victim), zone,
  1162 + zone->zone_pgdat->node_id);
  1163 + else
  1164 + ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
  1165 + noswap, get_swappiness(victim));
1065 1166 css_put(&victim->css);
1066 1167 /*
1067 1168 * At shrinking usage, we can't check we should stop here or
... ... @@ -1071,7 +1172,10 @@
1071 1172 if (shrink)
1072 1173 return ret;
1073 1174 total += ret;
1074   - if (mem_cgroup_check_under_limit(root_mem))
  1175 + if (check_soft) {
  1176 + if (res_counter_check_under_soft_limit(&root_mem->res))
  1177 + return total;
  1178 + } else if (mem_cgroup_check_under_limit(root_mem))
1075 1179 return 1 + total;
1076 1180 }
1077 1181 return total;
... ... @@ -1206,8 +1310,8 @@
1206 1310 if (!(gfp_mask & __GFP_WAIT))
1207 1311 goto nomem;
1208 1312  
1209   - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
1210   - flags);
  1313 + ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
  1314 + gfp_mask, flags);
1211 1315 if (ret)
1212 1316 continue;
1213 1317  
... ... @@ -2018,8 +2122,9 @@
2018 2122 if (!ret)
2019 2123 break;
2020 2124  
2021   - progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
2022   - MEM_CGROUP_RECLAIM_SHRINK);
  2125 + progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
  2126 + GFP_KERNEL,
  2127 + MEM_CGROUP_RECLAIM_SHRINK);
2023 2128 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2024 2129 /* Usage is reduced ? */
2025 2130 if (curusage >= oldusage)
... ... @@ -2071,7 +2176,7 @@
2071 2176 if (!ret)
2072 2177 break;
2073 2178  
2074   - mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
  2179 + mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2075 2180 MEM_CGROUP_RECLAIM_NOSWAP |
2076 2181 MEM_CGROUP_RECLAIM_SHRINK);
2077 2182 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
... ... @@ -2084,6 +2189,97 @@
2084 2189 return ret;
2085 2190 }
2086 2191  
  2192 +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
  2193 + gfp_t gfp_mask, int nid,
  2194 + int zid)
  2195 +{
  2196 + unsigned long nr_reclaimed = 0;
  2197 + struct mem_cgroup_per_zone *mz, *next_mz = NULL;
  2198 + unsigned long reclaimed;
  2199 + int loop = 0;
  2200 + struct mem_cgroup_tree_per_zone *mctz;
  2201 +
  2202 + if (order > 0)
  2203 + return 0;
  2204 +
  2205 + mctz = soft_limit_tree_node_zone(nid, zid);
  2206 + /*
  2207 + * This loop can run a while, especially if mem_cgroups continuously
  2208 + * keep exceeding their soft limit and putting the system under
  2209 + * pressure
  2210 + */
  2211 + do {
  2212 + if (next_mz)
  2213 + mz = next_mz;
  2214 + else
  2215 + mz = mem_cgroup_largest_soft_limit_node(mctz);
  2216 + if (!mz)
  2217 + break;
  2218 +
  2219 + reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
  2220 + gfp_mask,
  2221 + MEM_CGROUP_RECLAIM_SOFT);
  2222 + nr_reclaimed += reclaimed;
  2223 + spin_lock(&mctz->lock);
  2224 +
  2225 + /*
  2226 + * If we failed to reclaim anything from this memory cgroup
  2227 + * it is time to move on to the next cgroup
  2228 + */
  2229 + next_mz = NULL;
  2230 + if (!reclaimed) {
  2231 + do {
  2232 + /*
  2233 + * Loop until we find yet another one.
  2234 + *
  2235 + * By the time we get the soft_limit lock
  2236 + * again, someone might have added the
  2237 + * group back on the RB tree. Iterate to
  2238 + * make sure we get a different mem.
  2239 + * mem_cgroup_largest_soft_limit_node returns
  2240 + * NULL if no other cgroup is present on
  2241 + * the tree
  2242 + */
  2243 + next_mz =
  2244 + __mem_cgroup_largest_soft_limit_node(mctz);
  2245 + if (next_mz == mz) {
  2246 + css_put(&next_mz->mem->css);
  2247 + next_mz = NULL;
  2248 + } else /* next_mz == NULL or other memcg */
  2249 + break;
  2250 + } while (1);
  2251 + }
  2252 + mz->usage_in_excess =
  2253 + res_counter_soft_limit_excess(&mz->mem->res);
  2254 + __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
  2255 + /*
  2256 + * One school of thought says that we should not add
  2257 + * back the node to the tree if reclaim returns 0.
  2258 + * But our reclaim could return 0 simply because, due
  2259 + * to the priority, we are exposing a smaller subset of
  2260 + * memory to reclaim from. Consider this as a longer
  2261 + * term TODO.
  2262 + */
  2263 + if (mz->usage_in_excess)
  2264 + __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
  2265 + spin_unlock(&mctz->lock);
  2266 + css_put(&mz->mem->css);
  2267 + loop++;
  2268 + /*
  2269 + * Could not reclaim anything and there are no more
  2270 + * mem cgroups to try or we seem to be looping without
  2271 + * reclaiming anything.
  2272 + */
  2273 + if (!nr_reclaimed &&
  2274 + (next_mz == NULL ||
  2275 + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
  2276 + break;
  2277 + } while (!nr_reclaimed);
  2278 + if (next_mz)
  2279 + css_put(&next_mz->mem->css);
  2280 + return nr_reclaimed;
  2281 +}
  2282 +
2087 2283 /*
2088 2284 * This routine traverse page_cgroup in given list and drop them all.
2089 2285 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
... ... @@ -2686,6 +2882,8 @@
2686 2882 for_each_lru(l)
2687 2883 INIT_LIST_HEAD(&mz->lists[l]);
2688 2884 mz->usage_in_excess = 0;
  2885 + mz->on_tree = false;
  2886 + mz->mem = mem;
2689 2887 }
2690 2888 return 0;
2691 2889 }
mm/vmscan.c
... ... @@ -1836,11 +1836,45 @@
1836 1836  
1837 1837 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
1838 1838  
  1839 +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
  1840 + gfp_t gfp_mask, bool noswap,
  1841 + unsigned int swappiness,
  1842 + struct zone *zone, int nid)
  1843 +{
  1844 + struct scan_control sc = {
  1845 + .may_writepage = !laptop_mode,
  1846 + .may_unmap = 1,
  1847 + .may_swap = !noswap,
  1848 + .swap_cluster_max = SWAP_CLUSTER_MAX,
  1849 + .swappiness = swappiness,
  1850 + .order = 0,
  1851 + .mem_cgroup = mem,
  1852 + .isolate_pages = mem_cgroup_isolate_pages,
  1853 + };
  1854 + nodemask_t nm = nodemask_of_node(nid);
  1855 +
  1856 + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
  1857 + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
  1858 + sc.nodemask = &nm;
  1859 + sc.nr_reclaimed = 0;
  1860 + sc.nr_scanned = 0;
  1861 + /*
  1862 + * NOTE: Although we can get the priority field, using it
  1863 + * here is not a good idea, since it limits the pages we can scan.
  1864 + * If we don't reclaim here, the shrink_zone from balance_pgdat
  1865 + * will pick up pages from other mem cgroups as well. We hack
  1866 + * the priority and make it zero.
  1867 + */
  1868 + shrink_zone(0, zone, &sc);
  1869 + return sc.nr_reclaimed;
  1870 +}
  1871 +
1839 1872 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1840 1873 gfp_t gfp_mask,
1841 1874 bool noswap,
1842 1875 unsigned int swappiness)
1843 1876 {
  1877 + struct zonelist *zonelist;
1844 1878 struct scan_control sc = {
1845 1879 .may_writepage = !laptop_mode,
1846 1880 .may_unmap = 1,
... ... @@ -1852,7 +1886,6 @@
1852 1886 .isolate_pages = mem_cgroup_isolate_pages,
1853 1887 .nodemask = NULL, /* we don't care the placement */
1854 1888 };
1855   - struct zonelist *zonelist;
1856 1889  
1857 1890 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1858 1891 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
... ... @@ -1974,6 +2007,7 @@
1974 2007 for (i = 0; i <= end_zone; i++) {
1975 2008 struct zone *zone = pgdat->node_zones + i;
1976 2009 int nr_slab;
  2010 + int nid, zid;
1977 2011  
1978 2012 if (!populated_zone(zone))
1979 2013 continue;
... ... @@ -1988,6 +2022,15 @@
1988 2022 temp_priority[i] = priority;
1989 2023 sc.nr_scanned = 0;
1990 2024 note_zone_scanning_priority(zone, priority);
  2025 +
  2026 + nid = pgdat->node_id;
  2027 + zid = zone_idx(zone);
  2028 + /*
  2029 + * Call soft limit reclaim before calling shrink_zone.
  2030 + * For now we ignore the return value
  2031 + */
  2032 + mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
  2033 + nid, zid);
1991 2034 /*
1992 2035 * We put equal pressure on every zone, unless one
1993 2036 * zone has way too many pages free already.