Commit f86296317434b21585e229f6c49a33cb9ebab4d3

Authored by Wu Fengguang
Committed by Linus Torvalds
1 parent 1a8670a29b

mm: do batched scans for mem_cgroup

For mem_cgroup, shrink_zone() may call shrink_list() with nr_to_scan=1, in
which case shrink_list() _still_ calls isolate_pages() with the much
larger SWAP_CLUSTER_MAX.  This effectively scales up the inactive list scan
rate by up to 32 times.

For example, with 16k inactive pages and DEF_PRIORITY=12, (16k >> 12)=4.
So when shrink_zone() expects to scan 4 pages in each of the active and
inactive lists, the active list is scanned by 4 pages, while the inactive
list is in effect (over)scanned by SWAP_CLUSTER_MAX=32 pages.  That could
break the balance between the two lists.

It can further impact the scan of anon active list, due to the anon
active/inactive ratio rebalance logic in balance_pgdat()/shrink_zone():

inactive anon list over scanned => inactive_anon_is_low() == TRUE
                                => shrink_active_list()
                                => active anon list over scanned

So the end result may be

- anon inactive  => over scanned
- anon active    => over scanned (maybe not as much)
- file inactive  => over scanned
- file active    => under scanned (relatively)

The accesses to nr_saved_scan are not lock-protected and so are not 100%
accurate; however, we can tolerate small errors and the resulting slightly
imbalanced scan rates between zones.

Cc: Rik van Riel <riel@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 17 additions and 11 deletions Side-by-side Diff

include/linux/mmzone.h
... ... @@ -273,6 +273,11 @@
273 273 */
274 274 unsigned long recent_rotated[2];
275 275 unsigned long recent_scanned[2];
  276 +
  277 + /*
  278 + * accumulated for batching
  279 + */
  280 + unsigned long nr_saved_scan[NR_LRU_LISTS];
276 281 };
277 282  
278 283 struct zone {
... ... @@ -327,7 +332,6 @@
327 332 spinlock_t lru_lock;
328 333 struct zone_lru {
329 334 struct list_head list;
330   - unsigned long nr_saved_scan; /* accumulated for batching */
331 335 } lru[NR_LRU_LISTS];
332 336  
333 337 struct zone_reclaim_stat reclaim_stat;
... ... @@ -3809,7 +3809,7 @@
3809 3809 zone_pcp_init(zone);
3810 3810 for_each_lru(l) {
3811 3811 INIT_LIST_HEAD(&zone->lru[l].list);
3812   - zone->lru[l].nr_saved_scan = 0;
  3812 + zone->reclaim_stat.nr_saved_scan[l] = 0;
3813 3813 }
3814 3814 zone->reclaim_stat.recent_rotated[0] = 0;
3815 3815 zone->reclaim_stat.recent_rotated[1] = 0;
... ... @@ -1586,6 +1586,7 @@
1586 1586 enum lru_list l;
1587 1587 unsigned long nr_reclaimed = sc->nr_reclaimed;
1588 1588 unsigned long swap_cluster_max = sc->swap_cluster_max;
  1589 + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1589 1590 int noswap = 0;
1590 1591  
1591 1592 /* If we have no swap space, do not bother scanning anon pages. */
... ... @@ -1605,12 +1606,9 @@
1605 1606 scan >>= priority;
1606 1607 scan = (scan * percent[file]) / 100;
1607 1608 }
1608   - if (scanning_global_lru(sc))
1609   - nr[l] = nr_scan_try_batch(scan,
1610   - &zone->lru[l].nr_saved_scan,
1611   - swap_cluster_max);
1612   - else
1613   - nr[l] = scan;
  1609 + nr[l] = nr_scan_try_batch(scan,
  1610 + &reclaim_stat->nr_saved_scan[l],
  1611 + swap_cluster_max);
1614 1612 }
1615 1613  
1616 1614 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
... ... @@ -2220,6 +2218,7 @@
2220 2218 {
2221 2219 struct zone *zone;
2222 2220 unsigned long nr_reclaimed = 0;
  2221 + struct zone_reclaim_stat *reclaim_stat;
2223 2222  
2224 2223 for_each_populated_zone(zone) {
2225 2224 enum lru_list l;
2226 2225  
... ... @@ -2236,11 +2235,14 @@
2236 2235 l == LRU_ACTIVE_FILE))
2237 2236 continue;
2238 2237  
2239   - zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
2240   - if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
  2238 + reclaim_stat = get_reclaim_stat(zone, sc);
  2239 + reclaim_stat->nr_saved_scan[l] +=
  2240 + (lru_pages >> prio) + 1;
  2241 + if (reclaim_stat->nr_saved_scan[l]
  2242 + >= nr_pages || pass > 3) {
2241 2243 unsigned long nr_to_scan;
2242 2244  
2243   - zone->lru[l].nr_saved_scan = 0;
  2245 + reclaim_stat->nr_saved_scan[l] = 0;
2244 2246 nr_to_scan = min(nr_pages, lru_pages);
2245 2247 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2246 2248 sc, prio);