  // SPDX-License-Identifier: GPL-2.0
  /*
   *  linux/mm/vmscan.c
   *
   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   *
   *  Swap reorganised 29.12.95, Stephen Tweedie.
   *  kswapd added: 7.1.96  sct
   *  Removed kswapd_ctl limits, and swap out as many pages as needed
   *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
   *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
   *  Multiqueue VM started 5.8.00, Rik van Riel.
   */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

  struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t	*nodemask;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/*
	 * Cgroups are not reclaimed below their configured memory.low,
	 * unless we threaten to OOM. If any cgroups are skipped due to
	 * memory.low and nothing was reclaimed, go back for memory.low.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate pages for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;
  
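	/*
	 * Reclaim state gathered during a shrink pass (counts of dirty,
	 * writeback, congested pages etc.), used afterwards to decide
	 * whether the node should be marked congested and reclaim throttled.
	 */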
  	struct {
  		unsigned int dirty;
  		unsigned int unqueued_dirty;
  		unsigned int congested;
  		unsigned int writeback;
  		unsigned int immediate;
  		unsigned int file_taken;
  		unsigned int taken;
  	} nr;
  };
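
/*
 * Prefetch the given field of the page that will be visited next while
 * walking an LRU list, to hide the cache-miss latency of the list walk.
 */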
  #ifdef ARCH_HAS_PREFETCH
  #define prefetch_prev_lru_page(_page, _base, _field)			\
  	do {								\
  		if ((_page)->lru.prev != _base) {			\
  			struct page *prev;				\
  									\
  			prev = lru_to_page(&(_page->lru));		\
  			prefetch(&prev->_field);			\
  		}							\
  	} while (0)
  #else
  #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
  #endif
  
  #ifdef ARCH_HAS_PREFETCHW
  #define prefetchw_prev_lru_page(_page, _base, _field)			\
  	do {								\
  		if ((_page)->lru.prev != _base) {			\
  			struct page *prev;				\
  									\
  			prev = lru_to_page(&(_page->lru));		\
  			prefetchw(&prev->_field);			\
  		}							\
  	} while (0)
  #else
  #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
  #endif
  
  /*
   * From 0 .. 100.  Higher means more swappy.
   */
  int vm_swappiness = 60;
  /*
   * The total number of pages which are beyond the high watermark within all
   * zones.
   */
  unsigned long vm_total_pages;
  
  static LIST_HEAD(shrinker_list);
  static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG_KMEM
  
  /*
   * We allow subsystems to populate their shrinker-related
   * LRU lists before register_shrinker_prepared() is called
   * for the shrinker, since we don't want to impose
   * restrictions on their internal registration order.
   * In this case shrink_slab_memcg() may find corresponding
   * bit is set in the shrinkers map.
   *
   * This value is used by the function to detect registering
   * shrinkers and to skip do_shrink_slab() calls for them.
   */
  #define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
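
/*
 * shrinker_idr maps a shrinker id to its struct shrinker; the same ids are
 * used as bit numbers in the per-memcg shrinker bitmaps.
 */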
  static DEFINE_IDR(shrinker_idr);
  static int shrinker_nr_max;
  
  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
  {
  	int id, ret = -ENOMEM;
  
  	down_write(&shrinker_rwsem);
  	/* This may call shrinker, so it must use down_read_trylock() */
	id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (memcg_expand_shrinker_maps(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
		shrinker_nr_max = id + 1;
	}

  	shrinker->id = id;
  	ret = 0;
  unlock:
  	up_write(&shrinker_rwsem);
  	return ret;
  }
  
  static void unregister_memcg_shrinker(struct shrinker *shrinker)
  {
  	int id = shrinker->id;
  
  	BUG_ON(id < 0);
  
  	down_write(&shrinker_rwsem);
  	idr_remove(&shrinker_idr, id);
  	up_write(&shrinker_rwsem);
  }
  #else /* CONFIG_MEMCG_KMEM */
  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
  {
  	return 0;
  }
  
  static void unregister_memcg_shrinker(struct shrinker *shrinker)
  {
  }
  #endif /* CONFIG_MEMCG_KMEM */

#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup;
}
  
  /**
   * sane_reclaim - is the usual dirty throttling mechanism operational?
   * @sc: scan_control in question
   *
   * The normal page dirty throttling mechanism in balance_dirty_pages() is
   * completely broken with the legacy memcg and direct stalling in
   * shrink_page_list() is used for throttling instead, which lacks all the
   * niceties such as fairness, adaptive pausing, bandwidth proportional
   * allocation and configurability.
   *
   * This function tests whether the vmscan currently in progress can assume
   * that the normal dirty throttling mechanism is operational.
   */
  static bool sane_reclaim(struct scan_control *sc)
  {
  	struct mem_cgroup *memcg = sc->target_mem_cgroup;
  
  	if (!memcg)
  		return true;
  #ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
  		return true;
  #endif
  	return false;
  }
  
  static void set_memcg_congestion(pg_data_t *pgdat,
  				struct mem_cgroup *memcg,
  				bool congested)
  {
  	struct mem_cgroup_per_node *mn;
  
  	if (!memcg)
  		return;
  
  	mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
  	WRITE_ONCE(mn->congested, congested);
  }
  
  static bool memcg_congested(pg_data_t *pgdat,
  			struct mem_cgroup *memcg)
  {
  	struct mem_cgroup_per_node *mn;
  
  	mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
  	return READ_ONCE(mn->congested);
  
  }
#else
static bool global_reclaim(struct scan_control *sc)
{
	return true;
}

static bool sane_reclaim(struct scan_control *sc)
{
	return true;
}
  
  static inline void set_memcg_congestion(struct pglist_data *pgdat,
  				struct mem_cgroup *memcg, bool congested)
  {
  }
  
  static inline bool memcg_congested(struct pglist_data *pgdat,
  			struct mem_cgroup *memcg)
  {
  	return false;
  
  }
#endif

  /*
   * This misses isolated pages which are not accounted for to save counters.
   * As the data only determines if reclaim or compaction continues, it is
   * not expected that isolated pages will be a dominating factor.
   */
  unsigned long zone_reclaimable_pages(struct zone *zone)
  {
  	unsigned long nr;
  
  	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
  		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
  	if (get_nr_swap_pages() > 0)
  		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
  			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
  
  	return nr;
  }
  /**
   * lruvec_lru_size -  Returns the number of pages on the given LRU list.
   * @lruvec: lru vector
   * @lru: lru to use
   * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
   */
  unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
	unsigned long lru_size;
	int zid;

	if (!mem_cgroup_disabled())
		lru_size = mem_cgroup_get_lru_size(lruvec, lru);
	else
		lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);

	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
		unsigned long size;

  		if (!managed_zone(zone))
  			continue;
  
  		if (!mem_cgroup_disabled())
  			size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
  		else
  			size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
  				       NR_ZONE_LRU_BASE + lru);
  		lru_size -= min(size, lru_size);
  	}
  
  	return lru_size;
}

/*
 * Add a shrinker callback to be called from the vm.
 */
int prealloc_shrinker(struct shrinker *shrinker)
{
	size_t size = sizeof(*shrinker->nr_deferred);

  	if (shrinker->flags & SHRINKER_NUMA_AWARE)
  		size *= nr_node_ids;
  
  	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
  	if (!shrinker->nr_deferred)
  		return -ENOMEM;
  
  	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
  		if (prealloc_memcg_shrinker(shrinker))
  			goto free_deferred;
  	}
	return 0;
  
  free_deferred:
  	kfree(shrinker->nr_deferred);
  	shrinker->nr_deferred = NULL;
  	return -ENOMEM;
  }
  
  void free_prealloced_shrinker(struct shrinker *shrinker)
  {
  	if (!shrinker->nr_deferred)
  		return;
  
  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
  		unregister_memcg_shrinker(shrinker);
  	kfree(shrinker->nr_deferred);
  	shrinker->nr_deferred = NULL;
  }

void register_shrinker_prepared(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
#ifdef CONFIG_MEMCG_KMEM
	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		idr_replace(&shrinker_idr, shrinker, shrinker->id);
#endif
	up_write(&shrinker_rwsem);
  }
  
  int register_shrinker(struct shrinker *shrinker)
  {
  	int err = prealloc_shrinker(shrinker);
  
  	if (err)
  		return err;
  	register_shrinker_prepared(shrinker);
	return 0;
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
  {
	if (!shrinker->nr_deferred)
		return;

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		unregister_memcg_shrinker(shrinker);

	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);
  
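/* Default number of objects passed to ->scan_objects() in one call. */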
  #define SHRINK_BATCH 128

static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	int nid = shrinkctl->nid;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;
  
  	/*
  	 * copy the current shrinker scan count into a local variable
  	 * and zero it so that other concurrent shrinker invocations
  	 * don't also do this scanning work.
  	 */
  	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
  
  	total_scan = nr;
  	delta = freeable >> priority;
  	delta *= 4;
  	do_div(delta, shrinker->seeks);

	total_scan += delta;
	if (total_scan < 0) {
		pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
		       shrinker->scan_objects, total_scan);
		total_scan = freeable;
  		next_deferred = nr;
  	} else
  		next_deferred = total_scan;
  
  	/*
  	 * We need to avoid excessive windup on filesystem shrinkers
  	 * due to large numbers of GFP_NOFS allocations causing the
  	 * shrinkers to return -1 all the time. This results in a large
  	 * nr being built up so when a shrink that can do some work
  	 * comes along it empties the entire cache due to nr >>>
	 * freeable. This is bad for sustaining a working set in
  	 * memory.
  	 *
  	 * Hence only allow the shrinker to scan the entire cache when
  	 * a large delta change is calculated directly.
  	 */
	if (delta < freeable / 4)
		total_scan = min(total_scan, freeable / 2);
  
  	/*
  	 * Avoid risking looping forever due to too large nr value:
	 * never try to free more than twice the estimated number of
  	 * freeable entries.
  	 */
	if (total_scan > freeable * 2)
		total_scan = freeable * 2;

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

  	/*
  	 * Normally, we should not scan less than batch_size objects in one
  	 * pass to avoid too frequent shrinker calls, but if the slab has less
  	 * than batch_size objects in total and we are really tight on memory,
  	 * we will try to reclaim all available objects, otherwise we can end
  	 * up failing allocations although there are plenty of reclaimable
  	 * objects spread over several slabs with usage less than the
  	 * batch_size.
  	 *
  	 * We detect the "tight on memory" situations by looking at the total
  	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
  	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

  		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
  		total_scan -= shrinkctl->nr_scanned;
  		scanned += shrinkctl->nr_scanned;
  
  		cond_resched();
  	}
  	if (next_deferred >= scanned)
  		next_deferred -= scanned;
  	else
  		next_deferred = 0;
  	/*
  	 * move the unused scan count back into the shrinker in a
  	 * manner that handles concurrent updates. If we exhausted the
  	 * scan, there is no need to do an update.
  	 */
  	if (next_deferred > 0)
  		new_nr = atomic_long_add_return(next_deferred,
  						&shrinker->nr_deferred[nid]);
  	else
  		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);

	trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
	return freed;
}

  #ifdef CONFIG_MEMCG_KMEM
  static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  			struct mem_cgroup *memcg, int priority)
  {
  	struct memcg_shrinker_map *map;
  	unsigned long ret, freed = 0;
  	int i;
  
  	if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
  		return 0;
  
  	if (!down_read_trylock(&shrinker_rwsem))
  		return 0;
  
  	map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
  					true);
  	if (unlikely(!map))
  		goto unlock;
  
  	for_each_set_bit(i, map->map, shrinker_nr_max) {
  		struct shrink_control sc = {
  			.gfp_mask = gfp_mask,
  			.nid = nid,
  			.memcg = memcg,
  		};
  		struct shrinker *shrinker;
  
  		shrinker = idr_find(&shrinker_idr, i);
		if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
			if (!shrinker)
				clear_bit(i, map->map);
			continue;
		}

  		ret = do_shrink_slab(&sc, shrinker, priority);
  		if (ret == SHRINK_EMPTY) {
  			clear_bit(i, map->map);
  			/*
  			 * After the shrinker reported that it had no objects to
  			 * free, but before we cleared the corresponding bit in
  			 * the memcg shrinker map, a new object might have been
  			 * added. To make sure, we have the bit set in this
  			 * case, we invoke the shrinker one more time and reset
  			 * the bit if it reports that it is not empty anymore.
  			 * The memory barrier here pairs with the barrier in
  			 * memcg_set_shrinker_bit():
  			 *
  			 * list_lru_add()     shrink_slab_memcg()
  			 *   list_add_tail()    clear_bit()
  			 *   <MB>               <MB>
  			 *   set_bit()          do_shrink_slab()
  			 */
  			smp_mb__after_atomic();
  			ret = do_shrink_slab(&sc, shrinker, priority);
  			if (ret == SHRINK_EMPTY)
  				ret = 0;
  			else
  				memcg_set_shrinker_bit(memcg, nid, i);
  		}
  		freed += ret;
  
  		if (rwsem_is_contended(&shrinker_rwsem)) {
  			freed = freed ? : 1;
  			break;
  		}
  	}
  unlock:
  	up_read(&shrinker_rwsem);
  	return freed;
  }
  #else /* CONFIG_MEMCG_KMEM */
  static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  			struct mem_cgroup *memcg, int priority)
  {
  	return 0;
  }
  #endif /* CONFIG_MEMCG_KMEM */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
				 struct mem_cgroup *memcg,
				 int priority)
  {
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	if (!mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	if (!down_read_trylock(&shrinker_rwsem))
		goto out;

	list_for_each_entry(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

  		ret = do_shrink_slab(&sc, shrinker, priority);
  		if (ret == SHRINK_EMPTY)
  			ret = 0;
  		freed += ret;
  		/*
		 * Bail out if someone wants to register a new shrinker to
		 * prevent the registration from being stalled for long periods
  		 * by parallel ongoing shrinking.
  		 */
  		if (rwsem_is_contended(&shrinker_rwsem)) {
  			freed = freed ? : 1;
  			break;
  		}
	}

	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return freed;
  }
  void drop_slab_node(int nid)
  {
  	unsigned long freed;
  
  	do {
  		struct mem_cgroup *memcg = NULL;
  
  		freed = 0;
		memcg = mem_cgroup_iter(NULL, NULL, NULL);
		do {
			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
  		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
  	} while (freed > 10);
  }
  
  void drop_slab(void)
  {
  	int nid;
  
  	for_each_online_node(nid)
  		drop_slab_node(nid);
  }

static inline int is_page_cache_freeable(struct page *page)
{
  	/*
  	 * A freeable page cache page is referenced only by the caller
  	 * that isolated the page, the page cache radix tree and
  	 * optional buffer heads at page->private.
  	 */
  	int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
  		HPAGE_PMD_NR : 1;
  	return page_count(page) - page_has_private(page) == 1 + radix_pins;
}

static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!inode_write_congested(inode))
		return 1;
	if (inode_to_bdi(inode) == current->backing_dev_info)
  		return 1;
  	return 0;
  }
  
  /*
   * We detected a synchronous write error writing a page out.  Probably
   * -ENOSPC.  We need to propagate that into the address_space for a subsequent
   * fsync(), msync() or close().
   *
   * The tricky part is that after writepage we cannot touch the mapping: nothing
   * prevents it from being freed up.  But we have a ref on the page and once
   * that page is locked, the mapping is pinned.
   *
   * We're allowed to run sleeping lock_page() here because we know the caller has
   * __GFP_FS.
   */
  static void handle_write_error(struct address_space *mapping,
  				struct page *page, int error)
  {
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
  	unlock_page(page);
  }
  /* possible outcome of pageout() */
  typedef enum {
  	/* failed to write page out, page is locked */
  	PAGE_KEEP,
  	/* move page to the active list, page is locked */
  	PAGE_ACTIVATE,
  	/* page has been sent to the disk successfully, page is unlocked */
  	PAGE_SUCCESS,
  	/* page is clean and locked */
  	PAGE_CLEAN,
  } pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
  			 struct scan_control *sc)
  {
  	/*
  	 * If the page is dirty, only perform writeback if that write
  	 * will be non-blocking.  To prevent this allocation from being
  	 * stalled by pagecache activity.  But note that there may be
  	 * stalls if we need to run get_block().  We could test
  	 * PagePrivate for that.
  	 *
  	 * If this process is currently in __generic_file_write_iter() against
  	 * this page's queue, we can perform writeback even if that
  	 * will block.
  	 *
  	 * If the page is swapcache, write it back even if that would
  	 * block, for some throttling. This happens by accident, because
  	 * swap_backing_dev_info is bust: it doesn't reflect the
  	 * congestion state of the swapdevs.  Easy to fix, if needed.
  	 */
  	if (!is_page_cache_freeable(page))
  		return PAGE_KEEP;
  	if (!mapping) {
  		/*
  		 * Some data journaling orphaned pages can have
  		 * page->mapping == NULL while being dirty with clean buffers.
  		 */
		if (page_has_private(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				pr_info("%s: orphaned page\n", __func__);
  				return PAGE_CLEAN;
  			}
  		}
  		return PAGE_KEEP;
  	}
  	if (mapping->a_ops->writepage == NULL)
  		return PAGE_ACTIVATE;
  	if (!may_write_to_inode(mapping->host, sc))
  		return PAGE_KEEP;
  
  	if (clear_page_dirty_for_io(page)) {
  		int res;
  		struct writeback_control wbc = {
  			.sync_mode = WB_SYNC_NONE,
  			.nr_to_write = SWAP_CLUSTER_MAX,
  			.range_start = 0,
  			.range_end = LLONG_MAX,
  			.for_reclaim = 1,
  		};
  
  		SetPageReclaim(page);
  		res = mapping->a_ops->writepage(page, &wbc);
  		if (res < 0)
  			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
  			ClearPageReclaim(page);
  			return PAGE_ACTIVATE;
  		}

  		if (!PageWriteback(page)) {
  			/* synchronous write or broken a_ops? */
  			ClearPageReclaim(page);
  		}
		trace_mm_vmscan_writepage(page);
		inc_node_page_state(page, NR_VMSCAN_WRITE);
  		return PAGE_SUCCESS;
  	}
  
  	return PAGE_CLEAN;
  }

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page,
			    bool reclaimed)
  {
	unsigned long flags;
	int refcount;

	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	xa_lock_irqsave(&mapping->i_pages, flags);
  	/*
  	 * The non racy check for a busy page.
  	 *
  	 * Must be careful with the order of the tests. When someone has
  	 * a ref to the page, it may be possible that they dirty it then
  	 * drop the reference. So if PageDirty is tested before page_count
  	 * here, then the following race may occur:
  	 *
  	 * get_user_pages(&page);
  	 * [user mapping goes away]
  	 * write_to(page);
  	 *				!PageDirty(page)    [good]
  	 * SetPageDirty(page);
  	 * put_page(page);
  	 *				!page_count(page)   [good, discard it]
  	 *
  	 * [oops, our write_to data is lost]
  	 *
  	 * Reversing the order of the tests ensures such a situation cannot
  	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_refcount.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under the i_pages lock, then this ordering is not required.
  	 */
  	if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
  		refcount = 1 + HPAGE_PMD_NR;
  	else
  		refcount = 2;
  	if (!page_ref_freeze(page, refcount))
		goto cannot_free;
	/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
	if (unlikely(PageDirty(page))) {
		page_ref_unfreeze(page, refcount);
		goto cannot_free;
	}
  
  	if (PageSwapCache(page)) {
  		swp_entry_t swap = { .val = page_private(page) };
		mem_cgroup_swapout(page, swap);
		__delete_from_swap_cache(page);
		xa_unlock_irqrestore(&mapping->i_pages, flags);
		put_swap_page(page, swap);
	} else {
		void (*freepage)(struct page *);
		void *shadow = NULL;

		freepage = mapping->a_ops->freepage;
  		/*
  		 * Remember a shadow entry for reclaimed file cache in
  		 * order to detect refaults, thus thrashing, later on.
  		 *
  		 * But don't store shadows in an address space that is
		 * already exiting.  This is not just an optimization,
  		 * inode reclaim needs to empty out the radix tree or
  		 * the nodes are lost.  Don't plant shadows behind its
  		 * back.
  		 *
  		 * We also don't store shadows for DAX mappings because the
  		 * only page cache pages found in these are zero pages
  		 * covering holes, and because we don't want to mix DAX
  		 * exceptional entries and shadow exceptional entries in the
		 * same address_space.
		 */
		if (reclaimed && page_is_file_cache(page) &&
		    !mapping_exiting(mapping) && !dax_mapping(mapping))
			shadow = workingset_eviction(mapping, page);
		__delete_from_page_cache(page, shadow);
		xa_unlock_irqrestore(&mapping->i_pages, flags);

		if (freepage != NULL)
			freepage(page);
  	}
  	return 1;
  
  cannot_free:
	xa_unlock_irqrestore(&mapping->i_pages, flags);
  	return 0;
  }
  /*
   * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
   * someone else has a ref on the page, abort and return 0.  If it was
   * successfully detached, return 1.  Assumes the caller has a single ref on
   * this page.
   */
  int remove_mapping(struct address_space *mapping, struct page *page)
  {
  	if (__remove_mapping(mapping, page, false)) {
  		/*
  		 * Unfreezing the refcount with 1 rather than 2 effectively
  		 * drops the pagecache ref for us without requiring another
  		 * atomic operation.
  		 */
  		page_ref_unfreeze(page, 1);
  		return 1;
  	}
  	return 0;
  }
  /**
   * putback_lru_page - put previously isolated page onto appropriate LRU list
   * @page: page to be put back to appropriate lru list
   *
   * Add previously isolated @page to appropriate LRU list.
   * Page may still be unevictable for other reasons.
   *
   * lru_lock must not be held, interrupts must be enabled.
   */
  void putback_lru_page(struct page *page)
  {
  	lru_cache_add(page);
  	put_page(page);		/* drop ref from isolate */
  }
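
/* What page_check_references() tells shrink_page_list() to do with a page. */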
  enum page_references {
  	PAGEREF_RECLAIM,
  	PAGEREF_RECLAIM_CLEAN,
  	PAGEREF_KEEP,
  	PAGEREF_ACTIVATE,
  };
  
  static enum page_references page_check_references(struct page *page,
  						  struct scan_control *sc)
  {
	int referenced_ptes, referenced_page;
	unsigned long vm_flags;

	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
					  &vm_flags);
	referenced_page = TestClearPageReferenced(page);

  	/*
  	 * Mlock lost the isolation race with us.  Let try_to_unmap()
  	 * move the page to the unevictable list.
  	 */
  	if (vm_flags & VM_LOCKED)
  		return PAGEREF_RECLAIM;

	if (referenced_ptes) {
		if (PageSwapBacked(page))
  			return PAGEREF_ACTIVATE;
  		/*
  		 * All mapped pages start out with page table
  		 * references from the instantiating fault, so we need
  		 * to look twice if a mapped file page is used more
  		 * than once.
  		 *
  		 * Mark it and spare it for another trip around the
  		 * inactive list.  Another page table reference will
  		 * lead to its activation.
  		 *
  		 * Note: the mark is set for activated pages as well
  		 * so that recently deactivated but used pages are
  		 * quickly recovered.
  		 */
  		SetPageReferenced(page);
		if (referenced_page || referenced_ptes > 1)
			return PAGEREF_ACTIVATE;

  		/*
  		 * Activate file-backed executable pages after first usage.
  		 */
  		if (vm_flags & VM_EXEC)
  			return PAGEREF_ACTIVATE;
  		return PAGEREF_KEEP;
  	}

	/* Reclaim if clean, defer dirty pages to writeback */
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;

	return PAGEREF_RECLAIM;
  }
  /* Check if a page is dirty or under writeback */
  static void page_check_dirty_writeback(struct page *page,
  				       bool *dirty, bool *writeback)
  {
  	struct address_space *mapping;
  	/*
  	 * Anonymous pages are not handled by flushers and must be written
  	 * from reclaim context. Do not stall reclaim based on them
  	 */
  	if (!page_is_file_cache(page) ||
  	    (PageAnon(page) && !PageSwapBacked(page))) {
  		*dirty = false;
  		*writeback = false;
  		return;
  	}
  
  	/* By default assume that the page flags are accurate */
  	*dirty = PageDirty(page);
  	*writeback = PageWriteback(page);
  
  	/* Verify dirty/writeback state if the filesystem supports it */
  	if (!page_has_private(page))
  		return;
  
  	mapping = page_mapping(page);
  	if (mapping && mapping->a_ops->is_dirty_writeback)
  		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
				      struct pglist_data *pgdat,
				      struct scan_control *sc,
				      enum ttu_flags ttu_flags,
				      struct reclaim_stat *stat,
				      bool force_reclaim)
  {
  	LIST_HEAD(ret_pages);
	LIST_HEAD(free_pages);
	int pgactivate = 0;
  	unsigned nr_unqueued_dirty = 0;
  	unsigned nr_dirty = 0;
  	unsigned nr_congested = 0;
  	unsigned nr_reclaimed = 0;
  	unsigned nr_writeback = 0;
  	unsigned nr_immediate = 0;
	unsigned nr_ref_keep = 0;
	unsigned nr_unmap_fail = 0;

	cond_resched();

  	while (!list_empty(page_list)) {
  		struct address_space *mapping;
  		struct page *page;
  		int may_enter_fs;
		enum page_references references = PAGEREF_RECLAIM_CLEAN;
		bool dirty, writeback;
  
  		cond_resched();
  
  		page = lru_to_page(page_list);
  		list_del(&page->lru);
		if (!trylock_page(page))
			goto keep;
		VM_BUG_ON_PAGE(PageActive(page), page);

		sc->nr_scanned++;

  		if (unlikely(!page_evictable(page)))
			goto activate_locked;

		if (!sc->may_unmap && page_mapped(page))
			goto keep_locked;

		/* Double the slab pressure for mapped and swapcache pages */
  		if ((page_mapped(page) || PageSwapCache(page)) &&
  		    !(PageAnon(page) && !PageSwapBacked(page)))
			sc->nr_scanned++;

		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		/*
  		 * The number of dirty pages determines if a node is marked
  		 * reclaim_congested which affects wait_iff_congested. kswapd
  		 * will stall and start writing pages if the tail of the LRU
  		 * is all dirty unqueued pages.
  		 */
  		page_check_dirty_writeback(page, &dirty, &writeback);
  		if (dirty || writeback)
  			nr_dirty++;
  
  		if (dirty && !writeback)
  			nr_unqueued_dirty++;
  		/*
  		 * Treat this page as congested if the underlying BDI is or if
  		 * pages are cycling through the LRU so quickly that the
  		 * pages marked for immediate reclaim are making it to the
  		 * end of the LRU a second time.
  		 */
		mapping = page_mapping(page);
		if (((dirty || writeback) && mapping &&
		     inode_write_congested(mapping->host)) ||
		    (writeback && PageReclaim(page)))
  			nr_congested++;
  
  		/*
  		 * If a page at the tail of the LRU is under writeback, there
  		 * are three cases to consider.
  		 *
  		 * 1) If reclaim is encountering an excessive number of pages
  		 *    under writeback and this page is both under writeback and
  		 *    PageReclaim then it indicates that pages are being queued
  		 *    for IO but are being recycled through the LRU before the
  		 *    IO can complete. Waiting on the page itself risks an
  		 *    indefinite stall if it is impossible to writeback the
  		 *    page due to IO error or disconnected storage so instead
		 *    note that the LRU is being scanned too quickly and the
		 *    caller can stall after page list has been processed.
		 *
		 * 2) Global or new memcg reclaim encounters a page that is
		 *    not marked for immediate reclaim, or the caller does not
		 *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
		 *    not to fs). In this case mark the page for immediate
		 *    reclaim and continue scanning.
		 *
		 *    Require may_enter_fs because we would wait on fs, which
		 *    may not have submitted IO yet. And the loop driver might
		 *    enter reclaim, and deadlock if it waits on a page for
		 *    which it is needed to do the write (loop masks off
		 *    __GFP_IO|__GFP_FS for this reason); but more thought
		 *    would probably show more reasons.
		 *
		 * 3) Legacy memcg encounters a page that is already marked
		 *    PageReclaim. memcg does not have any dirty pages
		 *    throttling so we could easily OOM just because too many
		 *    pages are in writeback and there is nothing else to
		 *    reclaim. Wait for the writeback to complete.
		 *
		 * In cases 1) and 2) we activate the pages to get them out of
		 * the way while we continue scanning for clean pages on the
		 * inactive list and refilling from the active list. The
		 * observation here is that waiting for disk writes is more
		 * expensive than potentially causing reloads down the line.
		 * Since they're marked for immediate reclaim, they won't put
		 * memory pressure on the cache working set any longer than it
		 * takes to write them to disk.
		 */
  		if (PageWriteback(page)) {
283aba9f9   Mel Gorman   mm: vmscan: block...
1158
1159
1160
  			/* Case 1 above */
  			if (current_is_kswapd() &&
  			    PageReclaim(page) &&
599d0c954   Mel Gorman   mm, vmscan: move ...
1161
  			    test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
b1a6f21e3   Mel Gorman   mm: vmscan: stall...
1162
  				nr_immediate++;
c55e8d035   Johannes Weiner   mm: vmscan: move ...
1163
  				goto activate_locked;
283aba9f9   Mel Gorman   mm: vmscan: block...
1164
1165
  
  			/* Case 2 above */
97c9341f7   Tejun Heo   mm: vmscan: disab...
1166
  			} else if (sane_reclaim(sc) ||
ecf5fc6e9   Michal Hocko   mm, vmscan: Do no...
1167
  			    !PageReclaim(page) || !may_enter_fs) {
c3b94f44f   Hugh Dickins   memcg: further pr...
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
  				/*
  				 * This is slightly racy - end_page_writeback()
  				 * might have just cleared PageReclaim, then
  				 * setting PageReclaim here ends up being interpreted
  				 * as PageReadahead - but that does not matter
  				 * enough to care.  What we do want is for this
  				 * page to have PageReclaim set next time memcg
  				 * reclaim reaches the tests above, so it will
  				 * then wait_on_page_writeback() to avoid OOM;
  				 * and it's also appropriate in global reclaim.
  				 */
  				SetPageReclaim(page);
e62e384e9   Michal Hocko   memcg: prevent OO...
1180
  				nr_writeback++;
c55e8d035   Johannes Weiner   mm: vmscan: move ...
1181
  				goto activate_locked;
283aba9f9   Mel Gorman   mm: vmscan: block...
1182
1183
1184
  
  			/* Case 3 above */
  			} else {
7fadc8202   Hugh Dickins   mm, vmscan: unloc...
1185
  				unlock_page(page);
283aba9f9   Mel Gorman   mm: vmscan: block...
1186
  				wait_on_page_writeback(page);
7fadc8202   Hugh Dickins   mm, vmscan: unloc...
1187
1188
1189
  				/* then go back and try same page again */
  				list_add_tail(&page->lru, page_list);
  				continue;
e62e384e9   Michal Hocko   memcg: prevent OO...
1190
  			}
c661b078f   Andy Whitcroft   synchronous lumpy...
1191
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1192

02c6de8d7   Minchan Kim   mm: cma: discard ...
1193
1194
  		if (!force_reclaim)
  			references = page_check_references(page, sc);
dfc8d636c   Johannes Weiner   vmscan: factor ou...
1195
1196
  		switch (references) {
  		case PAGEREF_ACTIVATE:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1197
  			goto activate_locked;
645747462   Johannes Weiner   vmscan: detect ma...
1198
  		case PAGEREF_KEEP:
5bccd1665   Michal Hocko   mm, vmscan: enhan...
1199
  			nr_ref_keep++;
645747462   Johannes Weiner   vmscan: detect ma...
1200
  			goto keep_locked;
dfc8d636c   Johannes Weiner   vmscan: factor ou...
1201
1202
1203
1204
  		case PAGEREF_RECLAIM:
  		case PAGEREF_RECLAIM_CLEAN:
  			; /* try to reclaim the page below */
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1205

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1206
1207
1208
  		/*
  		 * Anonymous process memory has backing store?
  		 * Try to allocate it some swap space here.
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1209
  		 * Lazyfree page could be freed directly
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1210
  		 */
bd4c82c22   Huang Ying   mm, THP, swap: de...
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
  		if (PageAnon(page) && PageSwapBacked(page)) {
  			if (!PageSwapCache(page)) {
  				if (!(sc->gfp_mask & __GFP_IO))
  					goto keep_locked;
  				if (PageTransHuge(page)) {
  					/* cannot split THP, skip it */
  					if (!can_split_huge_page(page, NULL))
  						goto activate_locked;
  					/*
  					 * Split pages without a PMD map right
  					 * away. Chances are some or all of the
  					 * tail pages can be freed without IO.
  					 */
  					if (!compound_mapcount(page) &&
  					    split_huge_page_to_list(page,
  								    page_list))
  						goto activate_locked;
  				}
  				if (!add_to_swap(page)) {
  					if (!PageTransHuge(page))
  						goto activate_locked;
  					/* Fallback to swap normal pages */
  					if (split_huge_page_to_list(page,
  								    page_list))
  						goto activate_locked;
fe490cc0f   Huang Ying   mm, THP, swap: ad...
1236
1237
1238
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  					count_vm_event(THP_SWPOUT_FALLBACK);
  #endif
bd4c82c22   Huang Ying   mm, THP, swap: de...
1239
1240
1241
  					if (!add_to_swap(page))
  						goto activate_locked;
  				}
0f0746589   Minchan Kim   mm, THP, swap: mo...
1242

bd4c82c22   Huang Ying   mm, THP, swap: de...
1243
  				may_enter_fs = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1244

bd4c82c22   Huang Ying   mm, THP, swap: de...
1245
1246
1247
  				/* Adding to swap updated mapping */
  				mapping = page_mapping(page);
  			}
7751b2da6   Kirill A. Shutemov   vmscan: split fil...
1248
1249
1250
1251
  		} else if (unlikely(PageTransHuge(page))) {
  			/* Split file THP */
  			if (split_huge_page_to_list(page, page_list))
  				goto keep_locked;
e2be15f6c   Mel Gorman   mm: vmscan: stall...
1252
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1253
1254
1255
1256
1257
  
  		/*
  		 * The page is mapped into the page tables of one or more
  		 * processes. Try to unmap it here.
  		 */
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1258
  		if (page_mapped(page)) {
bd4c82c22   Huang Ying   mm, THP, swap: de...
1259
1260
1261
1262
1263
  			enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
  
  			if (unlikely(PageTransHuge(page)))
  				flags |= TTU_SPLIT_HUGE_PMD;
  			if (!try_to_unmap(page, flags)) {
5bccd1665   Michal Hocko   mm, vmscan: enhan...
1264
  				nr_unmap_fail++;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1265
  				goto activate_locked;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1266
1267
1268
1269
  			}
  		}
  
  		if (PageDirty(page)) {
ee72886d8   Mel Gorman   mm: vmscan: do no...
1270
  			/*
4eda48235   Johannes Weiner   mm: vmscan: only ...
1271
1272
1273
1274
1275
1276
1277
1278
  			 * Only kswapd can writeback filesystem pages
  			 * to avoid risk of stack overflow. But avoid
  			 * injecting inefficient single-page IO into
  			 * flusher writeback as much as possible: only
  			 * write pages when we've encountered many
  			 * dirty pages, and when we've already scanned
  			 * the rest of the LRU for clean pages and see
  			 * the same dirty pages again (PageReclaim).
ee72886d8   Mel Gorman   mm: vmscan: do no...
1279
  			 */
f84f6e2b0   Mel Gorman   mm: vmscan: do no...
1280
  			if (page_is_file_cache(page) &&
4eda48235   Johannes Weiner   mm: vmscan: only ...
1281
1282
  			    (!current_is_kswapd() || !PageReclaim(page) ||
  			     !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
49ea7eb65   Mel Gorman   mm: vmscan: immed...
1283
1284
1285
1286
1287
1288
  				/*
  				 * Immediately reclaim when written back.
  				 * Similar in principle to deactivate_page()
  				 * except we already have the page isolated
  				 * and know it's dirty
  				 */
c4a25635b   Mel Gorman   mm: move vmscan w...
1289
  				inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
49ea7eb65   Mel Gorman   mm: vmscan: immed...
1290
  				SetPageReclaim(page);
c55e8d035   Johannes Weiner   mm: vmscan: move ...
1291
  				goto activate_locked;
ee72886d8   Mel Gorman   mm: vmscan: do no...
1292
  			}
dfc8d636c   Johannes Weiner   vmscan: factor ou...
1293
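  			/*
  			 * A dirty page that was recently referenced
  			 * (PAGEREF_RECLAIM_CLEAN) is kept, as is any page we
  			 * are not allowed to write out here (!may_enter_fs or
  			 * !sc->may_writepage).
  			 */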
  			if (references == PAGEREF_RECLAIM_CLEAN)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1294
  				goto keep_locked;
4dd4b9202   Andrew Morton   revert "kswapd sh...
1295
  			if (!may_enter_fs)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1296
  				goto keep_locked;
52a8363ea   Christoph Lameter   [PATCH] mm: impro...
1297
  			if (!sc->may_writepage)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1298
  				goto keep_locked;
d950c9477   Mel Gorman   mm: defer flush o...
1299
1300
1301
1302
1303
1304
  			/*
  			 * Page is dirty. Flush the TLB if a writable entry
  			 * potentially exists to avoid CPU writes after IO
  			 * starts and then write it out here.
  			 */
  			try_to_unmap_flush_dirty();
7d3579e8e   KOSAKI Motohiro   vmscan: narrow th...
1305
  			switch (pageout(page, mapping, sc)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1306
1307
1308
1309
1310
  			case PAGE_KEEP:
  				goto keep_locked;
  			case PAGE_ACTIVATE:
  				goto activate_locked;
  			case PAGE_SUCCESS:
7d3579e8e   KOSAKI Motohiro   vmscan: narrow th...
1311
  				if (PageWriteback(page))
41ac1999c   Mel Gorman   mm: vmscan: do no...
1312
  					goto keep;
7d3579e8e   KOSAKI Motohiro   vmscan: narrow th...
1313
  				if (PageDirty(page))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1314
  					goto keep;
7d3579e8e   KOSAKI Motohiro   vmscan: narrow th...
1315

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1316
1317
1318
1319
  				/*
  				 * A synchronous write - probably a ramdisk.  Go
  				 * ahead and try to reclaim the page.
  				 */
529ae9aaa   Nick Piggin   mm: rename page t...
1320
  				if (!trylock_page(page))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
  					goto keep;
  				if (PageDirty(page) || PageWriteback(page))
  					goto keep_locked;
  				mapping = page_mapping(page);
  			case PAGE_CLEAN:
  				; /* try to free the page below */
  			}
  		}
  
  		/*
  		 * If the page has buffers, try to free the buffer mappings
  		 * associated with this page. If we succeed we try to free
  		 * the page as well.
  		 *
  		 * We do this even if the page is PageDirty().
  		 * try_to_release_page() does not perform I/O, but it is
  		 * possible for a page to have PageDirty set, but it is actually
  		 * clean (all its buffers are clean).  This happens if the
  		 * buffers were written out directly, with submit_bh(). ext3
894bc3104   Lee Schermerhorn   Unevictable LRU I...
1340
  		 * will do this, as well as the blockdev mapping.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
  		 * try_to_release_page() will discover that cleanness and will
  		 * drop the buffers and mark the page clean - it can be freed.
  		 *
  		 * Rarely, pages can have buffers and no ->mapping.  These are
  		 * the pages which were not successfully invalidated in
  		 * truncate_complete_page().  We try to drop those buffers here,
  		 * and if that works and the page is no longer mapped into
  		 * process address space (page_count == 1), it can be freed.
  		 * Otherwise, leave the page on the LRU so it is swappable.
  		 */
266cf658e   David Howells   FS-Cache: Recruit...
1351
  		if (page_has_private(page)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1352
1353
  			if (!try_to_release_page(page, sc->gfp_mask))
  				goto activate_locked;
e286781d5   Nick Piggin   mm: speculative p...
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
  			if (!mapping && page_count(page) == 1) {
  				unlock_page(page);
  				if (put_page_testzero(page))
  					goto free_it;
  				else {
  					/*
  					 * rare race with speculative reference.
  					 * the speculative reference will free
  					 * this page shortly, so we may
  					 * increment nr_reclaimed here (and
  					 * leave it off the LRU).
  					 */
  					nr_reclaimed++;
  					continue;
  				}
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1370
  		}
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1371
1372
1373
1374
1375
1376
1377
1378
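  		/*
  		 * Clean lazy-free (MADV_FREE) anon pages can be freed without
  		 * swap: freeze the refcount as __remove_mapping() would, and
  		 * back out if the page has been re-dirtied in the meantime.
  		 */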
  		if (PageAnon(page) && !PageSwapBacked(page)) {
  			/* follow __remove_mapping for reference */
  			if (!page_ref_freeze(page, 1))
  				goto keep_locked;
  			if (PageDirty(page)) {
  				page_ref_unfreeze(page, 1);
  				goto keep_locked;
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1379

802a3a92a   Shaohua Li   mm: reclaim MADV_...
1380
  			count_vm_event(PGLAZYFREED);
2262185c5   Roman Gushchin   mm: per-cgroup me...
1381
  			count_memcg_page_event(page, PGLAZYFREED);
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1382
1383
  		} else if (!mapping || !__remove_mapping(mapping, page, true))
  			goto keep_locked;
a978d6f52   Nick Piggin   mm: unlockless re...
1384
1385
1386
1387
1388
1389
1390
  		/*
  		 * At this point, we have no other references and there is
  		 * no way to pick any more up (removed from LRU, removed
  		 * from pagecache). Can use non-atomic bitops now (and
  		 * we obviously don't have to worry about waking up a process
  		 * waiting on the page lock, because there are no references.
  		 */
48c935ad8   Kirill A. Shutemov   page-flags: defin...
1391
  		__ClearPageLocked(page);
e286781d5   Nick Piggin   mm: speculative p...
1392
  free_it:
05ff51376   Andrew Morton   [PATCH] vmscan re...
1393
  		nr_reclaimed++;
abe4c3b50   Mel Gorman   vmscan: set up pa...
1394
1395
1396
1397
1398
  
  		/*
  		 * Is there a need to periodically free the page list? It would
  		 * appear not, as the counts should be low.
  		 */
bd4c82c22   Huang Ying   mm, THP, swap: de...
1399
1400
1401
1402
1403
  		if (unlikely(PageTransHuge(page))) {
  			mem_cgroup_uncharge(page);
  			(*get_compound_page_dtor(page))(page);
  		} else
  			list_add(&page->lru, &free_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1404
1405
1406
  		continue;
  
  activate_locked:
68a22394c   Rik van Riel   vmscan: free swap...
1407
  		/* Not a candidate for swapping, so reclaim swap space. */
ad6b67041   Minchan Kim   mm: remove SWAP_M...
1408
1409
  		if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
  						PageMlocked(page)))
a2c43eed8   Hugh Dickins   mm: try_to_free_s...
1410
  			try_to_free_swap(page);
309381fea   Sasha Levin   mm: dump page whe...
1411
  		VM_BUG_ON_PAGE(PageActive(page), page);
ad6b67041   Minchan Kim   mm: remove SWAP_M...
1412
1413
1414
  		if (!PageMlocked(page)) {
  			SetPageActive(page);
  			pgactivate++;
2262185c5   Roman Gushchin   mm: per-cgroup me...
1415
  			count_memcg_page_event(page, PGACTIVATE);
ad6b67041   Minchan Kim   mm: remove SWAP_M...
1416
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1417
1418
1419
1420
  keep_locked:
  		unlock_page(page);
  keep:
  		list_add(&page->lru, &ret_pages);
309381fea   Sasha Levin   mm: dump page whe...
1421
  		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1422
  	}
abe4c3b50   Mel Gorman   vmscan: set up pa...
1423

747db954c   Johannes Weiner   mm: memcontrol: u...
1424
  	mem_cgroup_uncharge_list(&free_pages);
72b252aed   Mel Gorman   mm: send one IPI ...
1425
  	try_to_unmap_flush();
2d4894b5d   Mel Gorman   mm: remove cold p...
1426
  	free_unref_page_list(&free_pages);
abe4c3b50   Mel Gorman   vmscan: set up pa...
1427

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1428
  	list_splice(&ret_pages, page_list);
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
1429
  	count_vm_events(PGACTIVATE, pgactivate);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
1430

3c710c1ad   Michal Hocko   mm, vmscan: extra...
1431
1432
1433
1434
1435
1436
  	if (stat) {
  		stat->nr_dirty = nr_dirty;
  		stat->nr_congested = nr_congested;
  		stat->nr_unqueued_dirty = nr_unqueued_dirty;
  		stat->nr_writeback = nr_writeback;
  		stat->nr_immediate = nr_immediate;
5bccd1665   Michal Hocko   mm, vmscan: enhan...
1437
1438
1439
  		stat->nr_activate = pgactivate;
  		stat->nr_ref_keep = nr_ref_keep;
  		stat->nr_unmap_fail = nr_unmap_fail;
3c710c1ad   Michal Hocko   mm, vmscan: extra...
1440
  	}
05ff51376   Andrew Morton   [PATCH] vmscan re...
1441
  	return nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1442
  }
02c6de8d7   Minchan Kim   mm: cma: discard ...
1443
1444
1445
1446
1447
1448
1449
1450
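  /*
   * Reclaim clean file pages from an already isolated page list: the clean,
   * file-backed, non-__PageMovable pages are pushed through shrink_page_list()
   * (e.g. so contiguous allocations can discard them instead of migrating
   * them), the survivors are spliced back onto @page_list, NR_ISOLATED_FILE
   * is lowered by the number reclaimed, and that count is returned.
   */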
  unsigned long reclaim_clean_pages_from_list(struct zone *zone,
  					    struct list_head *page_list)
  {
  	struct scan_control sc = {
  		.gfp_mask = GFP_KERNEL,
  		.priority = DEF_PRIORITY,
  		.may_unmap = 1,
  	};
3c710c1ad   Michal Hocko   mm, vmscan: extra...
1451
  	unsigned long ret;
02c6de8d7   Minchan Kim   mm: cma: discard ...
1452
1453
1454
1455
  	struct page *page, *next;
  	LIST_HEAD(clean_pages);
  
  	list_for_each_entry_safe(page, next, page_list, lru) {
117aad1e9   Rafael Aquini   mm: avoid reinser...
1456
  		if (page_is_file_cache(page) && !PageDirty(page) &&
b1123ea6d   Minchan Kim   mm: balloon: use ...
1457
  		    !__PageMovable(page)) {
02c6de8d7   Minchan Kim   mm: cma: discard ...
1458
1459
1460
1461
  			ClearPageActive(page);
  			list_move(&page->lru, &clean_pages);
  		}
  	}
599d0c954   Mel Gorman   mm, vmscan: move ...
1462
  	ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
a128ca71f   Shaohua Li   mm: delete unnece...
1463
  			TTU_IGNORE_ACCESS, NULL, true);
02c6de8d7   Minchan Kim   mm: cma: discard ...
1464
  	list_splice(&clean_pages, page_list);
599d0c954   Mel Gorman   mm, vmscan: move ...
1465
  	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
02c6de8d7   Minchan Kim   mm: cma: discard ...
1466
1467
  	return ret;
  }
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
  /*
   * Attempt to remove the specified page from its LRU.  Only take this page
   * if it is of the appropriate PageActive status.  Pages which are being
   * freed elsewhere are also ignored.
   *
   * page:	page to consider
   * mode:	one of the LRU isolation modes defined above
   *
   * returns 0 on success, -ve errno on failure.
   */
f3fd4a619   Konstantin Khlebnikov   mm: remove lru ty...
1478
  int __isolate_lru_page(struct page *page, isolate_mode_t mode)
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1479
1480
1481
1482
1483
1484
  {
  	int ret = -EINVAL;
  
  	/* Only take pages on the LRU. */
  	if (!PageLRU(page))
  		return ret;
e46a28790   Minchan Kim   CMA: migrate mloc...
1485
1486
  	/* Compaction should not handle unevictable pages but CMA can do so */
  	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
894bc3104   Lee Schermerhorn   Unevictable LRU I...
1487
  		return ret;
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1488
  	ret = -EBUSY;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1489

c82449352   Mel Gorman   mm: compaction: m...
1490
1491
1492
1493
1494
  	/*
  	 * To minimise LRU disruption, the caller can indicate that it only
  	 * wants to isolate pages it will be able to operate on without
  	 * blocking - clean pages for the most part.
  	 *
c82449352   Mel Gorman   mm: compaction: m...
1495
1496
1497
  	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
  	 * that can be migrated without blocking.
  	 */
1276ad68e   Johannes Weiner   mm: vmscan: scan ...
1498
  	if (mode & ISOLATE_ASYNC_MIGRATE) {
c82449352   Mel Gorman   mm: compaction: m...
1499
1500
1501
1502
1503
1504
  		/* All the caller can do on PageWriteback is block */
  		if (PageWriteback(page))
  			return ret;
  
  		if (PageDirty(page)) {
  			struct address_space *mapping;
69d763fc6   Mel Gorman   mm: pin address_s...
1505
  			bool migrate_dirty;
c82449352   Mel Gorman   mm: compaction: m...
1506

c82449352   Mel Gorman   mm: compaction: m...
1507
1508
1509
  			/*
  			 * Only pages without mappings or that have a
  			 * ->migratepage callback can be migrated
69d763fc6   Mel Gorman   mm: pin address_s...
1510
1511
1512
1513
1514
  			 * without blocking. However, we can be racing with
  			 * truncation so it's necessary to lock the page
  			 * to stabilise the mapping as truncation holds
  			 * the page lock until after the page is removed
  			 * from the page cache.
c82449352   Mel Gorman   mm: compaction: m...
1515
  			 */
69d763fc6   Mel Gorman   mm: pin address_s...
1516
1517
  			if (!trylock_page(page))
  				return ret;
c82449352   Mel Gorman   mm: compaction: m...
1518
  			mapping = page_mapping(page);
145e1a71e   Hugh Dickins   mm: fix the NULL ...
1519
  			migrate_dirty = !mapping || mapping->a_ops->migratepage;
69d763fc6   Mel Gorman   mm: pin address_s...
1520
1521
  			unlock_page(page);
  			if (!migrate_dirty)
c82449352   Mel Gorman   mm: compaction: m...
1522
1523
1524
  				return ret;
  		}
  	}
39deaf858   Minchan Kim   mm: compaction: m...
1525

f80c06736   Minchan Kim   mm: zone_reclaim:...
1526
1527
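  	/* Callers that asked for unmapped pages only will not take mapped ones */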
  	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
  		return ret;
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
  	if (likely(get_page_unless_zero(page))) {
  		/*
  		 * Be careful not to clear PageLRU until after we're
  		 * sure the page is not being freed elsewhere -- the
  		 * page release code relies on it.
  		 */
  		ClearPageLRU(page);
  		ret = 0;
  	}
  
  	return ret;
  }
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1540
1541
1542
1543
1544
1545
  
  /*
   * Update LRU sizes after isolating pages. The LRU size updates must
   * be complete before mem_cgroup_update_lru_size due to a sanity check.
   */
  static __always_inline void update_lru_sizes(struct lruvec *lruvec,
b4536f0c8   Michal Hocko   mm, memcg: fix th...
1546
  			enum lru_list lru, unsigned long *nr_zone_taken)
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1547
  {
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1548
  	int zid;
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1549
1550
1551
1552
1553
  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  		if (!nr_zone_taken[zid])
  			continue;
  
  		__update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1554
  #ifdef CONFIG_MEMCG
b4536f0c8   Michal Hocko   mm, memcg: fix th...
1555
  		mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1556
  #endif
b4536f0c8   Michal Hocko   mm, memcg: fix th...
1557
  	}
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1558
  }
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
1559
  /*
a52633d8e   Mel Gorman   mm, vmscan: move ...
1560
   * zone_lru_lock is heavily contended.  Some of the functions that
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1561
1562
1563
1564
1565
1566
1567
1568
   * shrink the lists perform better by taking out a batch of pages
   * and working on them outside the LRU lock.
   *
   * For pagecache intensive workloads, this function is the hottest
   * spot in the kernel (apart from copy_*_user functions).
   *
   * Appropriate locks must be held before calling this function.
   *
791b48b64   Minchan Kim   mm: vmscan: scan ...
1569
   * @nr_to_scan:	The number of eligible pages to look through on the list.
5dc35979e   Konstantin Khlebnikov   mm/vmscan: push l...
1570
   * @lruvec:	The LRU vector to pull pages from.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1571
   * @dst:	The temp list to put pages on to.
f626012db   Hugh Dickins   mm: remove isolat...
1572
   * @nr_scanned:	The number of pages that were scanned.
fe2c2a106   Rik van Riel   vmscan: reclaim a...
1573
   * @sc:		The scan_control struct for this reclaim session
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1574
   * @mode:	One of the LRU isolation modes
3cb994517   Konstantin Khlebnikov   mm: push lru inde...
1575
   * @lru:	LRU list id for isolating
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1576
1577
1578
   *
   * returns how many pages were moved onto *@dst.
   */
69e05944a   Andrew Morton   [PATCH] vmscan: u...
1579
  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
5dc35979e   Konstantin Khlebnikov   mm/vmscan: push l...
1580
  		struct lruvec *lruvec, struct list_head *dst,
fe2c2a106   Rik van Riel   vmscan: reclaim a...
1581
  		unsigned long *nr_scanned, struct scan_control *sc,
3cb994517   Konstantin Khlebnikov   mm: push lru inde...
1582
  		isolate_mode_t mode, enum lru_list lru)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1583
  {
75b00af77   Hugh Dickins   mm: trivial clean...
1584
  	struct list_head *src = &lruvec->lists[lru];
69e05944a   Andrew Morton   [PATCH] vmscan: u...
1585
  	unsigned long nr_taken = 0;
599d0c954   Mel Gorman   mm, vmscan: move ...
1586
  	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
7cc30fcfd   Mel Gorman   mm: vmstat: accou...
1587
  	unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
3db65812d   Johannes Weiner   Revert "mm, vmsca...
1588
  	unsigned long skipped = 0;
791b48b64   Minchan Kim   mm: vmscan: scan ...
1589
  	unsigned long scan, total_scan, nr_pages;
b2e18757f   Mel Gorman   mm, vmscan: begin...
1590
  	LIST_HEAD(pages_skipped);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1591

791b48b64   Minchan Kim   mm: vmscan: scan ...
1592
1593
1594
1595
  	scan = 0;
  	for (total_scan = 0;
  	     scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
  	     total_scan++) {
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1596
  		struct page *page;
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1597

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1598
1599
  		page = lru_to_page(src);
  		prefetchw_prev_lru_page(page, src, flags);
309381fea   Sasha Levin   mm: dump page whe...
1600
  		VM_BUG_ON_PAGE(!PageLRU(page), page);
8d438f96d   Nick Piggin   [PATCH] mm: PageL...
1601

b2e18757f   Mel Gorman   mm, vmscan: begin...
1602
1603
  		if (page_zonenum(page) > sc->reclaim_idx) {
  			list_move(&page->lru, &pages_skipped);
7cc30fcfd   Mel Gorman   mm: vmstat: accou...
1604
  			nr_skipped[page_zonenum(page)]++;
b2e18757f   Mel Gorman   mm, vmscan: begin...
1605
1606
  			continue;
  		}
791b48b64   Minchan Kim   mm: vmscan: scan ...
1607
1608
1609
1610
1611
1612
1613
  		/*
  		 * Do not count skipped pages because that makes the function
  		 * return with no isolated pages if the LRU mostly contains
  		 * ineligible pages.  This causes the VM to not reclaim any
  		 * pages, triggering a premature OOM.
  		 */
  		scan++;
f3fd4a619   Konstantin Khlebnikov   mm: remove lru ty...
1614
  		switch (__isolate_lru_page(page, mode)) {
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1615
  		case 0:
599d0c954   Mel Gorman   mm, vmscan: move ...
1616
1617
1618
  			nr_pages = hpage_nr_pages(page);
  			nr_taken += nr_pages;
  			nr_zone_taken[page_zonenum(page)] += nr_pages;
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1619
  			list_move(&page->lru, dst);
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1620
1621
1622
1623
1624
1625
  			break;
  
  		case -EBUSY:
  			/* else it is being freed elsewhere */
  			list_move(&page->lru, src);
  			continue;
46453a6e1   Nick Piggin   [PATCH] mm: never...
1626

5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1627
1628
1629
  		default:
  			BUG();
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1630
  	}
b2e18757f   Mel Gorman   mm, vmscan: begin...
1631
1632
1633
1634
1635
1636
1637
  	/*
  	 * Splice any skipped pages to the start of the LRU list. Note that
  	 * this disrupts the LRU order when reclaiming for lower zones but
  	 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
  	 * scanning would soon rescan the same pages to skip and put the
  	 * system at risk of premature OOM.
  	 */
7cc30fcfd   Mel Gorman   mm: vmstat: accou...
1638
1639
  	if (!list_empty(&pages_skipped)) {
  		int zid;
3db65812d   Johannes Weiner   Revert "mm, vmsca...
1640
  		list_splice(&pages_skipped, src);
7cc30fcfd   Mel Gorman   mm: vmstat: accou...
1641
1642
1643
1644
1645
  		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  			if (!nr_skipped[zid])
  				continue;
  
  			__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1265e3a69   Michal Hocko   mm, vmscan: show ...
1646
  			skipped += nr_skipped[zid];
7cc30fcfd   Mel Gorman   mm: vmstat: accou...
1647
1648
  		}
  	}
791b48b64   Minchan Kim   mm: vmscan: scan ...
1649
  	*nr_scanned = total_scan;
1265e3a69   Michal Hocko   mm, vmscan: show ...
1650
  	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
791b48b64   Minchan Kim   mm: vmscan: scan ...
1651
  				    total_scan, skipped, nr_taken, mode, lru);
b4536f0c8   Michal Hocko   mm, memcg: fix th...
1652
  	update_lru_sizes(lruvec, lru, nr_zone_taken);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1653
1654
  	return nr_taken;
  }
62695a84e   Nick Piggin   vmscan: move isol...
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
  /**
   * isolate_lru_page - tries to isolate a page from its LRU list
   * @page: page to isolate from its LRU list
   *
   * Isolates a @page from an LRU list, clears PageLRU and adjusts the
   * vmstat statistic corresponding to whatever LRU list the page was on.
   *
   * Returns 0 if the page was removed from an LRU list.
   * Returns -EBUSY if the page was not on an LRU list.
   *
   * The returned page will have PageLRU() cleared.  If it was found on
894bc3104   Lee Schermerhorn   Unevictable LRU I...
1666
1667
1668
   * the active list, it will have PageActive set.  If it was found on
   * the unevictable list, it will have the PageUnevictable bit set. That flag
   * may need to be cleared by the caller before letting the page go.
62695a84e   Nick Piggin   vmscan: move isol...
1669
1670
1671
1672
1673
   *
   * The vmstat statistic corresponding to the list on which the page was
   * found will be decremented.
   *
   * Restrictions:
a5d09bed7   Mike Rapoport   mm: docs: add bla...
1674
   *
62695a84e   Nick Piggin   vmscan: move isol...
1675
1676
1677
1678
1679
1680
1681
1682
1683
   * (1) Must be called with an elevated refcount on the page. This is a
   *     fundamental difference from isolate_lru_pages (which is called
   *     without a stable reference).
   * (2) the lru_lock must not be held.
   * (3) interrupts must be enabled.
   */
  int isolate_lru_page(struct page *page)
  {
  	int ret = -EBUSY;
309381fea   Sasha Levin   mm: dump page whe...
1684
  	VM_BUG_ON_PAGE(!page_count(page), page);
cf2a82ee4   Kirill A. Shutemov   mm: downgrade VM_...
1685
  	WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
0c917313a   Konstantin Khlebnikov   mm: strictly requ...
1686

62695a84e   Nick Piggin   vmscan: move isol...
1687
1688
  	if (PageLRU(page)) {
  		struct zone *zone = page_zone(page);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1689
  		struct lruvec *lruvec;
62695a84e   Nick Piggin   vmscan: move isol...
1690

a52633d8e   Mel Gorman   mm, vmscan: move ...
1691
  		spin_lock_irq(zone_lru_lock(zone));
599d0c954   Mel Gorman   mm, vmscan: move ...
1692
  		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
0c917313a   Konstantin Khlebnikov   mm: strictly requ...
1693
  		if (PageLRU(page)) {
894bc3104   Lee Schermerhorn   Unevictable LRU I...
1694
  			int lru = page_lru(page);
0c917313a   Konstantin Khlebnikov   mm: strictly requ...
1695
  			get_page(page);
62695a84e   Nick Piggin   vmscan: move isol...
1696
  			ClearPageLRU(page);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1697
1698
  			del_page_from_lru_list(page, lruvec, lru);
  			ret = 0;
62695a84e   Nick Piggin   vmscan: move isol...
1699
  		}
a52633d8e   Mel Gorman   mm, vmscan: move ...
1700
  		spin_unlock_irq(zone_lru_lock(zone));
62695a84e   Nick Piggin   vmscan: move isol...
1701
1702
1703
  	}
  	return ret;
  }
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1704
  /*
d37dd5dcb   Fengguang Wu   vmscan: comment t...
1705
1706
1707
1708
1709
   * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
   * then get rescheduled. When there is a massive number of tasks doing page
   * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
   * the LRU list will go small and be scanned faster than necessary, leading to
   * unnecessary swapping, thrashing and OOM.
35cd78156   Rik van Riel   vmscan: throttle ...
1710
   */
599d0c954   Mel Gorman   mm, vmscan: move ...
1711
  static int too_many_isolated(struct pglist_data *pgdat, int file,
35cd78156   Rik van Riel   vmscan: throttle ...
1712
1713
1714
1715
1716
1717
  		struct scan_control *sc)
  {
  	unsigned long inactive, isolated;
  
  	if (current_is_kswapd())
  		return 0;
97c9341f7   Tejun Heo   mm: vmscan: disab...
1718
  	if (!sane_reclaim(sc))
35cd78156   Rik van Riel   vmscan: throttle ...
1719
1720
1721
  		return 0;
  
  	if (file) {
599d0c954   Mel Gorman   mm, vmscan: move ...
1722
1723
  		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
  		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
35cd78156   Rik van Riel   vmscan: throttle ...
1724
  	} else {
599d0c954   Mel Gorman   mm, vmscan: move ...
1725
1726
  		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
  		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
35cd78156   Rik van Riel   vmscan: throttle ...
1727
  	}
3cf23841b   Fengguang Wu   mm/vmscan.c: avoi...
1728
1729
1730
1731
1732
  	/*
  	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
  	 * won't get blocked by normal direct-reclaimers, forming a circular
  	 * deadlock.
  	 */
d0164adc8   Mel Gorman   mm, page_alloc: d...
1733
  	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
3cf23841b   Fengguang Wu   mm/vmscan.c: avoi...
1734
  		inactive >>= 3;
35cd78156   Rik van Riel   vmscan: throttle ...
1735
1736
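  	/*
  	 * Too many pages are considered isolated once they outnumber what is
  	 * left on the inactive list (or an eighth of it for the GFP_NOIO /
  	 * GFP_NOFS callers handled above).
  	 */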
  	return isolated > inactive;
  }
666356297   Mel Gorman   vmscan: set up pa...
1737
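  /*
   * Return the pages that shrink_page_list() could not free to the LRU lists
   * they belong on.  Unevictable pages go back via putback_lru_page(), and
   * pages whose last reference is dropped here are freed: compound pages
   * right away, the rest via the input list which the caller frees.
   */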
  static noinline_for_stack void
75b00af77   Hugh Dickins   mm: trivial clean...
1738
  putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
666356297   Mel Gorman   vmscan: set up pa...
1739
  {
27ac81d85   Konstantin Khlebnikov   mm/vmscan: push l...
1740
  	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
599d0c954   Mel Gorman   mm, vmscan: move ...
1741
  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
3f79768f2   Hugh Dickins   mm: rearrange put...
1742
  	LIST_HEAD(pages_to_free);
666356297   Mel Gorman   vmscan: set up pa...
1743

666356297   Mel Gorman   vmscan: set up pa...
1744
1745
1746
  	/*
  	 * Put back any unfreeable pages.
  	 */
666356297   Mel Gorman   vmscan: set up pa...
1747
  	while (!list_empty(page_list)) {
3f79768f2   Hugh Dickins   mm: rearrange put...
1748
  		struct page *page = lru_to_page(page_list);
666356297   Mel Gorman   vmscan: set up pa...
1749
  		int lru;
3f79768f2   Hugh Dickins   mm: rearrange put...
1750

309381fea   Sasha Levin   mm: dump page whe...
1751
  		VM_BUG_ON_PAGE(PageLRU(page), page);
666356297   Mel Gorman   vmscan: set up pa...
1752
  		list_del(&page->lru);
39b5f29ac   Hugh Dickins   mm: remove vma ar...
1753
  		if (unlikely(!page_evictable(page))) {
599d0c954   Mel Gorman   mm, vmscan: move ...
1754
  			spin_unlock_irq(&pgdat->lru_lock);
666356297   Mel Gorman   vmscan: set up pa...
1755
  			putback_lru_page(page);
599d0c954   Mel Gorman   mm, vmscan: move ...
1756
  			spin_lock_irq(&pgdat->lru_lock);
666356297   Mel Gorman   vmscan: set up pa...
1757
1758
  			continue;
  		}
fa9add641   Hugh Dickins   mm/memcg: apply a...
1759

599d0c954   Mel Gorman   mm, vmscan: move ...
1760
  		lruvec = mem_cgroup_page_lruvec(page, pgdat);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1761

7a608572a   Linus Torvalds   Revert "mm: batch...
1762
  		SetPageLRU(page);
666356297   Mel Gorman   vmscan: set up pa...
1763
  		lru = page_lru(page);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1764
  		add_page_to_lru_list(page, lruvec, lru);
666356297   Mel Gorman   vmscan: set up pa...
1765
1766
  		if (is_active_lru(lru)) {
  			int file = is_file_lru(lru);
9992af102   Rik van Riel   thp: scale nr_rot...
1767
1768
  			int numpages = hpage_nr_pages(page);
  			reclaim_stat->recent_rotated[file] += numpages;
666356297   Mel Gorman   vmscan: set up pa...
1769
  		}
2bcf88796   Hugh Dickins   mm: take pagevecs...
1770
1771
1772
  		if (put_page_testzero(page)) {
  			__ClearPageLRU(page);
  			__ClearPageActive(page);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1773
  			del_page_from_lru_list(page, lruvec, lru);
2bcf88796   Hugh Dickins   mm: take pagevecs...
1774
1775
  
  			if (unlikely(PageCompound(page))) {
599d0c954   Mel Gorman   mm, vmscan: move ...
1776
  				spin_unlock_irq(&pgdat->lru_lock);
747db954c   Johannes Weiner   mm: memcontrol: u...
1777
  				mem_cgroup_uncharge(page);
2bcf88796   Hugh Dickins   mm: take pagevecs...
1778
  				(*get_compound_page_dtor(page))(page);
599d0c954   Mel Gorman   mm, vmscan: move ...
1779
  				spin_lock_irq(&pgdat->lru_lock);
2bcf88796   Hugh Dickins   mm: take pagevecs...
1780
1781
  			} else
  				list_add(&page->lru, &pages_to_free);
666356297   Mel Gorman   vmscan: set up pa...
1782
1783
  		}
  	}
666356297   Mel Gorman   vmscan: set up pa...
1784

3f79768f2   Hugh Dickins   mm: rearrange put...
1785
1786
1787
1788
  	/*
  	 * To save our caller's stack, now use input list for pages to free.
  	 */
  	list_splice(&pages_to_free, page_list);
666356297   Mel Gorman   vmscan: set up pa...
1789
1790
1791
  }
  
  /*
399ba0b95   NeilBrown   mm/vmscan.c: avoi...
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
   * If a kernel thread (such as nfsd for loop-back mounts) services
   * a backing device by writing to the page cache, it sets PF_LESS_THROTTLE.
   * In that case we should only throttle if the backing device it is
   * writing to is congested.  In other cases it is safe to throttle.
   */
  static int current_may_throttle(void)
  {
  	return !(current->flags & PF_LESS_THROTTLE) ||
  		current->backing_dev_info == NULL ||
  		bdi_write_congested(current->backing_dev_info);
  }
  
  /*
b2e18757f   Mel Gorman   mm, vmscan: begin...
1805
   * shrink_inactive_list() is a helper for shrink_node().  It returns the number
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
1806
   * of reclaimed pages
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1807
   */
666356297   Mel Gorman   vmscan: set up pa...
1808
  static noinline_for_stack unsigned long
1a93be0e7   Konstantin Khlebnikov   mm/vmscan: push l...
1809
  shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
1810
  		     struct scan_control *sc, enum lru_list lru)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1811
1812
  {
  	LIST_HEAD(page_list);
e247dbce5   KOSAKI Motohiro   vmscan: simplify ...
1813
  	unsigned long nr_scanned;
05ff51376   Andrew Morton   [PATCH] vmscan re...
1814
  	unsigned long nr_reclaimed = 0;
e247dbce5   KOSAKI Motohiro   vmscan: simplify ...
1815
  	unsigned long nr_taken;
3c710c1ad   Michal Hocko   mm, vmscan: extra...
1816
  	struct reclaim_stat stat = {};
f3fd4a619   Konstantin Khlebnikov   mm: remove lru ty...
1817
  	isolate_mode_t isolate_mode = 0;
3cb994517   Konstantin Khlebnikov   mm: push lru inde...
1818
  	int file = is_file_lru(lru);
599d0c954   Mel Gorman   mm, vmscan: move ...
1819
  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1a93be0e7   Konstantin Khlebnikov   mm/vmscan: push l...
1820
  	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
db73ee0d4   Michal Hocko   mm, vmscan: do no...
1821
  	bool stalled = false;
78dc583d3   KOSAKI Motohiro   vmscan: low order...
1822

599d0c954   Mel Gorman   mm, vmscan: move ...
1823
  	while (unlikely(too_many_isolated(pgdat, file, sc))) {
db73ee0d4   Michal Hocko   mm, vmscan: do no...
1824
1825
1826
1827
1828
1829
  		if (stalled)
  			return 0;
  
  		/* wait a bit for the reclaimer. */
  		msleep(100);
  		stalled = true;
35cd78156   Rik van Riel   vmscan: throttle ...
1830
1831
1832
1833
1834
  
  		/* We are about to die and free our memory. Return now. */
  		if (fatal_signal_pending(current))
  			return SWAP_CLUSTER_MAX;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1835
  	lru_add_drain();
f80c06736   Minchan Kim   mm: zone_reclaim:...
1836
1837
  
  	if (!sc->may_unmap)
613172891   Hillf Danton   mm/vmscan.c: clea...
1838
  		isolate_mode |= ISOLATE_UNMAPPED;
f80c06736   Minchan Kim   mm: zone_reclaim:...
1839

599d0c954   Mel Gorman   mm, vmscan: move ...
1840
  	spin_lock_irq(&pgdat->lru_lock);
b35ea17b7   KOSAKI Motohiro   mm: shrink_inacti...
1841

5dc35979e   Konstantin Khlebnikov   mm/vmscan: push l...
1842
1843
  	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
  				     &nr_scanned, sc, isolate_mode, lru);
95d918fc0   Konstantin Khlebnikov   mm/vmscan: remove...
1844

599d0c954   Mel Gorman   mm, vmscan: move ...
1845
  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
9d5e6a9f2   Hugh Dickins   mm: update_lru_si...
1846
  	reclaim_stat->recent_scanned[file] += nr_taken;
95d918fc0   Konstantin Khlebnikov   mm/vmscan: remove...
1847

2262185c5   Roman Gushchin   mm: per-cgroup me...
1848
1849
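  	/*
  	 * Account the scan to kswapd or direct reclaim: the global counters
  	 * only for global reclaim, the memcg counters in either case.
  	 */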
  	if (current_is_kswapd()) {
  		if (global_reclaim(sc))
599d0c954   Mel Gorman   mm, vmscan: move ...
1850
  			__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
2262185c5   Roman Gushchin   mm: per-cgroup me...
1851
1852
1853
1854
  		count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
  				   nr_scanned);
  	} else {
  		if (global_reclaim(sc))
599d0c954   Mel Gorman   mm, vmscan: move ...
1855
  			__count_vm_events(PGSCAN_DIRECT, nr_scanned);
2262185c5   Roman Gushchin   mm: per-cgroup me...
1856
1857
  		count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
  				   nr_scanned);
e247dbce5   KOSAKI Motohiro   vmscan: simplify ...
1858
  	}
599d0c954   Mel Gorman   mm, vmscan: move ...
1859
  	spin_unlock_irq(&pgdat->lru_lock);
b35ea17b7   KOSAKI Motohiro   mm: shrink_inacti...
1860

d563c0501   Hillf Danton   vmscan: handle is...
1861
  	if (nr_taken == 0)
666356297   Mel Gorman   vmscan: set up pa...
1862
  		return 0;
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1863

a128ca71f   Shaohua Li   mm: delete unnece...
1864
  	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
3c710c1ad   Michal Hocko   mm, vmscan: extra...
1865
  				&stat, false);
c661b078f   Andy Whitcroft   synchronous lumpy...
1866

599d0c954   Mel Gorman   mm, vmscan: move ...
1867
  	spin_lock_irq(&pgdat->lru_lock);
3f79768f2   Hugh Dickins   mm: rearrange put...
1868

2262185c5   Roman Gushchin   mm: per-cgroup me...
1869
1870
  	if (current_is_kswapd()) {
  		if (global_reclaim(sc))
599d0c954   Mel Gorman   mm, vmscan: move ...
1871
  			__count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
2262185c5   Roman Gushchin   mm: per-cgroup me...
1872
1873
1874
1875
  		count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
  				   nr_reclaimed);
  	} else {
  		if (global_reclaim(sc))
599d0c954   Mel Gorman   mm, vmscan: move ...
1876
  			__count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
2262185c5   Roman Gushchin   mm: per-cgroup me...
1877
1878
  		count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
  				   nr_reclaimed);
904249aa6   Ying Han   mm: fix up the vm...
1879
  	}
a74609faf   Nick Piggin   [PATCH] mm: page_...
1880

27ac81d85   Konstantin Khlebnikov   mm/vmscan: push l...
1881
  	putback_inactive_pages(lruvec, &page_list);
3f79768f2   Hugh Dickins   mm: rearrange put...
1882

599d0c954   Mel Gorman   mm, vmscan: move ...
1883
  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
3f79768f2   Hugh Dickins   mm: rearrange put...
1884

599d0c954   Mel Gorman   mm, vmscan: move ...
1885
  	spin_unlock_irq(&pgdat->lru_lock);
3f79768f2   Hugh Dickins   mm: rearrange put...
1886

747db954c   Johannes Weiner   mm: memcontrol: u...
1887
  	mem_cgroup_uncharge_list(&page_list);
2d4894b5d   Mel Gorman   mm: remove cold p...
1888
  	free_unref_page_list(&page_list);
e11da5b4f   Mel Gorman   tracing, vmscan: ...
1889

92df3a723   Mel Gorman   mm: vmscan: throt...
1890
  	/*
1c610d5f9   Andrey Ryabinin   mm/vmscan: wake u...
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
  	 * If dirty pages are scanned that are not queued for IO, it
  	 * implies that flushers are not doing their job. This can
  	 * happen when memory pressure pushes dirty pages to the end of
  	 * the LRU before the dirty limits are breached and the dirty
  	 * data has expired. It can also happen when the proportion of
  	 * dirty pages grows not through writes but through memory
  	 * pressure reclaiming all the clean cache. And in some cases,
  	 * the flushers simply cannot keep up with the allocation
  	 * rate. Nudge the flusher threads in case they are asleep.
  	 */
  	if (stat.nr_unqueued_dirty == nr_taken)
  		wakeup_flusher_threads(WB_REASON_VMSCAN);
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
1903
1904
1905
1906
1907
1908
1909
1910
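  	/*
  	 * Pass this batch's dirty, writeback and congestion counts up
  	 * through the scan control for the caller.
  	 */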
  	sc->nr.dirty += stat.nr_dirty;
  	sc->nr.congested += stat.nr_congested;
  	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
  	sc->nr.writeback += stat.nr_writeback;
  	sc->nr.immediate += stat.nr_immediate;
  	sc->nr.taken += nr_taken;
  	if (file)
  		sc->nr.file_taken += nr_taken;
8e9502828   Mel Gorman   mm: vmscan: move ...
1911

599d0c954   Mel Gorman   mm, vmscan: move ...
1912
  	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
d51d1e645   Steven Rostedt   mm, vmscan, traci...
1913
  			nr_scanned, nr_reclaimed, &stat, sc->priority, file);
05ff51376   Andrew Morton   [PATCH] vmscan re...
1914
  	return nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1915
  }
3bb1a852a   Martin Bligh   [PATCH] vmscan: F...
1916
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1917
1918
1919
1920
1921
1922
   * This moves pages from the active list to the inactive list.
   *
   * We move them the other way if the page is referenced by one or more
   * processes, from rmap.
   *
   * If the pages are mostly unmapped, the processing is fast and it is
a52633d8e   Mel Gorman   mm, vmscan: move ...
1923
   * appropriate to hold zone_lru_lock across the whole operation.  But if
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1924
   * the pages are mapped, the processing is slow (page_referenced()) so we
a52633d8e   Mel Gorman   mm, vmscan: move ...
1925
   * should drop zone_lru_lock around each page.  It's impossible to balance
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1926
1927
1928
1929
   * this, so instead we remove the pages from the LRU while processing them.
   * It is safe to rely on PG_active against the non-LRU pages in here because
   * nobody will play with that bit on a non-LRU page.
   *
0139aa7b7   Joonsoo Kim   mm: rename _count...
1930
   * The downside is that we have to touch page->_refcount against each page.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1931
   * But we had to alter page->flags anyway.
9d998b4f1   Michal Hocko   mm, vmscan: add a...
1932
1933
   *
   * Returns the number of pages moved to the given lru.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1934
   */
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
1935

9d998b4f1   Michal Hocko   mm, vmscan: add a...
1936
  static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
3eb4140f0   Wu Fengguang   vmscan: merge dup...
1937
  				     struct list_head *list,
2bcf88796   Hugh Dickins   mm: take pagevecs...
1938
  				     struct list_head *pages_to_free,
3eb4140f0   Wu Fengguang   vmscan: merge dup...
1939
1940
  				     enum lru_list lru)
  {
599d0c954   Mel Gorman   mm, vmscan: move ...
1941
  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
3eb4140f0   Wu Fengguang   vmscan: merge dup...
1942
  	struct page *page;
fa9add641   Hugh Dickins   mm/memcg: apply a...
1943
  	int nr_pages;
9d998b4f1   Michal Hocko   mm, vmscan: add a...
1944
  	int nr_moved = 0;
3eb4140f0   Wu Fengguang   vmscan: merge dup...
1945

3eb4140f0   Wu Fengguang   vmscan: merge dup...
1946
1947
  	while (!list_empty(list)) {
  		page = lru_to_page(list);
599d0c954   Mel Gorman   mm, vmscan: move ...
1948
  		lruvec = mem_cgroup_page_lruvec(page, pgdat);
3eb4140f0   Wu Fengguang   vmscan: merge dup...
1949

309381fea   Sasha Levin   mm: dump page whe...
1950
  		VM_BUG_ON_PAGE(PageLRU(page), page);
3eb4140f0   Wu Fengguang   vmscan: merge dup...
1951
  		SetPageLRU(page);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1952
  		nr_pages = hpage_nr_pages(page);
599d0c954   Mel Gorman   mm, vmscan: move ...
1953
  		update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
925b7673c   Johannes Weiner   mm: make per-memc...
1954
  		list_move(&page->lru, &lruvec->lists[lru]);
3eb4140f0   Wu Fengguang   vmscan: merge dup...
1955

2bcf88796   Hugh Dickins   mm: take pagevecs...
1956
1957
1958
  		if (put_page_testzero(page)) {
  			__ClearPageLRU(page);
  			__ClearPageActive(page);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1959
  			del_page_from_lru_list(page, lruvec, lru);
2bcf88796   Hugh Dickins   mm: take pagevecs...
1960
1961
  
  			if (unlikely(PageCompound(page))) {
599d0c954   Mel Gorman   mm, vmscan: move ...
1962
  				spin_unlock_irq(&pgdat->lru_lock);
747db954c   Johannes Weiner   mm: memcontrol: u...
1963
  				mem_cgroup_uncharge(page);
2bcf88796   Hugh Dickins   mm: take pagevecs...
1964
  				(*get_compound_page_dtor(page))(page);
599d0c954   Mel Gorman   mm, vmscan: move ...
1965
  				spin_lock_irq(&pgdat->lru_lock);
2bcf88796   Hugh Dickins   mm: take pagevecs...
1966
1967
  			} else
  				list_add(&page->lru, pages_to_free);
9d998b4f1   Michal Hocko   mm, vmscan: add a...
1968
1969
  		} else {
  			nr_moved += nr_pages;
3eb4140f0   Wu Fengguang   vmscan: merge dup...
1970
1971
  		}
  	}
9d5e6a9f2   Hugh Dickins   mm: update_lru_si...
1972

2262185c5   Roman Gushchin   mm: per-cgroup me...
1973
  	if (!is_active_lru(lru)) {
f0958906c   Michal Hocko   mm, vmscan: do no...
1974
  		__count_vm_events(PGDEACTIVATE, nr_moved);
2262185c5   Roman Gushchin   mm: per-cgroup me...
1975
1976
1977
  		count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
  				   nr_moved);
  	}
9d998b4f1   Michal Hocko   mm, vmscan: add a...
1978
1979
  
  	return nr_moved;
3eb4140f0   Wu Fengguang   vmscan: merge dup...
1980
  }
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
1981

f626012db   Hugh Dickins   mm: remove isolat...
1982
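  /*
   * Scan a batch of pages off the tail of an active list: referenced
   * executable file pages get another trip around the active list,
   * unevictable pages are put back, and the rest are deactivated onto the
   * corresponding inactive list.
   */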
  static void shrink_active_list(unsigned long nr_to_scan,
1a93be0e7   Konstantin Khlebnikov   mm/vmscan: push l...
1983
  			       struct lruvec *lruvec,
f16015fbf   Johannes Weiner   mm: vmscan: disti...
1984
  			       struct scan_control *sc,
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
1985
  			       enum lru_list lru)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1986
  {
44c241f16   KOSAKI Motohiro   mm: rename pgmove...
1987
  	unsigned long nr_taken;
f626012db   Hugh Dickins   mm: remove isolat...
1988
  	unsigned long nr_scanned;
6fe6b7e35   Wu Fengguang   vmscan: report vm...
1989
  	unsigned long vm_flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1990
  	LIST_HEAD(l_hold);	/* The pages which were snipped off */
8cab4754d   Wu Fengguang   vmscan: make mapp...
1991
  	LIST_HEAD(l_active);
b69408e88   Christoph Lameter   vmscan: Use an in...
1992
  	LIST_HEAD(l_inactive);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1993
  	struct page *page;
1a93be0e7   Konstantin Khlebnikov   mm/vmscan: push l...
1994
  	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
9d998b4f1   Michal Hocko   mm, vmscan: add a...
1995
1996
  	unsigned nr_deactivate, nr_activate;
  	unsigned nr_rotated = 0;
f3fd4a619   Konstantin Khlebnikov   mm: remove lru ty...
1997
  	isolate_mode_t isolate_mode = 0;
3cb994517   Konstantin Khlebnikov   mm: push lru inde...
1998
  	int file = is_file_lru(lru);
599d0c954   Mel Gorman   mm, vmscan: move ...
1999
  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2000
2001
  
  	lru_add_drain();
f80c06736   Minchan Kim   mm: zone_reclaim:...
2002
2003
  
  	if (!sc->may_unmap)
613172891   Hillf Danton   mm/vmscan.c: clea...
2004
  		isolate_mode |= ISOLATE_UNMAPPED;
f80c06736   Minchan Kim   mm: zone_reclaim:...
2005

599d0c954   Mel Gorman   mm, vmscan: move ...
2006
  	spin_lock_irq(&pgdat->lru_lock);
925b7673c   Johannes Weiner   mm: make per-memc...
2007

5dc35979e   Konstantin Khlebnikov   mm/vmscan: push l...
2008
2009
  	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
  				     &nr_scanned, sc, isolate_mode, lru);
89b5fae53   Johannes Weiner   mm: vmscan: disti...
2010

599d0c954   Mel Gorman   mm, vmscan: move ...
2011
  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
b7c46d151   Johannes Weiner   mm: drop unneeded...
2012
  	reclaim_stat->recent_scanned[file] += nr_taken;
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
2013

599d0c954   Mel Gorman   mm, vmscan: move ...
2014
  	__count_vm_events(PGREFILL, nr_scanned);
2262185c5   Roman Gushchin   mm: per-cgroup me...
2015
  	count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
9d5e6a9f2   Hugh Dickins   mm: update_lru_si...
2016

599d0c954   Mel Gorman   mm, vmscan: move ...
2017
  	spin_unlock_irq(&pgdat->lru_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2018

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2019
2020
2021
2022
  	while (!list_empty(&l_hold)) {
  		cond_resched();
  		page = lru_to_page(&l_hold);
  		list_del(&page->lru);
7e9cd4842   Rik van Riel   vmscan: fix pagec...
2023

39b5f29ac   Hugh Dickins   mm: remove vma ar...
2024
  		if (unlikely(!page_evictable(page))) {
894bc3104   Lee Schermerhorn   Unevictable LRU I...
2025
2026
2027
  			putback_lru_page(page);
  			continue;
  		}
cc715d99e   Mel Gorman   mm: vmscan: forci...
2028
2029
2030
2031
2032
2033
2034
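  		/*
  		 * Too many buffer_heads pinned: try to strip the buffers off
  		 * file pages as we deactivate them.
  		 */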
  		if (unlikely(buffer_heads_over_limit)) {
  			if (page_has_private(page) && trylock_page(page)) {
  				if (page_has_private(page))
  					try_to_release_page(page, 0);
  				unlock_page(page);
  			}
  		}
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
2035
2036
  		if (page_referenced(page, 0, sc->target_mem_cgroup,
  				    &vm_flags)) {
9992af102   Rik van Riel   thp: scale nr_rot...
2037
  			nr_rotated += hpage_nr_pages(page);
8cab4754d   Wu Fengguang   vmscan: make mapp...
2038
2039
2040
2041
2042
2043
2044
2045
2046
  			/*
  			 * Identify referenced, file-backed active pages and
  			 * give them one more trip around the active list, so
  			 * that executable code gets a better chance to stay in
  			 * memory under moderate memory pressure.  Anon pages
  			 * are not likely to be evicted by use-once streaming
  			 * IO, plus JVM can create lots of anon VM_EXEC pages,
  			 * so we ignore them here.
  			 */
41e20983f   Wu Fengguang   vmscan: limit VM_...
2047
  			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
8cab4754d   Wu Fengguang   vmscan: make mapp...
2048
2049
2050
2051
  				list_add(&page->lru, &l_active);
  				continue;
  			}
  		}
7e9cd4842   Rik van Riel   vmscan: fix pagec...
2052

5205e56ee   KOSAKI Motohiro   vmscan: move Clea...
2053
  		ClearPageActive(page);	/* we are de-activating */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2054
2055
  		list_add(&page->lru, &l_inactive);
  	}
b555749aa   Andrew Morton   vmscan: shrink_ac...
2056
  	/*
8cab4754d   Wu Fengguang   vmscan: make mapp...
2057
  	 * Move pages back to the lru list.
b555749aa   Andrew Morton   vmscan: shrink_ac...
2058
  	 */
599d0c954   Mel Gorman   mm, vmscan: move ...
2059
  	spin_lock_irq(&pgdat->lru_lock);
4f98a2fee   Rik van Riel   vmscan: split LRU...
2060
  	/*
8cab4754d   Wu Fengguang   vmscan: make mapp...
2061
2062
2063
  	 * Count referenced pages from currently used mappings as rotated,
  	 * even though only some of them are actually re-activated.  This
  	 * helps balance scan pressure between file and anonymous pages in
7c0db9e91   Jerome Marchand   mm, vmscan: fix a...
2064
  	 * get_scan_count.
7e9cd4842   Rik van Riel   vmscan: fix pagec...
2065
  	 */
b7c46d151   Johannes Weiner   mm: drop unneeded...
2066
  	reclaim_stat->recent_rotated[file] += nr_rotated;
556adecba   Rik van Riel   vmscan: second ch...
2067

9d998b4f1   Michal Hocko   mm, vmscan: add a...
2068
2069
  	nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
  	nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
599d0c954   Mel Gorman   mm, vmscan: move ...
2070
2071
  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
  	spin_unlock_irq(&pgdat->lru_lock);
2bcf88796   Hugh Dickins   mm: take pagevecs...
2072

747db954c   Johannes Weiner   mm: memcontrol: u...
2073
  	mem_cgroup_uncharge_list(&l_hold);
2d4894b5d   Mel Gorman   mm: remove cold p...
2074
  	free_unref_page_list(&l_hold);
9d998b4f1   Michal Hocko   mm, vmscan: add a...
2075
2076
  	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
  			nr_deactivate, nr_rotated, sc->priority, file);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2077
  }
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2078
2079
2080
  /*
   * The inactive anon list should be small enough that the VM never has
   * to do too much work.
14797e236   KOSAKI Motohiro   memcg: add inacti...
2081
   *
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2082
2083
2084
   * The inactive file list should be small enough to leave most memory
   * to the established workingset on the scan-resistant active list,
   * but large enough to avoid thrashing the aggregate readahead window.
56e49d218   Rik van Riel   vmscan: evict use...
2085
   *
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2086
2087
   * Both inactive lists should also be large enough that each inactive
   * page has a chance to be referenced again before it is reclaimed.
56e49d218   Rik van Riel   vmscan: evict use...
2088
   *
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2089
2090
   * If that fails and refaulting is observed, the inactive list grows.
   *
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2091
   * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
3a50d14d0   Andrey Ryabinin   mm: remove unused...
2092
   * on this LRU, maintained by the pageout code. An inactive_ratio
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2093
   * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
56e49d218   Rik van Riel   vmscan: evict use...
2094
   *
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
   * total     target    max
   * memory    ratio     inactive
   * -------------------------------------
   *   10MB       1         5MB
   *  100MB       1        50MB
   *    1GB       3       250MB
   *   10GB      10       0.9GB
   *  100GB      31         3GB
   *    1TB     101        10GB
   *   10TB     320        32GB
56e49d218   Rik van Riel   vmscan: evict use...
2105
   */
f8d1a3116   Mel Gorman   mm: consider whet...
2106
  static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2107
2108
  				 struct mem_cgroup *memcg,
  				 struct scan_control *sc, bool actual_reclaim)
56e49d218   Rik van Riel   vmscan: evict use...
2109
  {
fd5388037   Michal Hocko   mm, vmscan: clean...
2110
  	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2111
2112
2113
2114
2115
  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  	enum lru_list inactive_lru = file * LRU_FILE;
  	unsigned long inactive, active;
  	unsigned long inactive_ratio;
  	unsigned long refaults;
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2116
  	unsigned long gb;
e3790144c   Johannes Weiner   mm: refactor inac...
2117

59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2118
2119
2120
2121
2122
2123
  	/*
  	 * If we don't have swap space, anonymous page deactivation
  	 * is pointless.
  	 */
  	if (!file && !total_swap_pages)
  		return false;
56e49d218   Rik van Riel   vmscan: evict use...
2124

fd5388037   Michal Hocko   mm, vmscan: clean...
2125
2126
  	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
  	active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
f8d1a3116   Mel Gorman   mm: consider whet...
2127

2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2128
  	if (memcg)
ccda7f436   Johannes Weiner   mm: memcontrol: u...
2129
  		refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
b39415b27   Rik van Riel   vmscan: do not ev...
2130
  	else
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
  		refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
  
  	/*
  	 * When refaults are being observed, it means a new workingset
  	 * is being established. Disable active list protection to get
  	 * rid of the stale workingset quickly.
  	 */
  	if (file && actual_reclaim && lruvec->refaults != refaults) {
  		inactive_ratio = 0;
  	} else {
  		gb = (inactive + active) >> (30 - PAGE_SHIFT);
  		if (gb)
  			inactive_ratio = int_sqrt(10 * gb);
  		else
  			inactive_ratio = 1;
  	}
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2147

2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2148
2149
2150
2151
2152
  	if (actual_reclaim)
  		trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
  			lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
  			lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
  			inactive_ratio, file);
fd5388037   Michal Hocko   mm, vmscan: clean...
2153

59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2154
  	return inactive * inactive_ratio < active;
b39415b27   Rik van Riel   vmscan: do not ev...
2155
  }
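
The table above falls straight out of the arithmetic in inactive_list_is_low(): the total LRU size in gigabytes feeds int_sqrt(10 * gb), and the comparison "inactive * inactive_ratio < active" balances out at roughly total / (ratio + 1) pages on the inactive list. A minimal user-space sketch of that heuristic (not kernel code; the naive isqrt() below only stands in for the kernel's int_sqrt()) reproduces the table:

  #include <stdio.h>

  /* naive integer square root, standing in for the kernel's int_sqrt() */
  static unsigned long isqrt(unsigned long x)
  {
  	unsigned long r = 0;

  	while ((r + 1) * (r + 1) <= x)
  		r++;
  	return r;
  }

  int main(void)
  {
  	/* total LRU sizes in megabytes, matching the rows of the table */
  	unsigned long totals_mb[] = { 10, 100, 1024, 10240, 102400,
  				      1048576, 10485760 };
  	unsigned int i;

  	for (i = 0; i < sizeof(totals_mb) / sizeof(totals_mb[0]); i++) {
  		unsigned long gb = totals_mb[i] >> 10;	/* like >> (30 - PAGE_SHIFT) */
  		unsigned long ratio = gb ? isqrt(10 * gb) : 1;

  		/* at balance, inactive * ratio == active, so the inactive
  		 * list holds about total / (ratio + 1) */
  		printf("%8luMB  ratio %3lu  max inactive %luMB\n",
  		       totals_mb[i], ratio, totals_mb[i] / (ratio + 1));
  	}
  	return 0;
  }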
4f98a2fee   Rik van Riel   vmscan: split LRU...
2156
  static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2157
2158
  				 struct lruvec *lruvec, struct mem_cgroup *memcg,
  				 struct scan_control *sc)
b69408e88   Christoph Lameter   vmscan: Use an in...
2159
  {
b39415b27   Rik van Riel   vmscan: do not ev...
2160
  	if (is_active_lru(lru)) {
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2161
2162
  		if (inactive_list_is_low(lruvec, is_file_lru(lru),
  					 memcg, sc, true))
1a93be0e7   Konstantin Khlebnikov   mm/vmscan: push l...
2163
  			shrink_active_list(nr_to_scan, lruvec, sc, lru);
556adecba   Rik van Riel   vmscan: second ch...
2164
2165
  		return 0;
  	}
1a93be0e7   Konstantin Khlebnikov   mm/vmscan: push l...
2166
  	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
4f98a2fee   Rik van Riel   vmscan: split LRU...
2167
  }
9a2651140   Johannes Weiner   mm: vmscan: clean...
2168
2169
2170
2171
2172
2173
  enum scan_balance {
  	SCAN_EQUAL,
  	SCAN_FRACT,
  	SCAN_ANON,
  	SCAN_FILE,
  };
4f98a2fee   Rik van Riel   vmscan: split LRU...
2174
2175
2176
2177
2178
2179
  /*
   * Determine how aggressively the anon and file LRU lists should be
   * scanned.  The relative value of each set of LRU lists is determined
   * by looking at the fraction of the scanned pages that we rotated back
   * onto the active list instead of evicting.
   *
be7bd59db   Wanpeng Li   mm: fix page recl...
2180
2181
   * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
   * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
4f98a2fee   Rik van Riel   vmscan: split LRU...
2182
   */
333776785   Vladimir Davydov   mm: vmscan: pass ...
2183
  static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
2184
2185
  			   struct scan_control *sc, unsigned long *nr,
  			   unsigned long *lru_pages)
4f98a2fee   Rik van Riel   vmscan: split LRU...
2186
  {
333776785   Vladimir Davydov   mm: vmscan: pass ...
2187
  	int swappiness = mem_cgroup_swappiness(memcg);
9a2651140   Johannes Weiner   mm: vmscan: clean...
2188
2189
2190
  	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
  	u64 fraction[2];
  	u64 denominator = 0;	/* gcc */
599d0c954   Mel Gorman   mm, vmscan: move ...
2191
  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4f98a2fee   Rik van Riel   vmscan: split LRU...
2192
  	unsigned long anon_prio, file_prio;
9a2651140   Johannes Weiner   mm: vmscan: clean...
2193
  	enum scan_balance scan_balance;
0bf1457f0   Johannes Weiner   mm: vmscan: do no...
2194
  	unsigned long anon, file;
4f98a2fee   Rik van Riel   vmscan: split LRU...
2195
  	unsigned long ap, fp;
4111304da   Hugh Dickins   mm: enum lru_list...
2196
  	enum lru_list lru;
76a33fc38   Shaohua Li   vmscan: prevent g...
2197
2198
  
  	/* If we have no swap space, do not bother scanning anon pages. */
d8b38438a   Vladimir Davydov   mm: vmscan: do no...
2199
  	if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
9a2651140   Johannes Weiner   mm: vmscan: clean...
2200
  		scan_balance = SCAN_FILE;
76a33fc38   Shaohua Li   vmscan: prevent g...
2201
2202
  		goto out;
  	}
4f98a2fee   Rik van Riel   vmscan: split LRU...
2203

10316b313   Johannes Weiner   mm: vmscan: clari...
2204
2205
2206
2207
2208
2209
2210
  	/*
  	 * Global reclaim will swap to prevent OOM even with no
  	 * swappiness, but memcg users want to use this knob to
  	 * disable swapping for individual groups completely when
  	 * using the memory controller's swap limit feature would be
  	 * too expensive.
  	 */
02695175c   Johannes Weiner   mm: vmscan: move ...
2211
  	if (!global_reclaim(sc) && !swappiness) {
9a2651140   Johannes Weiner   mm: vmscan: clean...
2212
  		scan_balance = SCAN_FILE;
10316b313   Johannes Weiner   mm: vmscan: clari...
2213
2214
2215
2216
2217
2218
2219
2220
  		goto out;
  	}
  
  	/*
  	 * Do not apply any pressure balancing cleverness when the
  	 * system is close to OOM, scan both anon and file equally
  	 * (unless the swappiness setting disagrees with swapping).
  	 */
02695175c   Johannes Weiner   mm: vmscan: move ...
2221
  	if (!sc->priority && swappiness) {
9a2651140   Johannes Weiner   mm: vmscan: clean...
2222
  		scan_balance = SCAN_EQUAL;
10316b313   Johannes Weiner   mm: vmscan: clari...
2223
2224
  		goto out;
  	}
11d16c25b   Johannes Weiner   mm: vmscan: impro...
2225
  	/*
623762517   Johannes Weiner   revert "mm: vmsca...
2226
2227
2228
2229
2230
2231
2232
2233
2234
  	 * Prevent the reclaimer from falling into the cache trap: as
  	 * cache pages start out inactive, every cache fault will tip
  	 * the scan balance towards the file LRU.  And as the file LRU
  	 * shrinks, so does the window for rotation from references.
  	 * This means we have a runaway feedback loop where a tiny
  	 * thrashing file LRU becomes infinitely more attractive than
  	 * anon pages.  Try to detect this based on file LRU size.
  	 */
  	if (global_reclaim(sc)) {
599d0c954   Mel Gorman   mm, vmscan: move ...
2235
2236
2237
2238
  		unsigned long pgdatfile;
  		unsigned long pgdatfree;
  		int z;
  		unsigned long total_high_wmark = 0;
2ab051e11   Jerome Marchand   memcg, vmscan: Fi...
2239

599d0c954   Mel Gorman   mm, vmscan: move ...
2240
2241
2242
2243
2244
2245
  		pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
  		pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
  			   node_page_state(pgdat, NR_INACTIVE_FILE);
  
  		for (z = 0; z < MAX_NR_ZONES; z++) {
  			struct zone *zone = &pgdat->node_zones[z];
6aa303def   Mel Gorman   mm, vmscan: only ...
2246
  			if (!managed_zone(zone))
599d0c954   Mel Gorman   mm, vmscan: move ...
2247
2248
2249
2250
  				continue;
  
  			total_high_wmark += high_wmark_pages(zone);
  		}
623762517   Johannes Weiner   revert "mm: vmsca...
2251

599d0c954   Mel Gorman   mm, vmscan: move ...
2252
  		if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
062262267   David Rientjes   mm, vmscan: avoid...
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
  			/*
  			 * Force SCAN_ANON if there are enough inactive
  			 * anonymous pages on the LRU in eligible zones.
  			 * Otherwise, the small LRU gets thrashed.
  			 */
  			if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
  			    lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
  					>> sc->priority) {
  				scan_balance = SCAN_ANON;
  				goto out;
  			}
623762517   Johannes Weiner   revert "mm: vmsca...
2264
2265
2266
2267
  		}
  	}
  
  	/*
316bda0e6   Vladimir Davydov   vmscan: do not fo...
2268
2269
2270
2271
2272
2273
2274
  	 * If there is enough inactive page cache, i.e. if the size of the
  	 * inactive list is greater than that of the active list *and* the
  	 * inactive list actually has some pages to scan on this priority, we
  	 * do not reclaim anything from the anonymous working set right now.
  	 * Without the second condition we could end up never scanning an
  	 * lruvec even if it has plenty of old anonymous pages unless the
  	 * system is under heavy pressure.
7c5bd705d   Johannes Weiner   mm: memcg: only e...
2275
  	 */
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2276
  	if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
71ab6cfe8   Michal Hocko   mm, vmscan: consi...
2277
  	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
9a2651140   Johannes Weiner   mm: vmscan: clean...
2278
  		scan_balance = SCAN_FILE;
7c5bd705d   Johannes Weiner   mm: memcg: only e...
2279
2280
  		goto out;
  	}
9a2651140   Johannes Weiner   mm: vmscan: clean...
2281
  	scan_balance = SCAN_FRACT;
7c5bd705d   Johannes Weiner   mm: memcg: only e...
2282
  	/*
58c37f6e0   KOSAKI Motohiro   vmscan: protect r...
2283
2284
2285
  	 * With swappiness at 100, anonymous and file have the same priority.
  	 * This scanning priority is essentially the inverse of IO cost.
  	 */
02695175c   Johannes Weiner   mm: vmscan: move ...
2286
  	anon_prio = swappiness;
75b00af77   Hugh Dickins   mm: trivial clean...
2287
  	file_prio = 200 - anon_prio;
58c37f6e0   KOSAKI Motohiro   vmscan: protect r...
2288
2289
  
  	/*
4f98a2fee   Rik van Riel   vmscan: split LRU...
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
  	 * OK, so we have swap space and a fair amount of page cache
  	 * pages.  We use the recently rotated / recently scanned
  	 * ratios to determine how valuable each cache is.
  	 *
  	 * Because workloads change over time (and to avoid overflow)
  	 * we keep these statistics as a floating average, which ends
  	 * up weighing recent references more than old ones.
  	 *
  	 * anon in [0], file in [1]
  	 */
2ab051e11   Jerome Marchand   memcg, vmscan: Fi...
2300

fd5388037   Michal Hocko   mm, vmscan: clean...
2301
2302
2303
2304
  	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
  		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
  	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
  		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
2ab051e11   Jerome Marchand   memcg, vmscan: Fi...
2305

599d0c954   Mel Gorman   mm, vmscan: move ...
2306
  	spin_lock_irq(&pgdat->lru_lock);
6e9015716   KOSAKI Motohiro   mm: introduce zon...
2307
  	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
6e9015716   KOSAKI Motohiro   mm: introduce zon...
2308
2309
  		reclaim_stat->recent_scanned[0] /= 2;
  		reclaim_stat->recent_rotated[0] /= 2;
4f98a2fee   Rik van Riel   vmscan: split LRU...
2310
  	}
6e9015716   KOSAKI Motohiro   mm: introduce zon...
2311
  	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
6e9015716   KOSAKI Motohiro   mm: introduce zon...
2312
2313
  		reclaim_stat->recent_scanned[1] /= 2;
  		reclaim_stat->recent_rotated[1] /= 2;
4f98a2fee   Rik van Riel   vmscan: split LRU...
2314
2315
2316
  	}
  
  	/*
00d8089c5   Rik van Riel   vmscan: fix get_s...
2317
2318
2319
  	 * The amount of pressure on anon vs file pages is inversely
  	 * proportional to the fraction of recently scanned pages on
  	 * each list that were recently referenced and in active use.
4f98a2fee   Rik van Riel   vmscan: split LRU...
2320
  	 */
fe35004fb   Satoru Moriya   mm: avoid swappin...
2321
  	ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
6e9015716   KOSAKI Motohiro   mm: introduce zon...
2322
  	ap /= reclaim_stat->recent_rotated[0] + 1;
4f98a2fee   Rik van Riel   vmscan: split LRU...
2323

fe35004fb   Satoru Moriya   mm: avoid swappin...
2324
  	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
6e9015716   KOSAKI Motohiro   mm: introduce zon...
2325
  	fp /= reclaim_stat->recent_rotated[1] + 1;
599d0c954   Mel Gorman   mm, vmscan: move ...
2326
  	spin_unlock_irq(&pgdat->lru_lock);
4f98a2fee   Rik van Riel   vmscan: split LRU...
2327

76a33fc38   Shaohua Li   vmscan: prevent g...
2328
2329
2330
2331
  	fraction[0] = ap;
  	fraction[1] = fp;
  	denominator = ap + fp + 1;
  out:
688035f72   Johannes Weiner   mm: don't avoid h...
2332
2333
2334
2335
2336
  	*lru_pages = 0;
  	for_each_evictable_lru(lru) {
  		int file = is_file_lru(lru);
  		unsigned long size;
  		unsigned long scan;
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
2337

688035f72   Johannes Weiner   mm: don't avoid h...
2338
2339
2340
2341
2342
2343
2344
2345
  		size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
  		scan = size >> sc->priority;
  		/*
  		 * If the cgroup's already been deleted, make sure to
  		 * scrape out the remaining cache.
  		 */
  		if (!scan && !mem_cgroup_online(memcg))
  			scan = min(size, SWAP_CLUSTER_MAX);
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
2346

688035f72   Johannes Weiner   mm: don't avoid h...
2347
2348
2349
2350
2351
  		switch (scan_balance) {
  		case SCAN_EQUAL:
  			/* Scan lists relative to size */
  			break;
  		case SCAN_FRACT:
9a2651140   Johannes Weiner   mm: vmscan: clean...
2352
  			/*
688035f72   Johannes Weiner   mm: don't avoid h...
2353
2354
  			 * Scan types proportional to swappiness and
  			 * their relative recent reclaim efficiency.
a5e880969   Roman Gushchin   mm: don't miss th...
2355
2356
  			 * Make sure we don't miss the last page
  			 * because of a round-off error.
9a2651140   Johannes Weiner   mm: vmscan: clean...
2357
  			 */
a5e880969   Roman Gushchin   mm: don't miss th...
2358
2359
  			scan = DIV64_U64_ROUND_UP(scan * fraction[file],
  						  denominator);
688035f72   Johannes Weiner   mm: don't avoid h...
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
  			break;
  		case SCAN_FILE:
  		case SCAN_ANON:
  			/* Scan one type exclusively */
  			if ((scan_balance == SCAN_FILE) != file) {
  				size = 0;
  				scan = 0;
  			}
  			break;
  		default:
  			/* Look ma, no brain */
  			BUG();
9a2651140   Johannes Weiner   mm: vmscan: clean...
2372
  		}
688035f72   Johannes Weiner   mm: don't avoid h...
2373
2374
2375
  
  		*lru_pages += size;
  		nr[lru] = scan;
76a33fc38   Shaohua Li   vmscan: prevent g...
2376
  	}
6e08a369e   Wu Fengguang   vmscan: cleanup t...
2377
  }
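
The SCAN_FRACT case above boils down to a weighted split of each list's scan budget: swappiness sets the base weight of anon versus file, and the recent_scanned/recent_rotated ratios discount whichever type has been rotating back more. A stand-alone sketch of that arithmetic (all sample values are hypothetical), including the round-up division that keeps the last page from being lost:

  #include <stdio.h>

  int main(void)
  {
  	/* hypothetical samples; index 0 is anon, index 1 is file */
  	unsigned long long swappiness = 60;
  	unsigned long long recent_scanned[2] = { 4000, 12000 };
  	unsigned long long recent_rotated[2] = {  900,  2000 };

  	unsigned long long anon_prio = swappiness;
  	unsigned long long file_prio = 200 - anon_prio;

  	/* pressure is inversely proportional to the recent rotation rate */
  	unsigned long long ap = anon_prio * (recent_scanned[0] + 1) /
  				(recent_rotated[0] + 1);
  	unsigned long long fp = file_prio * (recent_scanned[1] + 1) /
  				(recent_rotated[1] + 1);

  	unsigned long long fraction[2] = { ap, fp };
  	unsigned long long denominator = ap + fp + 1;

  	/* per-list scan budget at this priority: size >> priority */
  	unsigned long long scan = 32768 >> 12;
  	int file;

  	for (file = 0; file <= 1; file++) {
  		/* round up, as DIV64_U64_ROUND_UP() does above */
  		unsigned long long share = (scan * fraction[file] +
  					    denominator - 1) / denominator;

  		printf("%s: scan %llu of %llu pages\n",
  		       file ? "file" : "anon", share, scan);
  	}
  	return 0;
  }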
4f98a2fee   Rik van Riel   vmscan: split LRU...
2378

9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2379
  /*
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2380
   * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2381
   */
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2382
  static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
333776785   Vladimir Davydov   mm: vmscan: pass ...
2383
  			      struct scan_control *sc, unsigned long *lru_pages)
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2384
  {
ef8f23279   Mel Gorman   mm, memcg: move m...
2385
  	struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2386
  	unsigned long nr[NR_LRU_LISTS];
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2387
  	unsigned long targets[NR_LRU_LISTS];
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2388
2389
2390
2391
2392
  	unsigned long nr_to_scan;
  	enum lru_list lru;
  	unsigned long nr_reclaimed = 0;
  	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
  	struct blk_plug plug;
1a501907b   Mel Gorman   mm: vmscan: use p...
2393
  	bool scan_adjusted;
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2394

333776785   Vladimir Davydov   mm: vmscan: pass ...
2395
  	get_scan_count(lruvec, memcg, sc, nr, lru_pages);
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2396

e82e0561d   Mel Gorman   mm: vmscan: obey ...
2397
2398
  	/* Record the original scan target for proportional adjustments later */
  	memcpy(targets, nr, sizeof(nr));
1a501907b   Mel Gorman   mm: vmscan: use p...
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
  	/*
  	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
  	 * event that can occur when there is little memory pressure, e.g.
  	 * multiple streaming readers/writers. Hence, we do not abort scanning
  	 * when the requested number of pages has been reclaimed while scanning
  	 * at DEF_PRIORITY, on the assumption that the fact that we are direct
  	 * reclaiming implies that kswapd is not keeping up and it is best to
  	 * do a batch of work at once. For memcg reclaim one check is made to
  	 * abort proportional reclaim if either the file or anon LRU has already
  	 * dropped to zero at the first pass.
  	 */
  	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
  			 sc->priority == DEF_PRIORITY);
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2412
2413
2414
  	blk_start_plug(&plug);
  	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
  					nr[LRU_INACTIVE_FILE]) {
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2415
2416
  		unsigned long nr_anon, nr_file, percentage;
  		unsigned long nr_scanned;
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2417
2418
2419
2420
2421
2422
  		for_each_evictable_lru(lru) {
  			if (nr[lru]) {
  				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
  				nr[lru] -= nr_to_scan;
  
  				nr_reclaimed += shrink_list(lru, nr_to_scan,
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2423
  							    lruvec, memcg, sc);
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2424
2425
  			}
  		}
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2426

bd041733c   Michal Hocko   mm, vmscan: add c...
2427
  		cond_resched();
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2428
2429
  		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
  			continue;
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2430
  		/*
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2431
  		 * For kswapd and memcg, reclaim at least the number of pages
1a501907b   Mel Gorman   mm: vmscan: use p...
2432
  		 * requested. Ensure that the anon and file LRUs are scanned
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2433
2434
2435
2436
2437
2438
  		 * proportionally to what was requested by get_scan_count(). We
  		 * stop reclaiming one LRU and reduce the amount of scanning
  		 * proportionally to the original scan target.
  		 */
  		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
  		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
1a501907b   Mel Gorman   mm: vmscan: use p...
2439
2440
2441
2442
2443
2444
2445
2446
  		/*
  		 * It's just vindictive to attack the larger once the smaller
  		 * has gone to zero.  And given the way we stop scanning the
  		 * smaller below, this makes sure that we only make one nudge
  		 * towards proportionality once we've got nr_to_reclaim.
  		 */
  		if (!nr_file || !nr_anon)
  			break;
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
  		if (nr_file > nr_anon) {
  			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
  						targets[LRU_ACTIVE_ANON] + 1;
  			lru = LRU_BASE;
  			percentage = nr_anon * 100 / scan_target;
  		} else {
  			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
  						targets[LRU_ACTIVE_FILE] + 1;
  			lru = LRU_FILE;
  			percentage = nr_file * 100 / scan_target;
  		}
  
  		/* Stop scanning the smaller of the two LRUs */
  		nr[lru] = 0;
  		nr[lru + LRU_ACTIVE] = 0;
  
  		/*
  		 * Recalculate the other LRU's scan count based on its original
  		 * scan target and the percentage of scanning already complete.
  		 */
  		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
  		nr_scanned = targets[lru] - nr[lru];
  		nr[lru] = targets[lru] * (100 - percentage) / 100;
  		nr[lru] -= min(nr[lru], nr_scanned);
  
  		lru += LRU_ACTIVE;
  		nr_scanned = targets[lru] - nr[lru];
  		nr[lru] = targets[lru] * (100 - percentage) / 100;
  		nr[lru] -= min(nr[lru], nr_scanned);
  
  		scan_adjusted = true;
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2478
2479
2480
2481
2482
2483
2484
2485
  	}
  	blk_finish_plug(&plug);
  	sc->nr_reclaimed += nr_reclaimed;
  
  	/*
  	 * Even if we did not try to evict anon pages at all, we want to
  	 * rebalance the anon lru active/inactive ratio.
  	 */
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2486
  	if (inactive_list_is_low(lruvec, false, memcg, sc, true))
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2487
2488
  		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
  				   sc, LRU_ACTIVE_ANON);
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2489
  }
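
The proportional adjustment inside the reclaim loop above can be tried in isolation. In this stand-alone sketch (hypothetical numbers, not kernel code), anon is the smaller type, so it stops being scanned once nr_to_reclaim has been met, and the file targets are cut back so that, overall, file ends up scanned in the same proportion of its original target as anon was:

  #include <stdio.h>

  #define MIN(a, b)	((a) < (b) ? (a) : (b))

  int main(void)
  {
  	/* hypothetical scan targets and remaining work:
  	 * [0] inactive anon, [1] active anon, [2] inactive file, [3] active file */
  	unsigned long targets[4] = { 100, 50, 1000, 400 };
  	unsigned long nr[4]      = {  60, 20,  700, 250 };

  	unsigned long nr_anon = nr[0] + nr[1];
  	unsigned long nr_file = nr[2] + nr[3];
  	unsigned long scan_target, percentage, nr_scanned;
  	int lru;

  	printf("remaining before: anon %lu, file %lu\n", nr_anon, nr_file);

  	/* anon is the smaller type here: what share of its target is left? */
  	scan_target = targets[0] + targets[1] + 1;
  	percentage = nr_anon * 100 / scan_target;

  	/* stop scanning the smaller type entirely */
  	nr[0] = nr[1] = 0;

  	/* scale the larger type back by the share the smaller one completed */
  	for (lru = 2; lru <= 3; lru++) {
  		nr_scanned = targets[lru] - nr[lru];
  		nr[lru] = targets[lru] * (100 - percentage) / 100;
  		nr[lru] -= MIN(nr[lru], nr_scanned);
  	}

  	printf("remaining after:  file inactive %lu, file active %lu\n",
  	       nr[2], nr[3]);
  	return 0;
  }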
23b9da55c   Mel Gorman   mm: vmscan: remov...
2490
  /* Use reclaim/compaction for costly allocs or under memory pressure */
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
2491
  static bool in_reclaim_compaction(struct scan_control *sc)
23b9da55c   Mel Gorman   mm: vmscan: remov...
2492
  {
d84da3f9e   Kirill A. Shutemov   mm: use IS_ENABLE...
2493
  	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
23b9da55c   Mel Gorman   mm: vmscan: remov...
2494
  			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
2495
  			 sc->priority < DEF_PRIORITY - 2))
23b9da55c   Mel Gorman   mm: vmscan: remov...
2496
2497
2498
2499
  		return true;
  
  	return false;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2500
  /*
23b9da55c   Mel Gorman   mm: vmscan: remov...
2501
2502
2503
2504
2505
   * Reclaim/compaction is used for high-order allocation requests. It reclaims
   * order-0 pages before compacting the zone. should_continue_reclaim() returns
   * true if more pages should be reclaimed so that, when the page allocator
   * calls try_to_compact_zone(), it will have enough free pages to succeed.
   * It will give up earlier than that if there is difficulty reclaiming pages.
3e7d34497   Mel Gorman   mm: vmscan: recla...
2506
   */
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2507
  static inline bool should_continue_reclaim(struct pglist_data *pgdat,
3e7d34497   Mel Gorman   mm: vmscan: recla...
2508
2509
2510
2511
2512
2513
  					unsigned long nr_reclaimed,
  					unsigned long nr_scanned,
  					struct scan_control *sc)
  {
  	unsigned long pages_for_compaction;
  	unsigned long inactive_lru_pages;
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2514
  	int z;
3e7d34497   Mel Gorman   mm: vmscan: recla...
2515
2516
  
  	/* If not in reclaim/compaction mode, stop */
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
2517
  	if (!in_reclaim_compaction(sc))
3e7d34497   Mel Gorman   mm: vmscan: recla...
2518
  		return false;
2876592f2   Mel Gorman   mm: vmscan: stop ...
2519
  	/* Consider stopping depending on scan and reclaim activity */
dcda9b047   Michal Hocko   mm, tree wide: re...
2520
  	if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
2876592f2   Mel Gorman   mm: vmscan: stop ...
2521
  		/*
dcda9b047   Michal Hocko   mm, tree wide: re...
2522
  		 * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
2876592f2   Mel Gorman   mm: vmscan: stop ...
2523
2524
  		 * full LRU list has been scanned and we are still failing
  		 * to reclaim pages. This full LRU scan is potentially
dcda9b047   Michal Hocko   mm, tree wide: re...
2525
  		 * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
2876592f2   Mel Gorman   mm: vmscan: stop ...
2526
2527
2528
2529
2530
  		 */
  		if (!nr_reclaimed && !nr_scanned)
  			return false;
  	} else {
  		/*
dcda9b047   Michal Hocko   mm, tree wide: re...
2531
  		 * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
2876592f2   Mel Gorman   mm: vmscan: stop ...
2532
2533
2534
2535
2536
2537
2538
2539
2540
  		 * fail without consequence, stop if we failed to reclaim
  		 * any pages from the last SWAP_CLUSTER_MAX number of
  		 * pages that were scanned. This returns to the caller
  		 * faster, at the risk that reclaim/compaction and the
  		 * resulting allocation attempt fail.
  		 */
  		if (!nr_reclaimed)
  			return false;
  	}
3e7d34497   Mel Gorman   mm: vmscan: recla...
2541
2542
2543
2544
2545
  
  	/*
  	 * If we have not reclaimed enough pages for compaction and the
  	 * inactive lists are large enough, continue reclaiming
  	 */
9861a62c3   Vlastimil Babka   mm, compaction: c...
2546
  	pages_for_compaction = compact_gap(sc->order);
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2547
  	inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
ec8acf20a   Shaohua Li   swap: add per-par...
2548
  	if (get_nr_swap_pages() > 0)
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2549
  		inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
3e7d34497   Mel Gorman   mm: vmscan: recla...
2550
2551
2552
2553
2554
  	if (sc->nr_reclaimed < pages_for_compaction &&
  			inactive_lru_pages > pages_for_compaction)
  		return true;
  
  	/* If compaction would go ahead or the allocation would succeed, stop */
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2555
2556
  	for (z = 0; z <= sc->reclaim_idx; z++) {
  		struct zone *zone = &pgdat->node_zones[z];
6aa303def   Mel Gorman   mm, vmscan: only ...
2557
  		if (!managed_zone(zone))
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2558
2559
2560
  			continue;
  
  		switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
cf378319d   Vlastimil Babka   mm, compaction: r...
2561
  		case COMPACT_SUCCESS:
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2562
2563
2564
2565
2566
2567
  		case COMPACT_CONTINUE:
  			return false;
  		default:
  			/* check next zone */
  			;
  		}
3e7d34497   Mel Gorman   mm: vmscan: recla...
2568
  	}
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2569
  	return true;
3e7d34497   Mel Gorman   mm: vmscan: recla...
2570
  }
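
The heart of should_continue_reclaim() is a single size comparison. A rough stand-alone illustration (hypothetical numbers; compact_gap() is assumed here to be twice the request size in pages, mirroring its definition in <linux/compaction.h>): reclaim keeps going while fewer pages than the compaction gap have been reclaimed and the inactive lists could still supply that many.

  #include <stdbool.h>
  #include <stdio.h>

  /* assumption: mirrors compact_gap() from <linux/compaction.h>, i.e. twice
   * the requested allocation size in pages */
  static unsigned long compact_gap(unsigned int order)
  {
  	return 2UL << order;
  }

  int main(void)
  {
  	unsigned int order = 9;			/* e.g. a 2MB THP request */
  	unsigned long nr_reclaimed = 300;	/* hypothetical progress so far */
  	unsigned long inactive_lru_pages = 5000; /* hypothetical inactive pages */

  	unsigned long pages_for_compaction = compact_gap(order);
  	bool keep_going = nr_reclaimed < pages_for_compaction &&
  			  inactive_lru_pages > pages_for_compaction;

  	printf("gap %lu pages, reclaimed %lu, inactive %lu -> %s\n",
  	       pages_for_compaction, nr_reclaimed, inactive_lru_pages,
  	       keep_going ? "continue reclaiming" : "stop");
  	return 0;
  }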
e3c1ac586   Andrey Ryabinin   mm/vmscan: don't ...
2571
2572
2573
2574
2575
  static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
  {
  	return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
  		(memcg && memcg_congested(pgdat, memcg));
  }
970a39a36   Mel Gorman   mm, vmscan: avoid...
2576
  static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2577
  {
cb731d6c6   Vladimir Davydov   vmscan: per memor...
2578
  	struct reclaim_state *reclaim_state = current->reclaim_state;
f0fdc5e8e   Johannes Weiner   vmscan: fix zone ...
2579
  	unsigned long nr_reclaimed, nr_scanned;
2344d7e44   Johannes Weiner   mm: vmscan: remov...
2580
  	bool reclaimable = false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2581

9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2582
2583
2584
  	do {
  		struct mem_cgroup *root = sc->target_mem_cgroup;
  		struct mem_cgroup_reclaim_cookie reclaim = {
ef8f23279   Mel Gorman   mm, memcg: move m...
2585
  			.pgdat = pgdat,
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2586
2587
  			.priority = sc->priority,
  		};
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2588
  		unsigned long node_lru_pages = 0;
694fbc0fe   Andrew Morton   revert "memcg: en...
2589
  		struct mem_cgroup *memcg;
3e7d34497   Mel Gorman   mm: vmscan: recla...
2590

d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
2591
  		memset(&sc->nr, 0, sizeof(sc->nr));
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2592
2593
  		nr_reclaimed = sc->nr_reclaimed;
  		nr_scanned = sc->nr_scanned;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2594

694fbc0fe   Andrew Morton   revert "memcg: en...
2595
2596
  		memcg = mem_cgroup_iter(root, NULL, &reclaim);
  		do {
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
2597
  			unsigned long lru_pages;
8e8ae6452   Johannes Weiner   mm: memcontrol: h...
2598
  			unsigned long reclaimed;
cb731d6c6   Vladimir Davydov   vmscan: per memor...
2599
  			unsigned long scanned;
5660048cc   Johannes Weiner   mm: move memcg hi...
2600

bf8d5d52f   Roman Gushchin   memcg: introduce ...
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
  			switch (mem_cgroup_protected(root, memcg)) {
  			case MEMCG_PROT_MIN:
  				/*
  				 * Hard protection.
  				 * If there is no reclaimable memory, OOM.
  				 */
  				continue;
  			case MEMCG_PROT_LOW:
  				/*
  				 * Soft protection.
  				 * Respect the protection only as long as
  				 * there is an unprotected supply
  				 * of reclaimable memory from other cgroups.
  				 */
d6622f636   Yisheng Xie   mm/vmscan: more r...
2615
2616
  				if (!sc->memcg_low_reclaim) {
  					sc->memcg_low_skipped = 1;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2617
  					continue;
d6622f636   Yisheng Xie   mm/vmscan: more r...
2618
  				}
e27be240d   Johannes Weiner   mm: memcg: make s...
2619
  				memcg_memory_event(memcg, MEMCG_LOW);
bf8d5d52f   Roman Gushchin   memcg: introduce ...
2620
2621
2622
  				break;
  			case MEMCG_PROT_NONE:
  				break;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2623
  			}
8e8ae6452   Johannes Weiner   mm: memcontrol: h...
2624
  			reclaimed = sc->nr_reclaimed;
cb731d6c6   Vladimir Davydov   vmscan: per memor...
2625
  			scanned = sc->nr_scanned;
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2626
2627
  			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
  			node_lru_pages += lru_pages;
f16015fbf   Johannes Weiner   mm: vmscan: disti...
2628

aeed1d325   Vladimir Davydov   mm/vmscan.c: gene...
2629
2630
  			shrink_slab(sc->gfp_mask, pgdat->node_id,
  				    memcg, sc->priority);
cb731d6c6   Vladimir Davydov   vmscan: per memor...
2631

8e8ae6452   Johannes Weiner   mm: memcontrol: h...
2632
2633
2634
2635
  			/* Record the group's reclaim efficiency */
  			vmpressure(sc->gfp_mask, memcg, false,
  				   sc->nr_scanned - scanned,
  				   sc->nr_reclaimed - reclaimed);
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2636
  			/*
a394cb8ee   Michal Hocko   memcg,vmscan: do ...
2637
2638
  			 * Direct reclaim and kswapd have to scan all memory
  			 * cgroups to fulfill the overall scan target for the
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2639
  			 * node.
a394cb8ee   Michal Hocko   memcg,vmscan: do ...
2640
2641
2642
2643
2644
  			 *
  			 * Limit reclaim, on the other hand, only cares about
  			 * nr_to_reclaim pages to be reclaimed and it will
  			 * retry with decreasing priority if one round over the
  			 * whole hierarchy is not sufficient.
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2645
  			 */
a394cb8ee   Michal Hocko   memcg,vmscan: do ...
2646
2647
  			if (!global_reclaim(sc) &&
  					sc->nr_reclaimed >= sc->nr_to_reclaim) {
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2648
2649
2650
  				mem_cgroup_iter_break(root, memcg);
  				break;
  			}
241994ed8   Johannes Weiner   mm: memcontrol: d...
2651
  		} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
70ddf637e   Anton Vorontsov   memcg: add memory...
2652

cb731d6c6   Vladimir Davydov   vmscan: per memor...
2653
2654
2655
  		if (reclaim_state) {
  			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
  			reclaim_state->reclaimed_slab = 0;
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
2656
  		}
8e8ae6452   Johannes Weiner   mm: memcontrol: h...
2657
2658
  		/* Record the subtree's reclaim efficiency */
  		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
70ddf637e   Anton Vorontsov   memcg: add memory...
2659
2660
  			   sc->nr_scanned - nr_scanned,
  			   sc->nr_reclaimed - nr_reclaimed);
2344d7e44   Johannes Weiner   mm: vmscan: remov...
2661
2662
  		if (sc->nr_reclaimed - nr_reclaimed)
  			reclaimable = true;
e3c1ac586   Andrey Ryabinin   mm/vmscan: don't ...
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
  		if (current_is_kswapd()) {
  			/*
  			 * If reclaim is isolating dirty pages under writeback,
  			 * it implies that the long-lived page allocation rate
  			 * is exceeding the page laundering rate. Either the
  			 * global limits are not being effective at throttling
  			 * processes due to the page distribution throughout
  			 * zones or there is heavy usage of a slow backing
  			 * device. The only option is to throttle from reclaim
  			 * context which is not ideal as there is no guarantee
  			 * the dirtying process is throttled in the same way
  			 * balance_dirty_pages() manages.
  			 *
  			 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
  			 * count the number of pages under writeback that are
  			 * flagged for immediate reclaim and stall if any are encountered
  			 * in the nr_immediate check below.
  			 */
  			if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
  				set_bit(PGDAT_WRITEBACK, &pgdat->flags);
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
2683

d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
  			/*
  			 * Tag a node as congested if all the dirty pages
  			 * scanned were backed by a congested BDI and
  			 * wait_iff_congested will stall.
  			 */
  			if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
  				set_bit(PGDAT_CONGESTED, &pgdat->flags);
  
  			/* Allow kswapd to start writing pages during reclaim. */
  			if (sc->nr.unqueued_dirty == sc->nr.file_taken)
  				set_bit(PGDAT_DIRTY, &pgdat->flags);
  
  			/*
  			 * If kswapd scans pages marked for immediate
  			 * reclaim and under writeback (nr_immediate), it
  			 * implies that pages are cycling through the LRU
  			 * faster than they are written so also forcibly stall.
  			 */
  			if (sc->nr.immediate)
  				congestion_wait(BLK_RW_ASYNC, HZ/10);
  		}
  
  		/*
e3c1ac586   Andrey Ryabinin   mm/vmscan: don't ...
2707
2708
2709
2710
2711
2712
2713
2714
  		 * Legacy memcg will stall in page writeback so avoid forcibly
  		 * stalling in wait_iff_congested().
  		 */
  		if (!global_reclaim(sc) && sane_reclaim(sc) &&
  		    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
  			set_memcg_congestion(pgdat, root, true);
  
  		/*
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
2715
2716
2717
2718
2719
2720
  		 * Stall direct reclaim for IO completions if the underlying
  		 * BDIs and the node are congested. Allow kswapd to continue until it
  		 * starts encountering unqueued dirty pages or cycling through
  		 * the LRU too quickly.
  		 */
  		if (!sc->hibernation_mode && !current_is_kswapd() &&
e3c1ac586   Andrey Ryabinin   mm/vmscan: don't ...
2721
2722
  		   current_may_throttle() && pgdat_memcg_congested(pgdat, root))
  			wait_iff_congested(BLK_RW_ASYNC, HZ/10);
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
2723

a9dd0a831   Mel Gorman   mm, vmscan: make ...
2724
  	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2725
  					 sc->nr_scanned - nr_scanned, sc));
2344d7e44   Johannes Weiner   mm: vmscan: remov...
2726

c73322d09   Johannes Weiner   mm: fix 100% CPU ...
2727
2728
2729
2730
2731
2732
2733
2734
  	/*
  	 * Kswapd gives up on balancing particular nodes after too
  	 * many failures to reclaim anything from them and goes to
  	 * sleep. On reclaim progress, reset the failure counter. A
  	 * successful direct reclaim run will revive a dormant kswapd.
  	 */
  	if (reclaimable)
  		pgdat->kswapd_failures = 0;
2344d7e44   Johannes Weiner   mm: vmscan: remov...
2735
  	return reclaimable;
f16015fbf   Johannes Weiner   mm: vmscan: disti...
2736
  }
53853e2d2   Vlastimil Babka   mm, compaction: d...
2737
  /*
fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2738
2739
2740
   * Returns true if compaction should go ahead for a costly-order request, or
   * the allocation would already succeed without compaction. Return false if we
   * should reclaim first.
53853e2d2   Vlastimil Babka   mm, compaction: d...
2741
   */
4f588331b   Mel Gorman   mm, vmscan: avoid...
2742
  static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
fe4b1b244   Mel Gorman   mm: vmscan: when ...
2743
  {
31483b6ad   Mel Gorman   mm, vmscan: remov...
2744
  	unsigned long watermark;
fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2745
  	enum compact_result suitable;
fe4b1b244   Mel Gorman   mm: vmscan: when ...
2746

fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2747
2748
2749
2750
2751
2752
2753
  	suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
  	if (suitable == COMPACT_SUCCESS)
  		/* Allocation should succeed already. Don't reclaim. */
  		return true;
  	if (suitable == COMPACT_SKIPPED)
  		/* Compaction cannot yet proceed. Do reclaim. */
  		return false;
fe4b1b244   Mel Gorman   mm: vmscan: when ...
2754

53853e2d2   Vlastimil Babka   mm, compaction: d...
2755
  	/*
fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2756
2757
2758
2759
2760
2761
2762
  	 * Compaction is already possible, but it takes time to run and there
  	 * are potentially other callers using the pages just freed. So proceed
  	 * with reclaim to make a buffer of free pages available to give
  	 * compaction a reasonable chance of completing and allocating the page.
  	 * Note that we won't actually reclaim the whole buffer in one attempt
  	 * as the target watermark in should_continue_reclaim() is lower. But if
  	 * we are already above the high+gap watermark, don't reclaim at all.
53853e2d2   Vlastimil Babka   mm, compaction: d...
2763
  	 */
fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2764
  	watermark = high_wmark_pages(zone) + compact_gap(sc->order);
fe4b1b244   Mel Gorman   mm: vmscan: when ...
2765

fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2766
  	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
fe4b1b244   Mel Gorman   mm: vmscan: when ...
2767
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2768
2769
2770
2771
2772
  /*
   * This is the direct reclaim path, for page-allocating processes.  We only
   * try to reclaim pages from zones which will satisfy the caller's allocation
   * request.
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2773
2774
2775
   * If a zone is deemed to be full of pinned pages then just give it a light
   * scan and then give up on it.
   */
0a0337e0d   Michal Hocko   mm, oom: rework o...
2776
  static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2777
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
2778
  	struct zoneref *z;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2779
  	struct zone *zone;
0608f43da   Andrew Morton   revert "memcg, vm...
2780
2781
  	unsigned long nr_soft_reclaimed;
  	unsigned long nr_soft_scanned;
619d0d76c   Weijie Yang   mm/vmscan: restor...
2782
  	gfp_t orig_mask;
79dafcdca   Mel Gorman   mm, vmscan: by de...
2783
  	pg_data_t *last_pgdat = NULL;
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
2784

cc715d99e   Mel Gorman   mm: vmscan: forci...
2785
2786
2787
2788
2789
  	/*
  	 * If the number of buffer_heads in the machine exceeds the maximum
  	 * allowed level, force direct reclaim to scan the highmem zone as
  	 * highmem pages could be pinning lowmem pages storing buffer_heads
  	 */
619d0d76c   Weijie Yang   mm/vmscan: restor...
2790
  	orig_mask = sc->gfp_mask;
b2e18757f   Mel Gorman   mm, vmscan: begin...
2791
  	if (buffer_heads_over_limit) {
cc715d99e   Mel Gorman   mm: vmscan: forci...
2792
  		sc->gfp_mask |= __GFP_HIGHMEM;
4f588331b   Mel Gorman   mm, vmscan: avoid...
2793
  		sc->reclaim_idx = gfp_zone(sc->gfp_mask);
b2e18757f   Mel Gorman   mm, vmscan: begin...
2794
  	}
cc715d99e   Mel Gorman   mm: vmscan: forci...
2795

d4debc66d   Mel Gorman   vmscan: remove un...
2796
  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
b2e18757f   Mel Gorman   mm, vmscan: begin...
2797
  					sc->reclaim_idx, sc->nodemask) {
b2e18757f   Mel Gorman   mm, vmscan: begin...
2798
  		/*
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
2799
2800
2801
  		 * Take care that memory controller reclaiming has only a
  		 * small influence on the global LRU.
  		 */
89b5fae53   Johannes Weiner   mm: vmscan: disti...
2802
  		if (global_reclaim(sc)) {
344736f29   Vladimir Davydov   cpuset: simplify ...
2803
2804
  			if (!cpuset_zone_allowed(zone,
  						 GFP_KERNEL | __GFP_HARDWALL))
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
2805
  				continue;
65ec02cb9   Vladimir Davydov   mm: vmscan: move ...
2806

0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
  			/*
  			 * If we already have plenty of memory free for
  			 * compaction in this zone, don't free any more.
  			 * Even though compaction is invoked for any
  			 * non-zero order, only frequent costly order
  			 * reclamation is disruptive enough to become a
  			 * noticeable problem, like transparent huge
  			 * page allocations.
  			 */
  			if (IS_ENABLED(CONFIG_COMPACTION) &&
  			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
4f588331b   Mel Gorman   mm, vmscan: avoid...
2818
  			    compaction_ready(zone, sc)) {
0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2819
2820
  				sc->compaction_ready = true;
  				continue;
e0887c19b   Rik van Riel   vmscan: limit dir...
2821
  			}
0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2822

0608f43da   Andrew Morton   revert "memcg, vm...
2823
  			/*
79dafcdca   Mel Gorman   mm, vmscan: by de...
2824
2825
2826
2827
2828
2829
2830
2831
2832
  			 * Shrink each node in the zonelist once. If the
  			 * zonelist is ordered by zone (not the default) then a
  			 * node may be shrunk multiple times but in that case
  			 * the user prefers lower zones being preserved.
  			 */
  			if (zone->zone_pgdat == last_pgdat)
  				continue;
  
  			/*
0608f43da   Andrew Morton   revert "memcg, vm...
2833
2834
2835
2836
2837
2838
  			 * This steals pages from memory cgroups over their soft limit
  			 * and returns the number of reclaimed pages and
  			 * scanned pages. This works for global memory pressure
  			 * and balancing, not for a memcg's limit.
  			 */
  			nr_soft_scanned = 0;
ef8f23279   Mel Gorman   mm, memcg: move m...
2839
  			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
0608f43da   Andrew Morton   revert "memcg, vm...
2840
2841
2842
2843
  						sc->order, sc->gfp_mask,
  						&nr_soft_scanned);
  			sc->nr_reclaimed += nr_soft_reclaimed;
  			sc->nr_scanned += nr_soft_scanned;
ac34a1a3c   KAMEZAWA Hiroyuki   memcg: fix direct...
2844
  			/* need some check to avoid more shrink_node() calls */
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
2845
  		}
408d85441   Nick Piggin   [PATCH] oom: use ...
2846

79dafcdca   Mel Gorman   mm, vmscan: by de...
2847
2848
2849
2850
  		/* See comment about same check for global reclaim above */
  		if (zone->zone_pgdat == last_pgdat)
  			continue;
  		last_pgdat = zone->zone_pgdat;
970a39a36   Mel Gorman   mm, vmscan: avoid...
2851
  		shrink_node(zone->zone_pgdat, sc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2852
  	}
e0c23279c   Mel Gorman   vmscan: abort rec...
2853

65ec02cb9   Vladimir Davydov   mm: vmscan: move ...
2854
  	/*
619d0d76c   Weijie Yang   mm/vmscan: restor...
2855
2856
2857
2858
  	 * Restore the original mask to avoid impacting the caller if we
  	 * promoted it to __GFP_HIGHMEM.
  	 */
  	sc->gfp_mask = orig_mask;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2859
  }
4f98a2fee   Rik van Riel   vmscan: split LRU...
2860

2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
  static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
  {
  	struct mem_cgroup *memcg;
  
  	memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
  	do {
  		unsigned long refaults;
  		struct lruvec *lruvec;
  
  		if (memcg)
ccda7f436   Johannes Weiner   mm: memcontrol: u...
2871
  			refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2872
2873
2874
2875
2876
2877
2878
  		else
  			refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
  
  		lruvec = mem_cgroup_lruvec(pgdat, memcg);
  		lruvec->refaults = refaults;
  	} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2879
2880
2881
2882
2883
2884
2885
2886
  /*
   * This is the main entry point to direct page reclaim.
   *
   * If a full scan of the inactive list fails to free enough memory then we
   * are "out of memory" and something needs to be killed.
   *
   * If the caller is !__GFP_FS then the probability of a failure is reasonably
   * high - the zone may be full of dirty or under-writeback pages, which this
5b0830cb9   Jens Axboe   writeback: get ri...
2887
2888
2889
2890
   * caller can't do much about.  We kick the writeback threads and take explicit
   * naps in the hope that some of these pages can be written.  But if the
   * allocating task holds filesystem locks which prevent writeout this might not
   * work, and the allocation attempt will fail.
a41f24ea9   Nishanth Aravamudan   page allocator: s...
2891
2892
2893
   *
   * returns:	0, if no pages reclaimed
   * 		else, the number of pages reclaimed
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2894
   */
dac1d27bc   Mel Gorman   mm: use zonelists...
2895
  static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
3115cd914   Vladimir Davydov   mm: vmscan: remov...
2896
  					  struct scan_control *sc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2897
  {
241994ed8   Johannes Weiner   mm: memcontrol: d...
2898
  	int initial_priority = sc->priority;
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2899
2900
2901
  	pg_data_t *last_pgdat;
  	struct zoneref *z;
  	struct zone *zone;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2902
  retry:
873b47717   Keika Kobayashi   per-task-delay-ac...
2903
  	delayacct_freepages_start();
89b5fae53   Johannes Weiner   mm: vmscan: disti...
2904
  	if (global_reclaim(sc))
7cc30fcfd   Mel Gorman   mm: vmstat: accou...
2905
  		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2906

9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
2907
  	do {
70ddf637e   Anton Vorontsov   memcg: add memory...
2908
2909
  		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
  				sc->priority);
66e1707bc   Balbir Singh   Memory controller...
2910
  		sc->nr_scanned = 0;
0a0337e0d   Michal Hocko   mm, oom: rework o...
2911
  		shrink_zones(zonelist, sc);
c6a8a8c58   KOSAKI Motohiro   vmscan: recalcula...
2912

bb21c7ce1   KOSAKI Motohiro   vmscan: fix do_tr...
2913
  		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2914
2915
2916
2917
  			break;
  
  		if (sc->compaction_ready)
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2918
2919
  
  		/*
0e50ce3b5   Minchan Kim   mm: use up free s...
2920
2921
2922
2923
2924
  		 * If we're having trouble reclaiming, start doing
  		 * writepage even in laptop mode.
  		 */
  		if (sc->priority < DEF_PRIORITY - 2)
  			sc->may_writepage = 1;
0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2925
  	} while (--sc->priority >= 0);
bb21c7ce1   KOSAKI Motohiro   vmscan: fix do_tr...
2926

2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2927
2928
2929
2930
2931
2932
2933
  	last_pgdat = NULL;
  	for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
  					sc->nodemask) {
  		if (zone->zone_pgdat == last_pgdat)
  			continue;
  		last_pgdat = zone->zone_pgdat;
  		snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
e3c1ac586   Andrey Ryabinin   mm/vmscan: don't ...
2934
  		set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2935
  	}
873b47717   Keika Kobayashi   per-task-delay-ac...
2936
  	delayacct_freepages_end();
bb21c7ce1   KOSAKI Motohiro   vmscan: fix do_tr...
2937
2938
  	if (sc->nr_reclaimed)
  		return sc->nr_reclaimed;
0cee34fd7   Mel Gorman   mm: vmscan: check...
2939
  	/* Aborted reclaim to try compaction? don't OOM, then */
0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2940
  	if (sc->compaction_ready)
7335084d4   Mel Gorman   mm: vmscan: do no...
2941
  		return 1;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2942
  	/* Untapped cgroup reserves?  Don't OOM, retry. */
d6622f636   Yisheng Xie   mm/vmscan: more r...
2943
  	if (sc->memcg_low_skipped) {
241994ed8   Johannes Weiner   mm: memcontrol: d...
2944
  		sc->priority = initial_priority;
d6622f636   Yisheng Xie   mm/vmscan: more r...
2945
2946
  		sc->memcg_low_reclaim = 1;
  		sc->memcg_low_skipped = 0;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2947
2948
  		goto retry;
  	}
bb21c7ce1   KOSAKI Motohiro   vmscan: fix do_tr...
2949
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2950
  }
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
2951
  static bool allow_direct_reclaim(pg_data_t *pgdat)
5515061d2   Mel Gorman   mm: throttle dire...
2952
2953
2954
2955
2956
2957
  {
  	struct zone *zone;
  	unsigned long pfmemalloc_reserve = 0;
  	unsigned long free_pages = 0;
  	int i;
  	bool wmark_ok;
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
2958
2959
  	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
  		return true;
5515061d2   Mel Gorman   mm: throttle dire...
2960
2961
  	for (i = 0; i <= ZONE_NORMAL; i++) {
  		zone = &pgdat->node_zones[i];
d450abd81   Johannes Weiner   mm: fix check for...
2962
2963
2964
2965
  		if (!managed_zone(zone))
  			continue;
  
  		if (!zone_reclaimable_pages(zone))
675becce1   Mel Gorman   mm: vmscan: do no...
2966
  			continue;
5515061d2   Mel Gorman   mm: throttle dire...
2967
2968
2969
  		pfmemalloc_reserve += min_wmark_pages(zone);
  		free_pages += zone_page_state(zone, NR_FREE_PAGES);
  	}
675becce1   Mel Gorman   mm: vmscan: do no...
2970
2971
2972
  	/* If there are no reserves (unexpected config) then do not throttle */
  	if (!pfmemalloc_reserve)
  		return true;
5515061d2   Mel Gorman   mm: throttle dire...
2973
2974
2975
2976
  	wmark_ok = free_pages > pfmemalloc_reserve / 2;
  
  	/* kswapd must be awake if processes are being throttled */
  	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
38087d9b0   Mel Gorman   mm, vmscan: simpl...
2977
  		pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
5515061d2   Mel Gorman   mm: throttle dire...
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
  						(enum zone_type)ZONE_NORMAL);
  		wake_up_interruptible(&pgdat->kswapd_wait);
  	}
  
  	return wmark_ok;
  }
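
The throttle decision above reduces to one comparison across the node's lower zones: direct reclaimers may proceed only while free pages exceed half the sum of those zones' min watermarks. A stand-alone sketch with made-up zone sizes:

  #include <stdbool.h>
  #include <stdio.h>

  int main(void)
  {
  	/* hypothetical min watermarks and free pages, ZONE_DMA..ZONE_NORMAL */
  	unsigned long min_wmark[] = { 128, 4096, 16384 };
  	unsigned long nr_free[]   = {  64, 1500,  6000 };

  	unsigned long pfmemalloc_reserve = 0;
  	unsigned long free_pages = 0;
  	int i;

  	for (i = 0; i < 3; i++) {
  		pfmemalloc_reserve += min_wmark[i];
  		free_pages += nr_free[i];
  	}

  	/* same test as allow_direct_reclaim(): throttle below half the reserve */
  	bool wmark_ok = free_pages > pfmemalloc_reserve / 2;

  	printf("free %lu, reserve %lu -> %s\n", free_pages, pfmemalloc_reserve,
  	       wmark_ok ? "proceed with direct reclaim"
  			: "throttle and wait for kswapd");
  	return 0;
  }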
  
  /*
   * Throttle direct reclaimers if backing storage is backed by the network
   * and the PFMEMALLOC reserve for the preferred node is getting dangerously
   * depleted. kswapd will continue to make progress and wake the processes
50694c28f   Mel Gorman   mm: vmscan: check...
2989
2990
2991
2992
   * when the low watermark is reached.
   *
   * Returns true if a fatal signal was delivered during throttling. If this
   * happens, the page allocator should not consider triggering the OOM killer.
5515061d2   Mel Gorman   mm: throttle dire...
2993
   */
50694c28f   Mel Gorman   mm: vmscan: check...
2994
  static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
5515061d2   Mel Gorman   mm: throttle dire...
2995
2996
  					nodemask_t *nodemask)
  {
675becce1   Mel Gorman   mm: vmscan: do no...
2997
  	struct zoneref *z;
5515061d2   Mel Gorman   mm: throttle dire...
2998
  	struct zone *zone;
675becce1   Mel Gorman   mm: vmscan: do no...
2999
  	pg_data_t *pgdat = NULL;
5515061d2   Mel Gorman   mm: throttle dire...
3000
3001
3002
3003
3004
3005
3006
3007
3008
  
  	/*
  	 * Kernel threads should not be throttled as they may be indirectly
  	 * responsible for cleaning pages necessary for reclaim to make forward
  	 * progress. kjournald for example may enter direct reclaim while
  	 * committing a transaction where throttling it could force other
  	 * processes to block on log_wait_commit().
  	 */
  	if (current->flags & PF_KTHREAD)
50694c28f   Mel Gorman   mm: vmscan: check...
3009
3010
3011
3012
3013
3014
3015
3016
  		goto out;
  
  	/*
  	 * If a fatal signal is pending, this process should not throttle.
  	 * It should return quickly so it can exit and free its memory
  	 */
  	if (fatal_signal_pending(current))
  		goto out;
5515061d2   Mel Gorman   mm: throttle dire...
3017

675becce1   Mel Gorman   mm: vmscan: do no...
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
  	/*
  	 * Check if the pfmemalloc reserves are ok by finding the first node
  	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
  	 * GFP_KERNEL will be required for allocating network buffers when
  	 * swapping over the network so ZONE_HIGHMEM is unusable.
  	 *
  	 * Throttling is based on the first usable node and throttled processes
  	 * wait on a queue until kswapd makes progress and wakes them. There
  	 * is an affinity then between processes waking up and where reclaim
  	 * progress has been made assuming the process wakes on the same node.
  	 * More importantly, processes running on remote nodes will not compete
  	 * for remote pfmemalloc reserves and processes on different nodes
  	 * should make reasonable progress.
  	 */
  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
17636faad   Michael S. Tsirkin   mm/vmscan: fix hi...
3033
  					gfp_zone(gfp_mask), nodemask) {
675becce1   Mel Gorman   mm: vmscan: do no...
3034
3035
3036
3037
3038
  		if (zone_idx(zone) > ZONE_NORMAL)
  			continue;
  
  		/* Throttle based on the first usable node */
  		pgdat = zone->zone_pgdat;
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3039
  		if (allow_direct_reclaim(pgdat))
675becce1   Mel Gorman   mm: vmscan: do no...
3040
3041
3042
3043
3044
3045
  			goto out;
  		break;
  	}
  
  	/* If no zone was usable by the allocation flags then do not throttle */
  	if (!pgdat)
50694c28f   Mel Gorman   mm: vmscan: check...
3046
  		goto out;
5515061d2   Mel Gorman   mm: throttle dire...
3047

68243e76e   Mel Gorman   mm: account for t...
3048
3049
  	/* Account for the throttling */
  	count_vm_event(PGSCAN_DIRECT_THROTTLE);
5515061d2   Mel Gorman   mm: throttle dire...
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
  	/*
  	 * If the caller cannot enter the filesystem, it's possible that it
  	 * is due to the caller holding an FS lock or performing a journal
  	 * transaction in the case of a filesystem like ext[3|4]. In this case,
  	 * it is not safe to block on pfmemalloc_wait as kswapd could be
  	 * blocked waiting on the same lock. Instead, throttle for up to a
  	 * second before continuing.
  	 */
  	if (!(gfp_mask & __GFP_FS)) {
  		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3060
  			allow_direct_reclaim(pgdat), HZ);
50694c28f   Mel Gorman   mm: vmscan: check...
3061
3062
  
  		goto check_pending;
5515061d2   Mel Gorman   mm: throttle dire...
3063
3064
3065
3066
  	}
  
  	/* Throttle until kswapd wakes the process */
  	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3067
  		allow_direct_reclaim(pgdat));
50694c28f   Mel Gorman   mm: vmscan: check...
3068
3069
3070
3071
3072
3073
3074
  
  check_pending:
  	if (fatal_signal_pending(current))
  		return true;
  
  out:
  	return false;
5515061d2   Mel Gorman   mm: throttle dire...
3075
  }
dac1d27bc   Mel Gorman   mm: use zonelists...
3076
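  /*
   * try_to_free_pages() is the main entry point for direct reclaim from the
   * page allocator. The caller may first be throttled if the pfmemalloc
   * reserves are depleted; reclaim itself is then done by
   * do_try_to_free_pages() for the given zonelist and gfp constraints.
   */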
  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
327c0e968   KAMEZAWA Hiroyuki   vmscan: fix it to...
3077
  				gfp_t gfp_mask, nodemask_t *nodemask)
66e1707bc   Balbir Singh   Memory controller...
3078
  {
33906bc5c   Mel Gorman   vmscan: tracing: ...
3079
  	unsigned long nr_reclaimed;
66e1707bc   Balbir Singh   Memory controller...
3080
  	struct scan_control sc = {
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3081
  		.nr_to_reclaim = SWAP_CLUSTER_MAX,
f2f43e566   Nick Desaulniers   mm/vmscan.c: fix ...
3082
  		.gfp_mask = current_gfp_context(gfp_mask),
b2e18757f   Mel Gorman   mm, vmscan: begin...
3083
  		.reclaim_idx = gfp_zone(gfp_mask),
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3084
3085
3086
  		.order = order,
  		.nodemask = nodemask,
  		.priority = DEF_PRIORITY,
66e1707bc   Balbir Singh   Memory controller...
3087
  		.may_writepage = !laptop_mode,
a6dc60f89   Johannes Weiner   vmscan: rename sc...
3088
  		.may_unmap = 1,
2e2e42598   KOSAKI Motohiro   vmscan,memcg: rei...
3089
  		.may_swap = 1,
66e1707bc   Balbir Singh   Memory controller...
3090
  	};
5515061d2   Mel Gorman   mm: throttle dire...
3091
  	/*
bb451fdf3   Greg Thelen   mm/vmscan.c: cond...
3092
3093
3094
3095
3096
3097
3098
3099
  	 * scan_control uses s8 fields for order, priority, and reclaim_idx.
  	 * Confirm they are large enough for max values.
  	 */
  	BUILD_BUG_ON(MAX_ORDER > S8_MAX);
  	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
  	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
  
  	/*
50694c28f   Mel Gorman   mm: vmscan: check...
3100
3101
3102
  	 * Do not enter reclaim if a fatal signal was delivered while throttled.
  	 * 1 is returned so that the page allocator does not OOM kill at this
  	 * point.
5515061d2   Mel Gorman   mm: throttle dire...
3103
  	 */
f2f43e566   Nick Desaulniers   mm/vmscan.c: fix ...
3104
  	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
5515061d2   Mel Gorman   mm: throttle dire...
3105
  		return 1;
33906bc5c   Mel Gorman   vmscan: tracing: ...
3106
3107
  	trace_mm_vmscan_direct_reclaim_begin(order,
  				sc.may_writepage,
f2f43e566   Nick Desaulniers   mm/vmscan.c: fix ...
3108
  				sc.gfp_mask,
e5146b12e   Mel Gorman   mm, vmscan: add c...
3109
  				sc.reclaim_idx);
33906bc5c   Mel Gorman   vmscan: tracing: ...
3110

3115cd914   Vladimir Davydov   mm: vmscan: remov...
3111
  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
33906bc5c   Mel Gorman   vmscan: tracing: ...
3112
3113
3114
3115
  
  	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
  
  	return nr_reclaimed;
66e1707bc   Balbir Singh   Memory controller...
3116
  }
c255a4580   Andrew Morton   memcg: rename con...
3117
  #ifdef CONFIG_MEMCG
66e1707bc   Balbir Singh   Memory controller...
3118

a9dd0a831   Mel Gorman   mm, vmscan: make ...
3119
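  /*
   * Reclaim pages from a single memcg on one node on behalf of soft limit
   * reclaim. The scan is deliberately not priority-limited (see the NOTE
   * below); the number of pages scanned is reported via @nr_scanned and the
   * number of pages reclaimed is returned.
   */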
  unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
4e4169535   Balbir Singh   memory controller...
3120
  						gfp_t gfp_mask, bool noswap,
ef8f23279   Mel Gorman   mm, memcg: move m...
3121
  						pg_data_t *pgdat,
0ae5e89c6   Ying Han   memcg: count the ...
3122
  						unsigned long *nr_scanned)
4e4169535   Balbir Singh   memory controller...
3123
3124
  {
  	struct scan_control sc = {
b8f5c5664   KOSAKI Motohiro   memcg: sc.nr_to_r...
3125
  		.nr_to_reclaim = SWAP_CLUSTER_MAX,
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3126
  		.target_mem_cgroup = memcg,
4e4169535   Balbir Singh   memory controller...
3127
3128
  		.may_writepage = !laptop_mode,
  		.may_unmap = 1,
b2e18757f   Mel Gorman   mm, vmscan: begin...
3129
  		.reclaim_idx = MAX_NR_ZONES - 1,
4e4169535   Balbir Singh   memory controller...
3130
  		.may_swap = !noswap,
4e4169535   Balbir Singh   memory controller...
3131
  	};
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
3132
  	unsigned long lru_pages;
0ae5e89c6   Ying Han   memcg: count the ...
3133

4e4169535   Balbir Singh   memory controller...
3134
3135
  	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
  			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3136

9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
3137
  	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3138
  						      sc.may_writepage,
e5146b12e   Mel Gorman   mm, vmscan: add c...
3139
3140
  						      sc.gfp_mask,
  						      sc.reclaim_idx);
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3141

4e4169535   Balbir Singh   memory controller...
3142
3143
3144
  	/*
  	 * NOTE: Although we can get the priority field, using it
  	 * here is not a good idea, since it limits the pages we can scan.
a9dd0a831   Mel Gorman   mm, vmscan: make ...
3145
  	 * If we don't reclaim here, the shrink_node from balance_pgdat
4e4169535   Balbir Singh   memory controller...
3146
3147
3148
  	 * will pick up pages from other mem cgroups as well. We hack
  	 * the priority and make it zero.
  	 */
ef8f23279   Mel Gorman   mm, memcg: move m...
3149
  	shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3150
3151
  
  	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
0ae5e89c6   Ying Han   memcg: count the ...
3152
  	*nr_scanned = sc.nr_scanned;
4e4169535   Balbir Singh   memory controller...
3153
3154
  	return sc.nr_reclaimed;
  }
72835c86c   Johannes Weiner   mm: unify remaini...
3155
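  /*
   * Entry point for memcg limit reclaim: try to reclaim at least @nr_pages
   * pages from the @memcg hierarchy. The scan starts from a victim node
   * chosen by mem_cgroup_select_victim_node() and the number of pages
   * reclaimed is returned.
   */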
  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
3156
  					   unsigned long nr_pages,
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3157
  					   gfp_t gfp_mask,
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
3158
  					   bool may_swap)
66e1707bc   Balbir Singh   Memory controller...
3159
  {
4e4169535   Balbir Singh   memory controller...
3160
  	struct zonelist *zonelist;
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3161
  	unsigned long nr_reclaimed;
889976dbc   Ying Han   memcg: reclaim me...
3162
  	int nid;
499118e96   Vlastimil Babka   mm: introduce mem...
3163
  	unsigned int noreclaim_flag;
66e1707bc   Balbir Singh   Memory controller...
3164
  	struct scan_control sc = {
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
3165
  		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
7dea19f9e   Michal Hocko   mm: introduce mem...
3166
  		.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
a09ed5e00   Ying Han   vmscan: change sh...
3167
  				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
b2e18757f   Mel Gorman   mm, vmscan: begin...
3168
  		.reclaim_idx = MAX_NR_ZONES - 1,
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3169
3170
3171
3172
  		.target_mem_cgroup = memcg,
  		.priority = DEF_PRIORITY,
  		.may_writepage = !laptop_mode,
  		.may_unmap = 1,
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
3173
  		.may_swap = may_swap,
a09ed5e00   Ying Han   vmscan: change sh...
3174
  	};
66e1707bc   Balbir Singh   Memory controller...
3175

889976dbc   Ying Han   memcg: reclaim me...
3176
3177
3178
3179
3180
  	/*
  	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
  	 * care which node we get pages from, so the node where we start the
  	 * scan does not need to be the current node.
  	 */
72835c86c   Johannes Weiner   mm: unify remaini...
3181
  	nid = mem_cgroup_select_victim_node(memcg);
889976dbc   Ying Han   memcg: reclaim me...
3182

c9634cf01   Aneesh Kumar K.V   mm: use zonelist ...
3183
  	zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3184
3185
3186
  
  	trace_mm_vmscan_memcg_reclaim_begin(0,
  					    sc.may_writepage,
e5146b12e   Mel Gorman   mm, vmscan: add c...
3187
3188
  					    sc.gfp_mask,
  					    sc.reclaim_idx);
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3189

499118e96   Vlastimil Babka   mm: introduce mem...
3190
  	noreclaim_flag = memalloc_noreclaim_save();
3115cd914   Vladimir Davydov   mm: vmscan: remov...
3191
  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
499118e96   Vlastimil Babka   mm: introduce mem...
3192
  	memalloc_noreclaim_restore(noreclaim_flag);
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3193
3194
3195
3196
  
  	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
  
  	return nr_reclaimed;
66e1707bc   Balbir Singh   Memory controller...
3197
3198
  }
  #endif
1d82de618   Mel Gorman   mm, vmscan: make ...
3199
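  /*
   * Deactivate some active anon pages on every memcg lruvec of this node when
   * the inactive anon list is low, so that anon pages get a chance to be
   * referenced before they are reclaimed. Does nothing if there is no swap.
   */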
  static void age_active_anon(struct pglist_data *pgdat,
ef8f23279   Mel Gorman   mm, memcg: move m...
3200
  				struct scan_control *sc)
f16015fbf   Johannes Weiner   mm: vmscan: disti...
3201
  {
b95a2f2d4   Johannes Weiner   mm: vmscan: conve...
3202
  	struct mem_cgroup *memcg;
f16015fbf   Johannes Weiner   mm: vmscan: disti...
3203

b95a2f2d4   Johannes Weiner   mm: vmscan: conve...
3204
3205
3206
3207
3208
  	if (!total_swap_pages)
  		return;
  
  	memcg = mem_cgroup_iter(NULL, NULL, NULL);
  	do {
ef8f23279   Mel Gorman   mm, memcg: move m...
3209
  		struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
b95a2f2d4   Johannes Weiner   mm: vmscan: conve...
3210

2a2e48854   Johannes Weiner   mm: vmscan: fix I...
3211
  		if (inactive_list_is_low(lruvec, false, memcg, sc, true))
1a93be0e7   Konstantin Khlebnikov   mm/vmscan: push l...
3212
  			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
3213
  					   sc, LRU_ACTIVE_ANON);
b95a2f2d4   Johannes Weiner   mm: vmscan: conve...
3214
3215
3216
  
  		memcg = mem_cgroup_iter(NULL, memcg, NULL);
  	} while (memcg);
f16015fbf   Johannes Weiner   mm: vmscan: disti...
3217
  }
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3218
3219
3220
3221
3222
  /*
   * Returns true if there is an eligible zone balanced for the request order
   * and classzone_idx
   */
  static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
60cefed48   Johannes Weiner   mm: vmscan: fix e...
3223
  {
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3224
3225
3226
  	int i;
  	unsigned long mark = -1;
  	struct zone *zone;
60cefed48   Johannes Weiner   mm: vmscan: fix e...
3227

e716f2eb2   Mel Gorman   mm, vmscan: preve...
3228
3229
  	for (i = 0; i <= classzone_idx; i++) {
  		zone = pgdat->node_zones + i;
6256c6b49   Mel Gorman   mm, vmscan: remov...
3230

e716f2eb2   Mel Gorman   mm, vmscan: preve...
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
  		if (!managed_zone(zone))
  			continue;
  
  		mark = high_wmark_pages(zone);
  		if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
  			return true;
  	}
  
  	/*
  	 * If a node has no populated zone within classzone_idx, it does not
  	 * need balancing by definition. This can happen if a zone-restricted
  	 * allocation tries to wake a remote kswapd.
  	 */
  	if (mark == -1)
  		return true;
  
  	return false;
60cefed48   Johannes Weiner   mm: vmscan: fix e...
3248
  }
631b6e083   Mel Gorman   mm, vmscan: only ...
3249
3250
3251
3252
3253
3254
3255
  /* Clear pgdat state for congested, dirty or under writeback. */
  static void clear_pgdat_congested(pg_data_t *pgdat)
  {
  	clear_bit(PGDAT_CONGESTED, &pgdat->flags);
  	clear_bit(PGDAT_DIRTY, &pgdat->flags);
  	clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
  }
1741c8775   Mel Gorman   mm: kswapd: keep ...
3256
  /*
5515061d2   Mel Gorman   mm: throttle dire...
3257
3258
3259
3260
3261
   * Prepare kswapd for sleeping. This verifies that there are no processes
   * waiting in throttle_direct_reclaim() and that watermarks have been met.
   *
   * Returns true if kswapd is ready to sleep
   */
d9f21d426   Mel Gorman   mm, vmscan: avoid...
3262
  static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
f50de2d38   Mel Gorman   vmscan: have kswa...
3263
  {
5515061d2   Mel Gorman   mm: throttle dire...
3264
  	/*
9e5e36617   Vlastimil Babka   mm, vmscan: preve...
3265
  	 * The throttled processes are normally woken up in balance_pgdat() as
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3266
  	 * soon as allow_direct_reclaim() is true. But there is a potential
9e5e36617   Vlastimil Babka   mm, vmscan: preve...
3267
3268
3269
3270
3271
3272
3273
3274
3275
  	 * race between when kswapd checks the watermarks and a process gets
  	 * throttled. There is also a potential race if processes get
  	 * throttled, kswapd wakes, a large process exits thereby balancing the
  	 * zones, which causes kswapd to exit balance_pgdat() before reaching
  	 * the wake up checks. If kswapd is going to sleep, no process should
  	 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
  	 * the wake up is premature, processes will wake kswapd and get
  	 * throttled again. The difference from wake ups in balance_pgdat() is
  	 * that here we are under prepare_to_wait().
5515061d2   Mel Gorman   mm: throttle dire...
3276
  	 */
9e5e36617   Vlastimil Babka   mm, vmscan: preve...
3277
3278
  	if (waitqueue_active(&pgdat->pfmemalloc_wait))
  		wake_up_all(&pgdat->pfmemalloc_wait);
f50de2d38   Mel Gorman   vmscan: have kswa...
3279

c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3280
3281
3282
  	/* Hopeless node, leave it to direct reclaim */
  	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
  		return true;
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3283
3284
3285
  	if (pgdat_balanced(pgdat, order, classzone_idx)) {
  		clear_pgdat_congested(pgdat);
  		return true;
1d82de618   Mel Gorman   mm, vmscan: make ...
3286
  	}
333b0a459   Shantanu Goel   mm, vmscan: fix z...
3287
  	return false;
f50de2d38   Mel Gorman   vmscan: have kswa...
3288
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3289
  /*
1d82de618   Mel Gorman   mm, vmscan: make ...
3290
3291
   * kswapd shrinks a node's pages that are at or below the highest usable
   * zone that is currently unbalanced.
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3292
3293
   *
   * Returns true if kswapd scanned at least the requested number of pages to
283aba9f9   Mel Gorman   mm: vmscan: block...
3294
3295
   * reclaim or if the lack of progress was due to pages under writeback.
   * This is used to determine if the scanning priority needs to be raised.
75485363c   Mel Gorman   mm: vmscan: limit...
3296
   */
1d82de618   Mel Gorman   mm, vmscan: make ...
3297
  static bool kswapd_shrink_node(pg_data_t *pgdat,
accf62422   Vlastimil Babka   mm, kswapd: repla...
3298
  			       struct scan_control *sc)
75485363c   Mel Gorman   mm: vmscan: limit...
3299
  {
1d82de618   Mel Gorman   mm, vmscan: make ...
3300
3301
  	struct zone *zone;
  	int z;
75485363c   Mel Gorman   mm: vmscan: limit...
3302

1d82de618   Mel Gorman   mm, vmscan: make ...
3303
3304
  	/* Reclaim a number of pages proportional to the number of zones */
  	sc->nr_to_reclaim = 0;
970a39a36   Mel Gorman   mm, vmscan: avoid...
3305
  	for (z = 0; z <= sc->reclaim_idx; z++) {
1d82de618   Mel Gorman   mm, vmscan: make ...
3306
  		zone = pgdat->node_zones + z;
6aa303def   Mel Gorman   mm, vmscan: only ...
3307
  		if (!managed_zone(zone))
1d82de618   Mel Gorman   mm, vmscan: make ...
3308
  			continue;
7c954f6de   Mel Gorman   mm: vmscan: move ...
3309

1d82de618   Mel Gorman   mm, vmscan: make ...
3310
3311
  		sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
  	}
7c954f6de   Mel Gorman   mm: vmscan: move ...
3312
3313
  
  	/*
1d82de618   Mel Gorman   mm, vmscan: make ...
3314
3315
  	 * Historically care was taken to put equal pressure on all zones but
  	 * now pressure is applied based on node LRU order.
7c954f6de   Mel Gorman   mm: vmscan: move ...
3316
  	 */
970a39a36   Mel Gorman   mm, vmscan: avoid...
3317
  	shrink_node(pgdat, sc);
283aba9f9   Mel Gorman   mm: vmscan: block...
3318

7c954f6de   Mel Gorman   mm: vmscan: move ...
3319
  	/*
1d82de618   Mel Gorman   mm, vmscan: make ...
3320
3321
3322
3323
3324
  	 * Fragmentation may mean that the system cannot be rebalanced for
  	 * high-order allocations. If twice the allocation size has been
  	 * reclaimed then recheck watermarks only at order-0 to prevent
  	 * excessive reclaim. Assume that a process that requested a high-order
  	 * allocation can direct reclaim/compact.
7c954f6de   Mel Gorman   mm: vmscan: move ...
3325
  	 */
9861a62c3   Vlastimil Babka   mm, compaction: c...
3326
  	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
1d82de618   Mel Gorman   mm, vmscan: make ...
3327
  		sc->order = 0;
7c954f6de   Mel Gorman   mm: vmscan: move ...
3328

b8e83b942   Mel Gorman   mm: vmscan: flatt...
3329
  	return sc->nr_scanned >= sc->nr_to_reclaim;
75485363c   Mel Gorman   mm: vmscan: limit...
3330
3331
3332
  }
  
  /*
1d82de618   Mel Gorman   mm, vmscan: make ...
3333
3334
3335
   * For kswapd, balance_pgdat() will reclaim pages across a node from zones
   * that are eligible for use by the caller until at least one zone is
   * balanced.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3336
   *
1d82de618   Mel Gorman   mm, vmscan: make ...
3337
   * Returns the order kswapd finished reclaiming at.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3338
3339
   *
   * kswapd scans the zones in the highmem->normal->dma direction.  It skips
418589663   Mel Gorman   page allocator: u...
3340
   * zones which have free_pages > high_wmark_pages(zone), but once a zone is
1d82de618   Mel Gorman   mm, vmscan: make ...
3341
3342
3343
   * found to have free_pages <= high_wmark_pages(zone), any page in that zone
   * or lower is eligible for reclaim until at least one usable zone is
   * balanced.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3344
   */
accf62422   Vlastimil Babka   mm, kswapd: repla...
3345
  static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3346
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3347
  	int i;
0608f43da   Andrew Morton   revert "memcg, vm...
3348
3349
  	unsigned long nr_soft_reclaimed;
  	unsigned long nr_soft_scanned;
1d82de618   Mel Gorman   mm, vmscan: make ...
3350
  	struct zone *zone;
179e96395   Andrew Morton   [PATCH] vmscan: s...
3351
3352
  	struct scan_control sc = {
  		.gfp_mask = GFP_KERNEL,
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3353
  		.order = order,
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3354
  		.priority = DEF_PRIORITY,
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3355
  		.may_writepage = !laptop_mode,
a6dc60f89   Johannes Weiner   vmscan: rename sc...
3356
  		.may_unmap = 1,
2e2e42598   KOSAKI Motohiro   vmscan,memcg: rei...
3357
  		.may_swap = 1,
179e96395   Andrew Morton   [PATCH] vmscan: s...
3358
  	};
93781325d   Omar Sandoval   lockdep: fix fs_r...
3359
3360
  
  	__fs_reclaim_acquire();
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
3361
  	count_vm_event(PAGEOUTRUN);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3362

9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
3363
  	do {
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3364
  		unsigned long nr_reclaimed = sc.nr_reclaimed;
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3365
  		bool raise_priority = true;
93781325d   Omar Sandoval   lockdep: fix fs_r...
3366
  		bool ret;
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3367

84c7a7771   Mel Gorman   mm, vmscan: Have ...
3368
  		sc.reclaim_idx = classzone_idx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3369

86c79f6b5   Mel Gorman   mm: vmscan: do no...
3370
  		/*
84c7a7771   Mel Gorman   mm, vmscan: Have ...
3371
3372
3373
3374
3375
3376
3377
3378
  		 * If the number of buffer_heads exceeds the maximum allowed
  		 * then consider reclaiming from all zones. This has a dual
  		 * purpose -- on 64-bit systems it is expected that
  		 * buffer_heads are stripped during active rotation. On 32-bit
  		 * systems, highmem pages can pin lowmem memory and shrinking
  		 * buffers can relieve lowmem pressure. Reclaim may still not
  		 * go ahead if all eligible zones for the original allocation
  		 * request are balanced to avoid excessive reclaim from kswapd.
86c79f6b5   Mel Gorman   mm: vmscan: do no...
3379
3380
3381
3382
  		 */
  		if (buffer_heads_over_limit) {
  			for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
  				zone = pgdat->node_zones + i;
6aa303def   Mel Gorman   mm, vmscan: only ...
3383
  				if (!managed_zone(zone))
86c79f6b5   Mel Gorman   mm: vmscan: do no...
3384
  					continue;
cc715d99e   Mel Gorman   mm: vmscan: forci...
3385

970a39a36   Mel Gorman   mm, vmscan: avoid...
3386
  				sc.reclaim_idx = i;
e1dbeda60   Andrew Morton   [PATCH] balance_p...
3387
  				break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3388
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3389
  		}
dafcb73e3   Zlatko Calusic   mm: avoid calling...
3390

86c79f6b5   Mel Gorman   mm: vmscan: do no...
3391
  		/*
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3392
3393
3394
  		 * Only reclaim if no eligible zone is already balanced. Note that
  		 * sc.reclaim_idx is not used as buffer_heads_over_limit may
  		 * have adjusted it.
86c79f6b5   Mel Gorman   mm: vmscan: do no...
3395
  		 */
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3396
3397
  		if (pgdat_balanced(pgdat, sc.order, classzone_idx))
  			goto out;
e1dbeda60   Andrew Morton   [PATCH] balance_p...
3398

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3399
  		/*
1d82de618   Mel Gorman   mm, vmscan: make ...
3400
3401
3402
3403
3404
  		 * Do some background aging of the anon list, to give
  		 * pages a chance to be referenced before reclaiming. All
  		 * pages are rotated regardless of classzone as this is
  		 * about consistent aging.
  		 */
ef8f23279   Mel Gorman   mm, memcg: move m...
3405
  		age_active_anon(pgdat, &sc);
1d82de618   Mel Gorman   mm, vmscan: make ...
3406
3407
  
  		/*
b7ea3c417   Mel Gorman   mm: vmscan: check...
3408
3409
3410
  		 * If we're having trouble reclaiming, start doing writepage
  		 * even in laptop mode.
  		 */
047d72c30   Johannes Weiner   mm: remove seemin...
3411
  		if (sc.priority < DEF_PRIORITY - 2)
b7ea3c417   Mel Gorman   mm: vmscan: check...
3412
  			sc.may_writepage = 1;
1d82de618   Mel Gorman   mm, vmscan: make ...
3413
3414
3415
  		/* Call soft limit reclaim before calling shrink_node. */
  		sc.nr_scanned = 0;
  		nr_soft_scanned = 0;
ef8f23279   Mel Gorman   mm, memcg: move m...
3416
  		nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
1d82de618   Mel Gorman   mm, vmscan: make ...
3417
3418
  						sc.gfp_mask, &nr_soft_scanned);
  		sc.nr_reclaimed += nr_soft_reclaimed;
b7ea3c417   Mel Gorman   mm: vmscan: check...
3419
  		/*
1d82de618   Mel Gorman   mm, vmscan: make ...
3420
3421
3422
  		 * There should be no need to raise the scanning priority if
  		 * enough pages are already being scanned that the high
  		 * watermark would be met at 100% efficiency.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3423
  		 */
970a39a36   Mel Gorman   mm, vmscan: avoid...
3424
  		if (kswapd_shrink_node(pgdat, &sc))
1d82de618   Mel Gorman   mm, vmscan: make ...
3425
  			raise_priority = false;
5515061d2   Mel Gorman   mm: throttle dire...
3426
3427
3428
3429
3430
3431
3432
  
  		/*
  		 * If the low watermark is met there is no need for processes
  		 * to be throttled on pfmemalloc_wait as they should now be
  		 * able to safely make forward progress. Wake them up.
  		 */
  		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3433
  				allow_direct_reclaim(pgdat))
cfc511557   Vlastimil Babka   mm, vmscan: wake ...
3434
  			wake_up_all(&pgdat->pfmemalloc_wait);
5515061d2   Mel Gorman   mm: throttle dire...
3435

b8e83b942   Mel Gorman   mm: vmscan: flatt...
3436
  		/* Check if kswapd should be suspending */
93781325d   Omar Sandoval   lockdep: fix fs_r...
3437
3438
3439
3440
  		__fs_reclaim_release();
  		ret = try_to_freeze();
  		__fs_reclaim_acquire();
  		if (ret || kthread_should_stop())
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3441
  			break;
8357376d3   Rafael J. Wysocki   [PATCH] swsusp: I...
3442

73ce02e96   KOSAKI Motohiro   mm: stop kswapd's...
3443
  		/*
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3444
3445
  		 * Raise priority if scanning rate is too low or there was no
  		 * progress in reclaiming pages
73ce02e96   KOSAKI Motohiro   mm: stop kswapd's...
3446
  		 */
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3447
3448
  		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
  		if (raise_priority || !nr_reclaimed)
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3449
  			sc.priority--;
1d82de618   Mel Gorman   mm, vmscan: make ...
3450
  	} while (sc.priority >= 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3451

c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3452
3453
  	if (!sc.nr_reclaimed)
  		pgdat->kswapd_failures++;
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3454
  out:
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
3455
  	snapshot_refaults(NULL, pgdat);
93781325d   Omar Sandoval   lockdep: fix fs_r...
3456
  	__fs_reclaim_release();
0abdee2bd   Mel Gorman   mm: kswapd: use t...
3457
  	/*
1d82de618   Mel Gorman   mm, vmscan: make ...
3458
3459
3460
3461
  	 * Return the order kswapd stopped reclaiming at as
  	 * prepare_kswapd_sleep() takes it into account. If another caller
  	 * entered the allocator slow path while kswapd was awake, order will
  	 * remain at the higher level.
0abdee2bd   Mel Gorman   mm: kswapd: use t...
3462
  	 */
1d82de618   Mel Gorman   mm, vmscan: make ...
3463
  	return sc.order;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3464
  }
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
  /*
   * pgdat->kswapd_classzone_idx is the highest zone index that a recent
   * allocation request woke kswapd for. When kswapd has not woken recently,
   * the value is MAX_NR_ZONES which is not a valid index. This compares a
   * given classzone and returns it or the highest classzone index kswapd
   * was recently woken for.
   */
  static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
  					   enum zone_type classzone_idx)
  {
  	if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
  		return classzone_idx;
  
  	return max(pgdat->kswapd_classzone_idx, classzone_idx);
  }
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3480
3481
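  /*
   * kswapd first takes a short (HZ/10) nap if the node looks balanced,
   * resetting compaction's pageblock skip hints and waking kcompactd before
   * dozing. If the nap was not interrupted and the node is still balanced,
   * the normal per-cpu vmstat thresholds are restored and kswapd sleeps until
   * it is explicitly woken; premature wakeups are counted in vmstat instead.
   */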
  static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
  				unsigned int classzone_idx)
f0bc0a60b   KOSAKI Motohiro   vmscan: factor ou...
3482
3483
3484
3485
3486
3487
3488
3489
  {
  	long remaining = 0;
  	DEFINE_WAIT(wait);
  
  	if (freezing(current) || kthread_should_stop())
  		return;
  
  	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
333b0a459   Shantanu Goel   mm, vmscan: fix z...
3490
3491
3492
3493
3494
3495
3496
  	/*
  	 * Try to sleep for a short interval. Note that kcompactd will only be
  	 * woken if it is possible to sleep for a short interval. This is
  	 * deliberate on the assumption that if reclaim cannot keep an
  	 * eligible zone balanced that it's also unlikely that compaction will
  	 * succeed.
  	 */
d9f21d426   Mel Gorman   mm, vmscan: avoid...
3497
  	if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
fd901c953   Vlastimil Babka   mm: wake kcompact...
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
  		/*
  		 * Compaction records what page blocks it recently failed to
  		 * isolate pages from and skips them in the future scanning.
  		 * When kswapd is going to sleep, it is reasonable to assume
  		 * that pages and compaction may succeed so reset the cache.
  		 */
  		reset_isolation_suitable(pgdat);
  
  		/*
  		 * We have freed the memory, now we should compact it to make
  		 * allocation of the requested order possible.
  		 */
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3510
  		wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
fd901c953   Vlastimil Babka   mm: wake kcompact...
3511

f0bc0a60b   KOSAKI Motohiro   vmscan: factor ou...
3512
  		remaining = schedule_timeout(HZ/10);
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3513
3514
3515
3516
3517
3518
3519
  
  		/*
  		 * If woken prematurely then reset kswapd_classzone_idx and
  		 * order. The values will either be from a wakeup request or
  		 * the previous request that slept prematurely.
  		 */
  		if (remaining) {
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3520
  			pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3521
3522
  			pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
  		}
f0bc0a60b   KOSAKI Motohiro   vmscan: factor ou...
3523
3524
3525
3526
3527
3528
3529
3530
  		finish_wait(&pgdat->kswapd_wait, &wait);
  		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
  	}
  
  	/*
  	 * After a short sleep, check if it was a premature sleep. If not, then
  	 * go fully to sleep until explicitly woken up.
  	 */
d9f21d426   Mel Gorman   mm, vmscan: avoid...
3531
3532
  	if (!remaining &&
  	    prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
f0bc0a60b   KOSAKI Motohiro   vmscan: factor ou...
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
  		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
  
  		/*
  		 * vmstat counters are not perfectly accurate and the estimated
  		 * value for counters such as NR_FREE_PAGES can deviate from the
  		 * true value by nr_online_cpus * threshold. To avoid the zone
  		 * watermarks being breached while under pressure, we reduce the
  		 * per-cpu vmstat threshold while kswapd is awake and restore
  		 * them before going back to sleep.
  		 */
  		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
1c7e7f6c0   Aaditya Kumar   mm: fix lost kswa...
3544
3545
3546
  
  		if (!kthread_should_stop())
  			schedule();
f0bc0a60b   KOSAKI Motohiro   vmscan: factor ou...
3547
3548
3549
3550
3551
3552
3553
3554
3555
  		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
  	} else {
  		if (remaining)
  			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
  		else
  			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
  	}
  	finish_wait(&pgdat->kswapd_wait, &wait);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3556
3557
  /*
   * The background pageout daemon, started as a kernel thread
4f98a2fee   Rik van Riel   vmscan: split LRU...
3558
   * from the init process.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
   *
   * This basically trickles out pages so that we have _some_
   * free memory available even if there is no other activity
   * that frees anything up. This is needed for things like routing
   * etc, where we otherwise might have all activity going on in
   * asynchronous contexts that cannot page things out.
   *
   * If there are applications that are active memory-allocators
   * (most normal use), this basically shouldn't matter.
   */
  static int kswapd(void *p)
  {
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3571
3572
  	unsigned int alloc_order, reclaim_order;
  	unsigned int classzone_idx = MAX_NR_ZONES - 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3573
3574
  	pg_data_t *pgdat = (pg_data_t*)p;
  	struct task_struct *tsk = current;
f0bc0a60b   KOSAKI Motohiro   vmscan: factor ou...
3575

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3576
3577
3578
  	struct reclaim_state reclaim_state = {
  		.reclaimed_slab = 0,
  	};
a70f73028   Rusty Russell   cpumask: replace ...
3579
  	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3580

174596a0b   Rusty Russell   cpumask: convert mm/
3581
  	if (!cpumask_empty(cpumask))
c5f59f083   Mike Travis   nodemask: use new...
3582
  		set_cpus_allowed_ptr(tsk, cpumask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
  	current->reclaim_state = &reclaim_state;
  
  	/*
  	 * Tell the memory management that we're a "memory allocator",
  	 * and that if we need more memory we should get access to it
  	 * regardless (see "__alloc_pages()"). "kswapd" should
  	 * never get caught in the normal page freeing logic.
  	 *
  	 * (Kswapd normally doesn't need memory anyway, but sometimes
  	 * you need a small amount of memory in order to be able to
  	 * page out something else, and this flag essentially protects
  	 * us from recursively trying to free more memory as we're
  	 * trying to free the first piece of memory in the first place).
  	 */
930d91525   Christoph Lameter   [PATCH] Swap Migr...
3597
  	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
831441862   Rafael J. Wysocki   Freezer: make ker...
3598
  	set_freezable();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3599

e716f2eb2   Mel Gorman   mm, vmscan: preve...
3600
3601
  	pgdat->kswapd_order = 0;
  	pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3602
  	for ( ; ; ) {
6f6313d48   Jeff Liu   mm/vmscan.c: try_...
3603
  		bool ret;
3e1d1d28d   Christoph Lameter   [PATCH] Cleanup p...
3604

e716f2eb2   Mel Gorman   mm, vmscan: preve...
3605
3606
  		alloc_order = reclaim_order = pgdat->kswapd_order;
  		classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3607
3608
3609
  kswapd_try_sleep:
  		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
  					classzone_idx);
215ddd666   Mel Gorman   mm: vmscan: only ...
3610

38087d9b0   Mel Gorman   mm, vmscan: simpl...
3611
3612
  		/* Read the new order and classzone_idx */
  		alloc_order = reclaim_order = pgdat->kswapd_order;
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3613
  		classzone_idx = kswapd_classzone_idx(pgdat, 0);
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3614
  		pgdat->kswapd_order = 0;
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3615
  		pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3616

8fe23e057   David Rientjes   mm: clear node in...
3617
3618
3619
3620
3621
3622
3623
3624
  		ret = try_to_freeze();
  		if (kthread_should_stop())
  			break;
  
  		/*
  		 * We can speed up thawing tasks if we don't call balance_pgdat
  		 * after returning from the refrigerator
  		 */
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
  		if (ret)
  			continue;
  
  		/*
  		 * Reclaim begins at the requested order but if a high-order
  		 * reclaim fails then kswapd falls back to reclaiming for
  		 * order-0. If that happens, kswapd will consider sleeping
  		 * for the order it finished reclaiming at (reclaim_order)
  		 * but kcompactd is woken to compact for the original
  		 * request (alloc_order).
  		 */
e5146b12e   Mel Gorman   mm, vmscan: add c...
3636
3637
  		trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
  						alloc_order);
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3638
3639
3640
  		reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
  		if (reclaim_order < alloc_order)
  			goto kswapd_try_sleep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3641
  	}
b0a8cc58e   Takamori Yamaguchi   mm: bugfix: set c...
3642

71abdc15a   Johannes Weiner   mm: vmscan: clear...
3643
  	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
b0a8cc58e   Takamori Yamaguchi   mm: bugfix: set c...
3644
  	current->reclaim_state = NULL;
71abdc15a   Johannes Weiner   mm: vmscan: clear...
3645

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3646
3647
3648
3649
  	return 0;
  }
  
  /*
5ecd9d403   David Rientjes   mm, page_alloc: w...
3650
3651
3652
3653
3654
   * A zone is low on free memory or too fragmented for high-order memory.  If
   * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
   * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
   * has failed or is not needed, still wake up kcompactd if only compaction is
   * needed.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3655
   */
5ecd9d403   David Rientjes   mm, page_alloc: w...
3656
3657
  void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
  		   enum zone_type classzone_idx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3658
3659
  {
  	pg_data_t *pgdat;
6aa303def   Mel Gorman   mm, vmscan: only ...
3660
  	if (!managed_zone(zone))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3661
  		return;
5ecd9d403   David Rientjes   mm, page_alloc: w...
3662
  	if (!cpuset_zone_allowed(zone, gfp_flags))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3663
  		return;
88f5acf88   Mel Gorman   mm: page allocato...
3664
  	pgdat = zone->zone_pgdat;
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3665
3666
  	pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
  							   classzone_idx);
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3667
  	pgdat->kswapd_order = max(pgdat->kswapd_order, order);
8d0986e28   Con Kolivas   [PATCH] vm: kswap...
3668
  	if (!waitqueue_active(&pgdat->kswapd_wait))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3669
  		return;
e1a556374   Mel Gorman   mm, vmscan: only ...
3670

5ecd9d403   David Rientjes   mm, page_alloc: w...
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
  	/* Hopeless node, leave it to direct reclaim if possible */
  	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
  	    pgdat_balanced(pgdat, order, classzone_idx)) {
  		/*
  		 * There may be plenty of free memory available, but it's too
  		 * fragmented for high-order allocations.  Wake up kcompactd
  		 * and rely on compaction_suitable() to determine if it's
  		 * needed.  If it fails, it will defer subsequent attempts to
  		 * ratelimit its work.
  		 */
  		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
  			wakeup_kcompactd(pgdat, order, classzone_idx);
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3683
  		return;
5ecd9d403   David Rientjes   mm, page_alloc: w...
3684
  	}
88f5acf88   Mel Gorman   mm: page allocato...
3685

5ecd9d403   David Rientjes   mm, page_alloc: w...
3686
3687
  	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
  				      gfp_flags);
8d0986e28   Con Kolivas   [PATCH] vm: kswap...
3688
  	wake_up_interruptible(&pgdat->kswapd_wait);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3689
  }
c6f37f121   Rafael J. Wysocki   PM/Suspend: Do no...
3690
  #ifdef CONFIG_HIBERNATION
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3691
  /*
7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3692
   * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
3693
3694
3695
3696
3697
   * freed pages.
   *
   * Rather than trying to age LRUs the aim is to preserve the overall
   * LRU order by reclaiming preferentially
   * inactive > active > active referenced > active mapped
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3698
   */
7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3699
  unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3700
  {
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
3701
  	struct reclaim_state reclaim_state;
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
3702
  	struct scan_control sc = {
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3703
  		.nr_to_reclaim = nr_to_reclaim,
7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3704
  		.gfp_mask = GFP_HIGHUSER_MOVABLE,
b2e18757f   Mel Gorman   mm, vmscan: begin...
3705
  		.reclaim_idx = MAX_NR_ZONES - 1,
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3706
  		.priority = DEF_PRIORITY,
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
3707
  		.may_writepage = 1,
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3708
3709
  		.may_unmap = 1,
  		.may_swap = 1,
7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3710
  		.hibernation_mode = 1,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3711
  	};
a09ed5e00   Ying Han   vmscan: change sh...
3712
  	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3713
3714
  	struct task_struct *p = current;
  	unsigned long nr_reclaimed;
499118e96   Vlastimil Babka   mm: introduce mem...
3715
  	unsigned int noreclaim_flag;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3716

d92a8cfcb   Peter Zijlstra   locking/lockdep: ...
3717
  	fs_reclaim_acquire(sc.gfp_mask);
93781325d   Omar Sandoval   lockdep: fix fs_r...
3718
  	noreclaim_flag = memalloc_noreclaim_save();
7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3719
3720
  	reclaim_state.reclaimed_slab = 0;
  	p->reclaim_state = &reclaim_state;
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
3721

3115cd914   Vladimir Davydov   mm: vmscan: remov...
3722
  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
d979677c4   MinChan Kim   mm: shrink_all_me...
3723

7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3724
  	p->reclaim_state = NULL;
499118e96   Vlastimil Babka   mm: introduce mem...
3725
  	memalloc_noreclaim_restore(noreclaim_flag);
93781325d   Omar Sandoval   lockdep: fix fs_r...
3726
  	fs_reclaim_release(sc.gfp_mask);
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
3727

7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3728
  	return nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3729
  }
c6f37f121   Rafael J. Wysocki   PM/Suspend: Do no...
3730
  #endif /* CONFIG_HIBERNATION */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3731

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3732
3733
3734
3735
  /*
   * It's optimal to keep kswapds on the same CPUs as their memory, but
   * not required for correctness.  So if the last cpu in a node goes
   * away, we get changed to run anywhere: as the first one comes back,
   * restore their cpu bindings.
   */
517bbed90   Sebastian Andrzej Siewior   mm/vmscan: Conver...
3736
  static int kswapd_cpu_online(unsigned int cpu)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3737
  {
58c0a4a78   Yasunori Goto   Fix panic of cpu ...
3738
  	int nid;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3739

517bbed90   Sebastian Andrzej Siewior   mm/vmscan: Conver...
3740
3741
3742
  	for_each_node_state(nid, N_MEMORY) {
  		pg_data_t *pgdat = NODE_DATA(nid);
  		const struct cpumask *mask;
a70f73028   Rusty Russell   cpumask: replace ...
3743

517bbed90   Sebastian Andrzej Siewior   mm/vmscan: Conver...
3744
  		mask = cpumask_of_node(pgdat->node_id);
c5f59f083   Mike Travis   nodemask: use new...
3745

517bbed90   Sebastian Andrzej Siewior   mm/vmscan: Conver...
3746
3747
3748
  		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
  			/* One of our CPUs online: restore mask */
  			set_cpus_allowed_ptr(pgdat->kswapd, mask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3749
  	}
517bbed90   Sebastian Andrzej Siewior   mm/vmscan: Conver...
3750
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3751
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3752

3218ae14b   Yasunori Goto   [PATCH] pgdat all...
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
  /*
   * This kswapd start function will be called by init and node-hot-add.
   * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
   */
  int kswapd_run(int nid)
  {
  	pg_data_t *pgdat = NODE_DATA(nid);
  	int ret = 0;
  
  	if (pgdat->kswapd)
  		return 0;
  
  	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
  	if (IS_ERR(pgdat->kswapd)) {
  		/* failure at boot is fatal */
c6202adf3   Thomas Gleixner   mm/vmscan: Adjust...
3768
  		BUG_ON(system_state < SYSTEM_RUNNING);
d5dc0ad92   Gavin Shan   mm/vmscan: fix er...
3769
3770
3771
  		pr_err("Failed to start kswapd on node %d
  ", nid);
  		ret = PTR_ERR(pgdat->kswapd);
d72515b85   Xishi Qiu   mm/vmscan: fix er...
3772
  		pgdat->kswapd = NULL;
3218ae14b   Yasunori Goto   [PATCH] pgdat all...
3773
3774
3775
  	}
  	return ret;
  }
8fe23e057   David Rientjes   mm: clear node in...
3776
  /*
d8adde17e   Jiang Liu   memory hotplug: f...
3777
   * Called by memory hotplug when all memory in a node is offlined.  Caller must
bfc8c9013   Vladimir Davydov   mem-hotplug: impl...
3778
   * hold mem_hotplug_begin/end().
8fe23e057   David Rientjes   mm: clear node in...
3779
3780
3781
3782
   */
  void kswapd_stop(int nid)
  {
  	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
d8adde17e   Jiang Liu   memory hotplug: f...
3783
  	if (kswapd) {
8fe23e057   David Rientjes   mm: clear node in...
3784
  		kthread_stop(kswapd);
d8adde17e   Jiang Liu   memory hotplug: f...
3785
3786
  		NODE_DATA(nid)->kswapd = NULL;
  	}
8fe23e057   David Rientjes   mm: clear node in...
3787
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3788
3789
  static int __init kswapd_init(void)
  {
517bbed90   Sebastian Andrzej Siewior   mm/vmscan: Conver...
3790
  	int nid, ret;
69e05944a   Andrew Morton   [PATCH] vmscan: u...
3791

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3792
  	swap_setup();
48fb2e240   Lai Jiangshan   vmscan: use N_MEM...
3793
  	for_each_node_state(nid, N_MEMORY)
3218ae14b   Yasunori Goto   [PATCH] pgdat all...
3794
   		kswapd_run(nid);
517bbed90   Sebastian Andrzej Siewior   mm/vmscan: Conver...
3795
3796
3797
3798
  	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
  					"mm/vmscan:online", kswapd_cpu_online,
  					NULL);
  	WARN_ON(ret < 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3799
3800
3801
3802
  	return 0;
  }
  
  module_init(kswapd_init)
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3803
3804
3805
  
  #ifdef CONFIG_NUMA
  /*
a5f5f91da   Mel Gorman   mm: convert zone_...
3806
   * Node reclaim mode
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3807
   *
a5f5f91da   Mel Gorman   mm: convert zone_...
3808
   * If non-zero call node_reclaim when the number of free pages falls below
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3809
   * the watermarks.
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3810
   */
a5f5f91da   Mel Gorman   mm: convert zone_...
3811
  int node_reclaim_mode __read_mostly;
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3812

1b2ffb789   Christoph Lameter   [PATCH] Zone recl...
3813
  #define RECLAIM_OFF 0
7d03431cf   Fernando Luis Vazquez Cao   swapfile/vmscan: ...
3814
  #define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
1b2ffb789   Christoph Lameter   [PATCH] Zone recl...
3815
  #define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
95bbc0c72   Zhihui Zhang   mm: rename RECLAI...
3816
  #define RECLAIM_UNMAP (1<<2)	/* Unmap pages during reclaim */
1b2ffb789   Christoph Lameter   [PATCH] Zone recl...
3817

9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3818
  /*
a5f5f91da   Mel Gorman   mm: convert zone_...
3819
   * Priority for NODE_RECLAIM. This determines the fraction of pages
a92f71263   Christoph Lameter   [PATCH] zone_recl...
3820
3821
3822
   * of a node considered for each zone_reclaim. 4 scans 1/16th of
   * a zone.
   */
a5f5f91da   Mel Gorman   mm: convert zone_...
3823
  #define NODE_RECLAIM_PRIORITY 4
a92f71263   Christoph Lameter   [PATCH] zone_recl...
3824

9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3825
  /*
a5f5f91da   Mel Gorman   mm: convert zone_...
3826
   * Percentage of pages in a zone that must be unmapped for node_reclaim to
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
3827
3828
3829
3830
3831
   * occur.
   */
  int sysctl_min_unmapped_ratio = 1;
  
  /*
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
3832
3833
3834
3835
   * If the number of slab pages in a zone grows beyond this percentage then
   * slab reclaim needs to occur.
   */
  int sysctl_min_slab_ratio = 5;
11fb99898   Mel Gorman   mm: move most fil...
3836
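  /* Estimate the unmapped file-backed pages on a node from its file LRUs */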
  static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
90afa5de6   Mel Gorman   vmscan: properly ...
3837
  {
11fb99898   Mel Gorman   mm: move most fil...
3838
3839
3840
  	unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
  	unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
  		node_page_state(pgdat, NR_ACTIVE_FILE);
90afa5de6   Mel Gorman   vmscan: properly ...
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
  
  	/*
  	 * It's possible for there to be more file mapped pages than
  	 * accounted for by the pages on the file LRU lists because
  	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
  	 */
  	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
  }
  
  /* Work out how many page cache pages we can reclaim in this reclaim_mode */
a5f5f91da   Mel Gorman   mm: convert zone_...
3851
  static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
90afa5de6   Mel Gorman   vmscan: properly ...
3852
  {
d031a1579   Alexandru Moise   mm/vmscan.c: fix ...
3853
3854
  	unsigned long nr_pagecache_reclaimable;
  	unsigned long delta = 0;
90afa5de6   Mel Gorman   vmscan: properly ...
3855
3856
  
  	/*
95bbc0c72   Zhihui Zhang   mm: rename RECLAI...
3857
  	 * If RECLAIM_UNMAP is set, then all file pages are considered
90afa5de6   Mel Gorman   vmscan: properly ...
3858
  	 * potentially reclaimable. Otherwise, we have to worry about
11fb99898   Mel Gorman   mm: move most fil...
3859
  	 * pages like swapcache and node_unmapped_file_pages() provides
90afa5de6   Mel Gorman   vmscan: properly ...
3860
3861
  	 * a better estimate
  	 */
a5f5f91da   Mel Gorman   mm: convert zone_...
3862
3863
  	if (node_reclaim_mode & RECLAIM_UNMAP)
  		nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
90afa5de6   Mel Gorman   vmscan: properly ...
3864
  	else
a5f5f91da   Mel Gorman   mm: convert zone_...
3865
  		nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
90afa5de6   Mel Gorman   vmscan: properly ...
3866
3867
  
  	/* If we can't clean pages, remove dirty pages from consideration */
a5f5f91da   Mel Gorman   mm: convert zone_...
3868
3869
  	if (!(node_reclaim_mode & RECLAIM_WRITE))
  		delta += node_page_state(pgdat, NR_FILE_DIRTY);
90afa5de6   Mel Gorman   vmscan: properly ...
3870
3871
3872
3873
3874
3875
3876
  
  	/* Watch for any possible underflows due to delta */
  	if (unlikely(delta > nr_pagecache_reclaimable))
  		delta = nr_pagecache_reclaimable;
  
  	return nr_pagecache_reclaimable - delta;
  }
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
3877
  /*
a5f5f91da   Mel Gorman   mm: convert zone_...
3878
   * Try to free up some pages from this node through reclaim.
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3879
   */
a5f5f91da   Mel Gorman   mm: convert zone_...
3880
  static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3881
  {
7fb2d46d3   Christoph Lameter   [PATCH] zone_recl...
3882
  	/* Minimum pages needed in order to stay on node */
69e05944a   Andrew Morton   [PATCH] vmscan: u...
3883
  	const unsigned long nr_pages = 1 << order;
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3884
3885
  	struct task_struct *p = current;
  	struct reclaim_state reclaim_state;
499118e96   Vlastimil Babka   mm: introduce mem...
3886
  	unsigned int noreclaim_flag;
179e96395   Andrew Morton   [PATCH] vmscan: s...
3887
  	struct scan_control sc = {
62b726c1b   Andrew Morton   mm/vmscan.c:__zon...
3888
  		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
f2f43e566   Nick Desaulniers   mm/vmscan.c: fix ...
3889
  		.gfp_mask = current_gfp_context(gfp_mask),
bd2f6199c   Johannes Weiner   vmscan: respect h...
3890
  		.order = order,
a5f5f91da   Mel Gorman   mm: convert zone_...
3891
3892
3893
  		.priority = NODE_RECLAIM_PRIORITY,
  		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
  		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3894
  		.may_swap = 1,
f2f43e566   Nick Desaulniers   mm/vmscan.c: fix ...
3895
  		.reclaim_idx = gfp_zone(gfp_mask),
179e96395   Andrew Morton   [PATCH] vmscan: s...
3896
  	};
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3897

9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3898
  	cond_resched();
	fs_reclaim_acquire(sc.gfp_mask);
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_UNMAP
	 * and we also need to be able to write out pages for RECLAIM_WRITE
	 * and RECLAIM_UNMAP.
	 */
	noreclaim_flag = memalloc_noreclaim_save();
	p->flags |= PF_SWAPWRITE;
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

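	/*
	 * Only attempt page cache reclaim when the node holds more unmapped
	 * file pages than its configured minimum; otherwise leave them alone.
	 */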
	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
		/*
		 * Free memory by calling shrink_node() with increasing
		 * priorities until we have enough memory freed.
		 */
		do {
			shrink_node(pgdat, &sc);
		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
	}

	p->reclaim_state = NULL;
	current->flags &= ~PF_SWAPWRITE;
	memalloc_noreclaim_restore(noreclaim_flag);
	fs_reclaim_release(sc.gfp_mask);
	return sc.nr_reclaimed >= nr_pages;
}

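/*
 * node_reclaim - try to reclaim pages from this node so that an allocation
 * can stay local instead of falling back to a remote node.  Returns
 * NODE_RECLAIM_FULL when there is too little reclaimable memory to bother,
 * NODE_RECLAIM_NOSCAN when reclaim is not permitted in this context (or
 * another task is already reclaiming from the node), and otherwise the
 * result of __node_reclaim().
 */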
int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
	int ret;

	/*
	 * Node reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O, otherwise pages read by file I/O will be immediately
	 * thrown out if the node is overallocated. So we do not reclaim
	 * if less than a specified percentage of the node is used by
	 * unmapped file backed pages.
	 */
	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
	    node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
		return NODE_RECLAIM_FULL;

	/*
	 * Do not scan if the allocation should not be delayed, or if the
	 * caller is itself already in reclaim (PF_MEMALLOC).
	 */
	if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
		return NODE_RECLAIM_NOSCAN;

	/*
	 * Only run node reclaim on the local node or on nodes that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
		return NODE_RECLAIM_NOSCAN;

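	/* Only one task at a time may run reclaim against this node. */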
	if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
		return NODE_RECLAIM_NOSCAN;

	ret = __node_reclaim(pgdat, gfp_mask, order);
	clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);

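	/* Account failed attempts; the event counter is exported via vmstat. */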
	if (!ret)
		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
	return ret;
}
#endif

/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 *
 */
int page_evictable(struct page *page)
{
	int ret;

	/* Prevent address_space of inode and swap cache from being freed */
	rcu_read_lock();
	ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
	rcu_read_unlock();
	return ret;
}

  #ifdef CONFIG_SHMEM
/**
 * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
 * @pages:	array of pages to check
 * @nr_pages:	number of pages to check
 *
 * Checks pages for evictability and moves them to the appropriate lru list.
 *
 * This function is only used for SysV IPC SHM_UNLOCK.
 */
void check_move_unevictable_pages(struct page **pages, int nr_pages)
{
	struct lruvec *lruvec;
	struct pglist_data *pgdat = NULL;
	int pgscanned = 0;
	int pgrescued = 0;
	int i;

	for (i = 0; i < nr_pages; i++) {
		struct page *page = pages[i];
		struct pglist_data *pagepgdat = page_pgdat(page);

		pgscanned++;
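		/*
		 * lru_lock is per-node; when we cross to a page that lives on
		 * a different node, drop the old node's lock and take the new
		 * one.
		 */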
		if (pagepgdat != pgdat) {
			if (pgdat)
				spin_unlock_irq(&pgdat->lru_lock);
			pgdat = pagepgdat;
			spin_lock_irq(&pgdat->lru_lock);
		}
		lruvec = mem_cgroup_page_lruvec(page, pgdat);

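		/* Only pages still on the unevictable LRU are of interest. */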
		if (!PageLRU(page) || !PageUnevictable(page))
			continue;

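		/*
		 * The page has become evictable again (e.g. SHM_UNLOCK cleared
		 * the mapping's unevictable flag); move it back to the LRU
		 * list it belongs on.
		 */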
		if (page_evictable(page)) {
			enum lru_list lru = page_lru_base_type(page);
			VM_BUG_ON_PAGE(PageActive(page), page);
			ClearPageUnevictable(page);
			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
			add_page_to_lru_list(page, lruvec, lru);
			pgrescued++;
		}
	}

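	/* Flush the event counters and drop the last node's lru_lock, if taken. */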
	if (pgdat) {
		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
		spin_unlock_irq(&pgdat->lru_lock);
	}
}
#endif /* CONFIG_SHMEM */