Blame view

mm/vmscan.c 123 KB
b24413180   Greg Kroah-Hartman   License cleanup: ...
1
  // SPDX-License-Identifier: GPL-2.0
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2
3
4
5
6
7
8
9
10
11
12
13
  /*
   *  linux/mm/vmscan.c
   *
   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   *
   *  Swap reorganised 29.12.95, Stephen Tweedie.
   *  kswapd added: 7.1.96  sct
   *  Removed kswapd_ctl limits, and swap out as many pages as needed
   *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
   *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
   *  Multiqueue VM started 5.8.00, Rik van Riel.
   */
b1de0d139   Mitchel Humpherys   mm: convert some ...
14
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
15
  #include <linux/mm.h>
5b3cc15af   Ingo Molnar   sched/headers: Pr...
16
  #include <linux/sched/mm.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
17
  #include <linux/module.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
18
  #include <linux/gfp.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
19
20
21
22
23
  #include <linux/kernel_stat.h>
  #include <linux/swap.h>
  #include <linux/pagemap.h>
  #include <linux/init.h>
  #include <linux/highmem.h>
70ddf637e   Anton Vorontsov   memcg: add memory...
24
  #include <linux/vmpressure.h>
e129b5c23   Andrew Morton   [PATCH] vm: add p...
25
  #include <linux/vmstat.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
26
27
28
29
30
31
  #include <linux/file.h>
  #include <linux/writeback.h>
  #include <linux/blkdev.h>
  #include <linux/buffer_head.h>	/* for try_to_release_page(),
  					buffer_heads_over_limit */
  #include <linux/mm_inline.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
32
33
34
35
36
  #include <linux/backing-dev.h>
  #include <linux/rmap.h>
  #include <linux/topology.h>
  #include <linux/cpu.h>
  #include <linux/cpuset.h>
3e7d34497   Mel Gorman   mm: vmscan: recla...
37
  #include <linux/compaction.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
38
39
  #include <linux/notifier.h>
  #include <linux/rwsem.h>
248a0301e   Rafael J. Wysocki   [PATCH] mm: make ...
40
  #include <linux/delay.h>
3218ae14b   Yasunori Goto   [PATCH] pgdat all...
41
  #include <linux/kthread.h>
7dfb71030   Nigel Cunningham   [PATCH] Add inclu...
42
  #include <linux/freezer.h>
66e1707bc   Balbir Singh   Memory controller...
43
  #include <linux/memcontrol.h>
873b47717   Keika Kobayashi   per-task-delay-ac...
44
  #include <linux/delayacct.h>
af936a160   Lee Schermerhorn   vmscan: unevictab...
45
  #include <linux/sysctl.h>
929bea7c7   KOSAKI Motohiro   vmscan: all_unrec...
46
  #include <linux/oom.h>
64e3d12f7   Kuo-Hsin Yang   mm, drm/i915: mar...
47
  #include <linux/pagevec.h>
268bb0ce3   Linus Torvalds   sanitize <linux/p...
48
  #include <linux/prefetch.h>
b1de0d139   Mitchel Humpherys   mm: convert some ...
49
  #include <linux/printk.h>
f9fe48bec   Ross Zwisler   dax: support dirt...
50
  #include <linux/dax.h>
eb414681d   Johannes Weiner   psi: pressure sta...
51
  #include <linux/psi.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
52
53
54
55
56
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
  
  #include <linux/swapops.h>
117aad1e9   Rafael Aquini   mm: avoid reinser...
57
  #include <linux/balloon_compaction.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
58

0f8053a50   Nick Piggin   [PATCH] mm: make ...
59
  #include "internal.h"
33906bc5c   Mel Gorman   vmscan: tracing: ...
60
61
  #define CREATE_TRACE_POINTS
  #include <trace/events/vmscan.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
62
  struct scan_control {
22fba3354   KOSAKI Motohiro   vmscan: separate ...
63
64
  	/* How many pages shrink_list() should reclaim */
  	unsigned long nr_to_reclaim;
ee814fe23   Johannes Weiner   mm: vmscan: clean...
65
66
67
68
69
  	/*
  	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
  	 * are scanned.
  	 */
  	nodemask_t	*nodemask;
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
70

5f53e7629   KOSAKI Motohiro   vmscan: page_chec...
71
  	/*
f16015fbf   Johannes Weiner   mm: vmscan: disti...
72
73
74
75
  	 * The memory cgroup that hit its limit and as a result is the
  	 * primary target of this reclaim invocation.
  	 */
  	struct mem_cgroup *target_mem_cgroup;
66e1707bc   Balbir Singh   Memory controller...
76

7cf111bc3   Johannes Weiner   mm: vmscan: deter...
77
78
79
80
81
  	/*
  	 * Scan pressure balancing between anon and file LRUs
  	 */
  	unsigned long	anon_cost;
  	unsigned long	file_cost;
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
82
83
84
85
86
87
  	/* Can active pages be deactivated as part of reclaim? */
  #define DEACTIVATE_ANON 1
  #define DEACTIVATE_FILE 2
  	unsigned int may_deactivate:2;
  	unsigned int force_deactivate:1;
  	unsigned int skipped_deactivate:1;
1276ad68e   Johannes Weiner   mm: vmscan: scan ...
88
  	/* Writepage batching in laptop mode; RECLAIM_WRITE */
ee814fe23   Johannes Weiner   mm: vmscan: clean...
89
90
91
92
93
94
95
  	unsigned int may_writepage:1;
  
  	/* Can mapped pages be reclaimed? */
  	unsigned int may_unmap:1;
  
  	/* Can pages be swapped as part of reclaim? */
  	unsigned int may_swap:1;
d6622f636   Yisheng Xie   mm/vmscan: more r...
96
97
98
99
100
101
102
  	/*
  	 * Cgroups are not reclaimed below their configured memory.low,
  	 * unless we threaten to OOM. If any cgroups are skipped due to
  	 * memory.low and nothing was reclaimed, go back for memory.low.
  	 */
  	unsigned int memcg_low_reclaim:1;
  	unsigned int memcg_low_skipped:1;
241994ed8   Johannes Weiner   mm: memcontrol: d...
103

ee814fe23   Johannes Weiner   mm: vmscan: clean...
104
105
106
107
  	unsigned int hibernation_mode:1;
  
  	/* One of the zones is ready for compaction */
  	unsigned int compaction_ready:1;
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
108
109
  	/* There is easily reclaimable cold cache in the current node */
  	unsigned int cache_trim_mode:1;
53138cea7   Johannes Weiner   mm: vmscan: move ...
110
111
  	/* The file pages on the current node are dangerously low */
  	unsigned int file_is_tiny:1;
bb451fdf3   Greg Thelen   mm/vmscan.c: cond...
112
113
114
115
116
117
118
119
120
121
122
  	/* Allocation order */
  	s8 order;
  
  	/* Scan (total_size >> priority) pages at once */
  	s8 priority;
  
  	/* The highest zone to isolate pages for reclaim from */
  	s8 reclaim_idx;
  
  	/* This context's GFP mask */
  	gfp_t gfp_mask;
ee814fe23   Johannes Weiner   mm: vmscan: clean...
123
124
125
126
127
  	/* Incremented by the number of inactive pages that were scanned */
  	unsigned long nr_scanned;
  
  	/* Number of pages freed so far during a call to shrink_zones() */
  	unsigned long nr_reclaimed;
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
128
129
130
131
132
133
134
135
136
137
  
  	struct {
  		unsigned int dirty;
  		unsigned int unqueued_dirty;
  		unsigned int congested;
  		unsigned int writeback;
  		unsigned int immediate;
  		unsigned int file_taken;
  		unsigned int taken;
  	} nr;
e5ca8071f   Yafang Shao   mm/vmscan.c: add ...
138
139
140
  
  	/* for recording the reclaimed slab by now */
  	struct reclaim_state reclaim_state;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
141
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
  #ifdef ARCH_HAS_PREFETCHW
  #define prefetchw_prev_lru_page(_page, _base, _field)			\
  	do {								\
  		if ((_page)->lru.prev != _base) {			\
  			struct page *prev;				\
  									\
  			prev = lru_to_page(&(_page->lru));		\
  			prefetchw(&prev->_field);			\
  		}							\
  	} while (0)
  #else
  #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
  #endif
  
  /*
c843966c5   Johannes Weiner   mm: allow swappin...
157
   * From 0 .. 200.  Higher means more swappy.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
158
159
   */
  int vm_swappiness = 60;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
160

0a432dcbe   Yang Shi   mm: shrinker: mak...
161
162
163
164
165
166
167
168
169
170
171
  static void set_task_reclaim_state(struct task_struct *task,
  				   struct reclaim_state *rs)
  {
  	/* Check for an overwrite */
  	WARN_ON_ONCE(rs && task->reclaim_state);
  
  	/* Check for the nulling of an already-nulled member */
  	WARN_ON_ONCE(!rs && !task->reclaim_state);
  
  	task->reclaim_state = rs;
  }
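
A quick sketch of how this helper is typically used (illustrative pattern, not a call site copied from this file): reclaim entry points install the scan_control's embedded reclaim_state on the current task before shrinking and clear it again afterwards, so slab shrinkers can credit the pages they free back to the reclaimer.

	/* Illustrative pattern only: */
	struct scan_control sc = { .gfp_mask = GFP_KERNEL };

	set_task_reclaim_state(current, &sc.reclaim_state);
	/* ... do the actual reclaim; shrinkers account freed pages in
	 *     current->reclaim_state->reclaimed_slab ... */
	set_task_reclaim_state(current, NULL);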
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
172
173
  static LIST_HEAD(shrinker_list);
  static DECLARE_RWSEM(shrinker_rwsem);
0a432dcbe   Yang Shi   mm: shrinker: mak...
174
  #ifdef CONFIG_MEMCG
7e010df53   Kirill Tkhai   mm: use special v...
175
176
177
178
179
180
181
182
183
184
185
186
  /*
   * We allow subsystems to populate their shrinker-related
   * LRU lists before register_shrinker_prepared() is called
   * for the shrinker, since we don't want to impose
   * restrictions on their internal registration order.
 * In this case shrink_slab_memcg() may find the corresponding
 * bit already set in the shrinker map.
   *
   * This value is used by the function to detect registering
   * shrinkers and to skip do_shrink_slab() calls for them.
   */
  #define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
b4c2b231c   Kirill Tkhai   mm: assign id to ...
187
188
189
190
191
192
193
194
195
  static DEFINE_IDR(shrinker_idr);
  static int shrinker_nr_max;
  
  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
  {
  	int id, ret = -ENOMEM;
  
  	down_write(&shrinker_rwsem);
  	/* This may call shrinker, so it must use down_read_trylock() */
7e010df53   Kirill Tkhai   mm: use special v...
196
  	id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
b4c2b231c   Kirill Tkhai   mm: assign id to ...
197
198
  	if (id < 0)
  		goto unlock;
0a4465d34   Kirill Tkhai   mm, memcg: assign...
199
200
201
202
203
  	if (id >= shrinker_nr_max) {
  		if (memcg_expand_shrinker_maps(id)) {
  			idr_remove(&shrinker_idr, id);
  			goto unlock;
  		}
b4c2b231c   Kirill Tkhai   mm: assign id to ...
204
  		shrinker_nr_max = id + 1;
0a4465d34   Kirill Tkhai   mm, memcg: assign...
205
  	}
b4c2b231c   Kirill Tkhai   mm: assign id to ...
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
  	shrinker->id = id;
  	ret = 0;
  unlock:
  	up_write(&shrinker_rwsem);
  	return ret;
  }
  
  static void unregister_memcg_shrinker(struct shrinker *shrinker)
  {
  	int id = shrinker->id;
  
  	BUG_ON(id < 0);
  
  	down_write(&shrinker_rwsem);
  	idr_remove(&shrinker_idr, id);
  	up_write(&shrinker_rwsem);
  }
b4c2b231c   Kirill Tkhai   mm: assign id to ...
223

b5ead35e7   Johannes Weiner   mm: vmscan: namin...
224
  static bool cgroup_reclaim(struct scan_control *sc)
89b5fae53   Johannes Weiner   mm: vmscan: disti...
225
  {
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
226
  	return sc->target_mem_cgroup;
89b5fae53   Johannes Weiner   mm: vmscan: disti...
227
  }
97c9341f7   Tejun Heo   mm: vmscan: disab...
228
229
  
  /**
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
230
   * writeback_throttling_sane - is the usual dirty throttling mechanism available?
97c9341f7   Tejun Heo   mm: vmscan: disab...
231
232
233
234
235
236
237
238
239
240
241
   * @sc: scan_control in question
   *
   * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg, and direct stalling in
   * shrink_page_list() is used for throttling instead, which lacks all the
   * niceties such as fairness, adaptive pausing, bandwidth proportional
   * allocation and configurability.
   *
   * This function tests whether the vmscan currently in progress can assume
   * that the normal dirty throttling mechanism is operational.
   */
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
242
  static bool writeback_throttling_sane(struct scan_control *sc)
97c9341f7   Tejun Heo   mm: vmscan: disab...
243
  {
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
244
  	if (!cgroup_reclaim(sc))
97c9341f7   Tejun Heo   mm: vmscan: disab...
245
246
  		return true;
  #ifdef CONFIG_CGROUP_WRITEBACK
69234acee   Linus Torvalds   Merge branch 'for...
247
  	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
97c9341f7   Tejun Heo   mm: vmscan: disab...
248
249
250
251
  		return true;
  #endif
  	return false;
  }
91a45470f   KAMEZAWA Hiroyuki   per-zone and recl...
252
  #else
0a432dcbe   Yang Shi   mm: shrinker: mak...
253
254
255
256
257
258
259
260
  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
  {
  	return 0;
  }
  
  static void unregister_memcg_shrinker(struct shrinker *shrinker)
  {
  }
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
261
  static bool cgroup_reclaim(struct scan_control *sc)
89b5fae53   Johannes Weiner   mm: vmscan: disti...
262
  {
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
263
  	return false;
89b5fae53   Johannes Weiner   mm: vmscan: disti...
264
  }
97c9341f7   Tejun Heo   mm: vmscan: disab...
265

b5ead35e7   Johannes Weiner   mm: vmscan: namin...
266
  static bool writeback_throttling_sane(struct scan_control *sc)
97c9341f7   Tejun Heo   mm: vmscan: disab...
267
268
269
  {
  	return true;
  }
91a45470f   KAMEZAWA Hiroyuki   per-zone and recl...
270
  #endif
5a1c84b40   Mel Gorman   mm: remove reclai...
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
  /*
 * This misses isolated pages, which are not accounted for in order to keep
 * the counters cheap.
   * As the data only determines if reclaim or compaction continues, it is
   * not expected that isolated pages will be a dominating factor.
   */
  unsigned long zone_reclaimable_pages(struct zone *zone)
  {
  	unsigned long nr;
  
  	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
  		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
  	if (get_nr_swap_pages() > 0)
  		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
  			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
  
  	return nr;
  }
fd5388037   Michal Hocko   mm, vmscan: clean...
288
289
290
291
292
293
294
  /**
   * lruvec_lru_size -  Returns the number of pages on the given LRU list.
   * @lruvec: lru vector
   * @lru: lru to use
   * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
   */
  unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
c9f299d98   KOSAKI Motohiro   mm: add zone nr_p...
295
  {
de3b01506   Johannes Weiner   mm: vmscan: simpl...
296
  	unsigned long size = 0;
fd5388037   Michal Hocko   mm, vmscan: clean...
297
  	int zid;
de3b01506   Johannes Weiner   mm: vmscan: simpl...
298
  	for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
fd5388037   Michal Hocko   mm, vmscan: clean...
299
  		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
c9f299d98   KOSAKI Motohiro   mm: add zone nr_p...
300

fd5388037   Michal Hocko   mm, vmscan: clean...
301
302
303
304
  		if (!managed_zone(zone))
  			continue;
  
  		if (!mem_cgroup_disabled())
de3b01506   Johannes Weiner   mm: vmscan: simpl...
305
  			size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
fd5388037   Michal Hocko   mm, vmscan: clean...
306
  		else
de3b01506   Johannes Weiner   mm: vmscan: simpl...
307
  			size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
fd5388037   Michal Hocko   mm, vmscan: clean...
308
  	}
de3b01506   Johannes Weiner   mm: vmscan: simpl...
309
  	return size;
b4536f0c8   Michal Hocko   mm, memcg: fix th...
310
  }
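
For illustration (assuming a valid lruvec pointer): passing MAX_NR_ZONES sums the requested LRU over every zone of the node, while a lower zone index restricts the count to the zones usable by a correspondingly constrained allocation.

	/* whole-node count of inactive file pages ...                  */
	unsigned long nr_all = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE,
					       MAX_NR_ZONES);
	/* ... versus only the zones a ZONE_NORMAL request could use   */
	unsigned long nr_lowmem = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE,
						  ZONE_NORMAL);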
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
311
  /*
1d3d4437e   Glauber Costa   vmscan: per-node ...
312
   * Add a shrinker callback to be called from the vm.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
313
   */
8e04944f0   Tetsuo Handa   mm,vmscan: Allow ...
314
  int prealloc_shrinker(struct shrinker *shrinker)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
315
  {
b9726c26d   Alexey Dobriyan   numa: make "nr_no...
316
  	unsigned int size = sizeof(*shrinker->nr_deferred);
1d3d4437e   Glauber Costa   vmscan: per-node ...
317

1d3d4437e   Glauber Costa   vmscan: per-node ...
318
319
320
321
322
323
  	if (shrinker->flags & SHRINKER_NUMA_AWARE)
  		size *= nr_node_ids;
  
  	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
  	if (!shrinker->nr_deferred)
  		return -ENOMEM;
b4c2b231c   Kirill Tkhai   mm: assign id to ...
324
325
326
327
328
  
  	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
  		if (prealloc_memcg_shrinker(shrinker))
  			goto free_deferred;
  	}
8e04944f0   Tetsuo Handa   mm,vmscan: Allow ...
329
  	return 0;
b4c2b231c   Kirill Tkhai   mm: assign id to ...
330
331
332
333
334
  
  free_deferred:
  	kfree(shrinker->nr_deferred);
  	shrinker->nr_deferred = NULL;
  	return -ENOMEM;
8e04944f0   Tetsuo Handa   mm,vmscan: Allow ...
335
336
337
338
  }
  
  void free_prealloced_shrinker(struct shrinker *shrinker)
  {
b4c2b231c   Kirill Tkhai   mm: assign id to ...
339
340
341
342
343
  	if (!shrinker->nr_deferred)
  		return;
  
  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
  		unregister_memcg_shrinker(shrinker);
8e04944f0   Tetsuo Handa   mm,vmscan: Allow ...
344
345
346
  	kfree(shrinker->nr_deferred);
  	shrinker->nr_deferred = NULL;
  }
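
The prealloc/register split above exists for callers that have to allocate the shrinker before the structures it scans are fully initialised; a hedged sketch of the two-phase pattern (the foo object and its fields are illustrative):

	if (prealloc_shrinker(&foo->shrinker))		/* allocates nr_deferred (+ memcg id) */
		return -ENOMEM;
	/* ... set up the lists the shrinker will walk ... */
	register_shrinker_prepared(&foo->shrinker);	/* now visible to reclaim */

	/* and on an error path taken before registration: */
	free_prealloced_shrinker(&foo->shrinker);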
1d3d4437e   Glauber Costa   vmscan: per-node ...
347

8e04944f0   Tetsuo Handa   mm,vmscan: Allow ...
348
349
  void register_shrinker_prepared(struct shrinker *shrinker)
  {
8e1f936b7   Rusty Russell   mm: clean up and ...
350
351
  	down_write(&shrinker_rwsem);
  	list_add_tail(&shrinker->list, &shrinker_list);
42a9a53bb   Yang Shi   mm: vmscan: prote...
352
  #ifdef CONFIG_MEMCG
8df4a44cc   Kirill Tkhai   mm: check shrinke...
353
354
  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
  		idr_replace(&shrinker_idr, shrinker, shrinker->id);
7e010df53   Kirill Tkhai   mm: use special v...
355
  #endif
8e1f936b7   Rusty Russell   mm: clean up and ...
356
  	up_write(&shrinker_rwsem);
8e04944f0   Tetsuo Handa   mm,vmscan: Allow ...
357
358
359
360
361
362
363
364
365
  }
  
  int register_shrinker(struct shrinker *shrinker)
  {
  	int err = prealloc_shrinker(shrinker);
  
  	if (err)
  		return err;
  	register_shrinker_prepared(shrinker);
1d3d4437e   Glauber Costa   vmscan: per-node ...
366
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
367
  }
8e1f936b7   Rusty Russell   mm: clean up and ...
368
  EXPORT_SYMBOL(register_shrinker);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
369
370
371
372
  
  /*
   * Remove one
   */
8e1f936b7   Rusty Russell   mm: clean up and ...
373
  void unregister_shrinker(struct shrinker *shrinker)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
374
  {
bb422a738   Tetsuo Handa   mm,vmscan: Make u...
375
376
  	if (!shrinker->nr_deferred)
  		return;
b4c2b231c   Kirill Tkhai   mm: assign id to ...
377
378
  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
  		unregister_memcg_shrinker(shrinker);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
379
380
381
  	down_write(&shrinker_rwsem);
  	list_del(&shrinker->list);
  	up_write(&shrinker_rwsem);
ae3933216   Andrew Vagin   mm/vmscan.c: don'...
382
  	kfree(shrinker->nr_deferred);
bb422a738   Tetsuo Handa   mm,vmscan: Make u...
383
  	shrinker->nr_deferred = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
384
  }
8e1f936b7   Rusty Russell   mm: clean up and ...
385
  EXPORT_SYMBOL(unregister_shrinker);
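
For reference, a minimal and purely hypothetical user of this interface; the demo_* names and demo_lock are illustrative, only the struct shrinker fields, DEFAULT_SEEKS, SHRINK_STOP and the register/unregister calls come from the real API:

static DEFINE_SPINLOCK(demo_lock);

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
	return demo_nr_cached_objects();	/* 0 means "nothing to scan" */
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
	unsigned long freed;

	if (!spin_trylock(&demo_lock))
		return SHRINK_STOP;		/* can't make progress right now */
	freed = demo_free_objects(sc->nr_to_scan);
	spin_unlock(&demo_lock);
	return freed;				/* objects actually freed */
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count,
	.scan_objects	= demo_scan,
	.seeks		= DEFAULT_SEEKS,
};

/* module init:  register_shrinker(&demo_shrinker);   */
/* module exit:  unregister_shrinker(&demo_shrinker); */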
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
386
387
  
  #define SHRINK_BATCH 128
1d3d4437e   Glauber Costa   vmscan: per-node ...
388

cb731d6c6   Vladimir Davydov   vmscan: per memor...
389
  static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
9092c71bb   Josef Bacik   mm: use sc->prior...
390
  				    struct shrinker *shrinker, int priority)
1d3d4437e   Glauber Costa   vmscan: per-node ...
391
392
393
394
  {
  	unsigned long freed = 0;
  	unsigned long long delta;
  	long total_scan;
d5bc5fd3f   Vladimir Davydov   mm: vmscan: shrin...
395
  	long freeable;
1d3d4437e   Glauber Costa   vmscan: per-node ...
396
397
398
399
400
  	long nr;
  	long new_nr;
  	int nid = shrinkctl->nid;
  	long batch_size = shrinker->batch ? shrinker->batch
  					  : SHRINK_BATCH;
5f33a0803   Shaohua Li   mm/vmscan.c: set ...
401
  	long scanned = 0, next_deferred;
1d3d4437e   Glauber Costa   vmscan: per-node ...
402

ac7fb3ad2   Kirill Tkhai   mm/vmscan.c: move...
403
404
  	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
  		nid = 0;
d5bc5fd3f   Vladimir Davydov   mm: vmscan: shrin...
405
  	freeable = shrinker->count_objects(shrinker, shrinkctl);
9b996468c   Kirill Tkhai   mm: add SHRINK_EM...
406
407
  	if (freeable == 0 || freeable == SHRINK_EMPTY)
  		return freeable;
1d3d4437e   Glauber Costa   vmscan: per-node ...
408
409
410
411
412
413
414
415
416
  
  	/*
  	 * copy the current shrinker scan count into a local variable
  	 * and zero it so that other concurrent shrinker invocations
  	 * don't also do this scanning work.
  	 */
  	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
  
  	total_scan = nr;
4b85afbda   Johannes Weiner   mm: zero-seek shr...
417
418
419
420
421
422
423
424
425
426
427
428
  	if (shrinker->seeks) {
  		delta = freeable >> priority;
  		delta *= 4;
  		do_div(delta, shrinker->seeks);
  	} else {
  		/*
  		 * These objects don't require any IO to create. Trim
  		 * them aggressively under memory pressure to keep
  		 * them from causing refetches in the IO caches.
  		 */
  		delta = freeable / 2;
  	}
172b06c32   Roman Gushchin   mm: slowly shrink...
429

1d3d4437e   Glauber Costa   vmscan: per-node ...
430
431
  	total_scan += delta;
  	if (total_scan < 0) {
d75f773c8   Sakari Ailus   treewide: Switch ...
432
433
  		pr_err("shrink_slab: %pS negative objects to delete nr=%ld
  ",
a0b02131c   Dave Chinner   shrinker: Kill ol...
434
  		       shrinker->scan_objects, total_scan);
d5bc5fd3f   Vladimir Davydov   mm: vmscan: shrin...
435
  		total_scan = freeable;
5f33a0803   Shaohua Li   mm/vmscan.c: set ...
436
437
438
  		next_deferred = nr;
  	} else
  		next_deferred = total_scan;
1d3d4437e   Glauber Costa   vmscan: per-node ...
439
440
441
442
443
444
445
  
  	/*
  	 * We need to avoid excessive windup on filesystem shrinkers
  	 * due to large numbers of GFP_NOFS allocations causing the
  	 * shrinkers to return -1 all the time. This results in a large
  	 * nr being built up so when a shrink that can do some work
  	 * comes along it empties the entire cache due to nr >>>
d5bc5fd3f   Vladimir Davydov   mm: vmscan: shrin...
446
  	 * freeable. This is bad for sustaining a working set in
1d3d4437e   Glauber Costa   vmscan: per-node ...
447
448
449
450
451
  	 * memory.
  	 *
  	 * Hence only allow the shrinker to scan the entire cache when
  	 * a large delta change is calculated directly.
  	 */
d5bc5fd3f   Vladimir Davydov   mm: vmscan: shrin...
452
453
  	if (delta < freeable / 4)
  		total_scan = min(total_scan, freeable / 2);
1d3d4437e   Glauber Costa   vmscan: per-node ...
454
455
456
457
458
459
  
  	/*
  	 * Avoid risking looping forever due to too large nr value:
	 * never try to free more than twice the estimated number of
  	 * freeable entries.
  	 */
d5bc5fd3f   Vladimir Davydov   mm: vmscan: shrin...
460
461
  	if (total_scan > freeable * 2)
  		total_scan = freeable * 2;
1d3d4437e   Glauber Costa   vmscan: per-node ...
462
463
  
  	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
9092c71bb   Josef Bacik   mm: use sc->prior...
464
  				   freeable, delta, total_scan, priority);
1d3d4437e   Glauber Costa   vmscan: per-node ...
465

0b1fb40a3   Vladimir Davydov   mm: vmscan: shrin...
466
467
468
469
470
471
472
473
474
475
476
  	/*
  	 * Normally, we should not scan less than batch_size objects in one
  	 * pass to avoid too frequent shrinker calls, but if the slab has less
  	 * than batch_size objects in total and we are really tight on memory,
  	 * we will try to reclaim all available objects, otherwise we can end
  	 * up failing allocations although there are plenty of reclaimable
  	 * objects spread over several slabs with usage less than the
  	 * batch_size.
  	 *
  	 * We detect the "tight on memory" situations by looking at the total
  	 * number of objects we want to scan (total_scan). If it is greater
d5bc5fd3f   Vladimir Davydov   mm: vmscan: shrin...
477
  	 * than the total number of objects on slab (freeable), we must be
0b1fb40a3   Vladimir Davydov   mm: vmscan: shrin...
478
479
480
481
  	 * scanning at high prio and therefore should try to reclaim as much as
  	 * possible.
  	 */
  	while (total_scan >= batch_size ||
d5bc5fd3f   Vladimir Davydov   mm: vmscan: shrin...
482
  	       total_scan >= freeable) {
a0b02131c   Dave Chinner   shrinker: Kill ol...
483
  		unsigned long ret;
0b1fb40a3   Vladimir Davydov   mm: vmscan: shrin...
484
  		unsigned long nr_to_scan = min(batch_size, total_scan);
1d3d4437e   Glauber Costa   vmscan: per-node ...
485

0b1fb40a3   Vladimir Davydov   mm: vmscan: shrin...
486
  		shrinkctl->nr_to_scan = nr_to_scan;
d460acb5b   Chris Wilson   mm: track actual ...
487
  		shrinkctl->nr_scanned = nr_to_scan;
a0b02131c   Dave Chinner   shrinker: Kill ol...
488
489
490
491
  		ret = shrinker->scan_objects(shrinker, shrinkctl);
  		if (ret == SHRINK_STOP)
  			break;
  		freed += ret;
1d3d4437e   Glauber Costa   vmscan: per-node ...
492

d460acb5b   Chris Wilson   mm: track actual ...
493
494
495
  		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
  		total_scan -= shrinkctl->nr_scanned;
  		scanned += shrinkctl->nr_scanned;
1d3d4437e   Glauber Costa   vmscan: per-node ...
496
497
498
  
  		cond_resched();
  	}
5f33a0803   Shaohua Li   mm/vmscan.c: set ...
499
500
501
502
  	if (next_deferred >= scanned)
  		next_deferred -= scanned;
  	else
  		next_deferred = 0;
1d3d4437e   Glauber Costa   vmscan: per-node ...
503
504
505
506
507
  	/*
  	 * move the unused scan count back into the shrinker in a
  	 * manner that handles concurrent updates. If we exhausted the
  	 * scan, there is no need to do an update.
  	 */
5f33a0803   Shaohua Li   mm/vmscan.c: set ...
508
509
  	if (next_deferred > 0)
  		new_nr = atomic_long_add_return(next_deferred,
1d3d4437e   Glauber Costa   vmscan: per-node ...
510
511
512
  						&shrinker->nr_deferred[nid]);
  	else
  		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
df9024a8c   Dave Hansen   mm: shrinker: add...
513
  	trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
1d3d4437e   Glauber Costa   vmscan: per-node ...
514
  	return freed;
1495f230f   Ying Han   vmscan: change sh...
515
  }
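
To make the delta computation above concrete, a worked example (assuming shrinker->seeks == DEFAULT_SEEKS, i.e. 2, and the usual starting priority DEF_PRIORITY of 12, both defined outside this file):

	/*
	 *   delta  = freeable >> 12;		freeable / 4096
	 *   delta *= 4;			freeable / 1024
	 *   delta /= 2;			freeable / 2048
	 *
	 * i.e. roughly 0.05% of the freeable objects become the scan target
	 * per call at the lowest scan pressure, doubling every time the
	 * reclaim priority drops by one step.
	 */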
0a432dcbe   Yang Shi   mm: shrinker: mak...
516
  #ifdef CONFIG_MEMCG
b0dedc49a   Kirill Tkhai   mm/vmscan.c: iter...
517
518
519
520
  static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  			struct mem_cgroup *memcg, int priority)
  {
  	struct memcg_shrinker_map *map;
b8e57efa2   Kirill Tkhai   mm/vmscan.c: fix ...
521
522
  	unsigned long ret, freed = 0;
  	int i;
b0dedc49a   Kirill Tkhai   mm/vmscan.c: iter...
523

0a432dcbe   Yang Shi   mm: shrinker: mak...
524
  	if (!mem_cgroup_online(memcg))
b0dedc49a   Kirill Tkhai   mm/vmscan.c: iter...
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
  		return 0;
  
  	if (!down_read_trylock(&shrinker_rwsem))
  		return 0;
  
  	map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
  					true);
  	if (unlikely(!map))
  		goto unlock;
  
  	for_each_set_bit(i, map->map, shrinker_nr_max) {
  		struct shrink_control sc = {
  			.gfp_mask = gfp_mask,
  			.nid = nid,
  			.memcg = memcg,
  		};
  		struct shrinker *shrinker;
  
  		shrinker = idr_find(&shrinker_idr, i);
7e010df53   Kirill Tkhai   mm: use special v...
544
545
546
  		if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
  			if (!shrinker)
  				clear_bit(i, map->map);
b0dedc49a   Kirill Tkhai   mm/vmscan.c: iter...
547
548
  			continue;
  		}
0a432dcbe   Yang Shi   mm: shrinker: mak...
549
550
551
552
  		/* Call non-slab shrinkers even though kmem is disabled */
  		if (!memcg_kmem_enabled() &&
  		    !(shrinker->flags & SHRINKER_NONSLAB))
  			continue;
b0dedc49a   Kirill Tkhai   mm/vmscan.c: iter...
553
  		ret = do_shrink_slab(&sc, shrinker, priority);
f90280d6b   Kirill Tkhai   mm/vmscan.c: clea...
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
  		if (ret == SHRINK_EMPTY) {
  			clear_bit(i, map->map);
  			/*
  			 * After the shrinker reported that it had no objects to
  			 * free, but before we cleared the corresponding bit in
  			 * the memcg shrinker map, a new object might have been
  			 * added. To make sure, we have the bit set in this
  			 * case, we invoke the shrinker one more time and reset
  			 * the bit if it reports that it is not empty anymore.
  			 * The memory barrier here pairs with the barrier in
  			 * memcg_set_shrinker_bit():
  			 *
  			 * list_lru_add()     shrink_slab_memcg()
  			 *   list_add_tail()    clear_bit()
  			 *   <MB>               <MB>
  			 *   set_bit()          do_shrink_slab()
  			 */
  			smp_mb__after_atomic();
  			ret = do_shrink_slab(&sc, shrinker, priority);
  			if (ret == SHRINK_EMPTY)
  				ret = 0;
  			else
  				memcg_set_shrinker_bit(memcg, nid, i);
  		}
b0dedc49a   Kirill Tkhai   mm/vmscan.c: iter...
578
579
580
581
582
583
584
585
586
587
588
  		freed += ret;
  
  		if (rwsem_is_contended(&shrinker_rwsem)) {
  			freed = freed ? : 1;
  			break;
  		}
  	}
  unlock:
  	up_read(&shrinker_rwsem);
  	return freed;
  }
0a432dcbe   Yang Shi   mm: shrinker: mak...
589
  #else /* CONFIG_MEMCG */
b0dedc49a   Kirill Tkhai   mm/vmscan.c: iter...
590
591
592
593
594
  static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  			struct mem_cgroup *memcg, int priority)
  {
  	return 0;
  }
0a432dcbe   Yang Shi   mm: shrinker: mak...
595
  #endif /* CONFIG_MEMCG */
b0dedc49a   Kirill Tkhai   mm/vmscan.c: iter...
596

6b4f7799c   Johannes Weiner   mm: vmscan: invok...
597
  /**
cb731d6c6   Vladimir Davydov   vmscan: per memor...
598
   * shrink_slab - shrink slab caches
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
599
600
   * @gfp_mask: allocation context
   * @nid: node whose slab caches to target
cb731d6c6   Vladimir Davydov   vmscan: per memor...
601
   * @memcg: memory cgroup whose slab caches to target
9092c71bb   Josef Bacik   mm: use sc->prior...
602
   * @priority: the reclaim priority
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
603
   *
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
604
   * Call the shrink functions to age shrinkable caches.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
605
   *
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
606
607
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set;
 * unaware shrinkers will receive a node id of 0 instead.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
608
   *
aeed1d325   Vladimir Davydov   mm/vmscan.c: gene...
609
610
   * @memcg specifies the memory cgroup to target. Unaware shrinkers
   * are called only if it is the root cgroup.
cb731d6c6   Vladimir Davydov   vmscan: per memor...
611
   *
9092c71bb   Josef Bacik   mm: use sc->prior...
612
613
   * @priority is sc->priority, we take the number of objects and >> by priority
   * in order to get the scan target.
b15e0905f   Andrew Morton   [PATCH] vmscan: n...
614
   *
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
615
   * Returns the number of reclaimed slab objects.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
616
   */
cb731d6c6   Vladimir Davydov   vmscan: per memor...
617
618
  static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
  				 struct mem_cgroup *memcg,
9092c71bb   Josef Bacik   mm: use sc->prior...
619
  				 int priority)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
620
  {
b8e57efa2   Kirill Tkhai   mm/vmscan.c: fix ...
621
  	unsigned long ret, freed = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
622
  	struct shrinker *shrinker;
fa1e512fa   Yang Shi   mm: vmscan: check...
623
624
625
626
627
628
629
630
  	/*
  	 * The root memcg might be allocated even though memcg is disabled
  	 * via "cgroup_disable=memory" boot parameter.  This could make
  	 * mem_cgroup_is_root() return false, then just run memcg slab
  	 * shrink, but skip global shrink.  This may result in premature
  	 * oom.
  	 */
  	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
b0dedc49a   Kirill Tkhai   mm/vmscan.c: iter...
631
  		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
cb731d6c6   Vladimir Davydov   vmscan: per memor...
632

e830c63a6   Tetsuo Handa   mm,vmscan: don't ...
633
  	if (!down_read_trylock(&shrinker_rwsem))
f06590bd7   Minchan Kim   mm: vmscan: corre...
634
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
635
636
  
  	list_for_each_entry(shrinker, &shrinker_list, list) {
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
637
638
639
  		struct shrink_control sc = {
  			.gfp_mask = gfp_mask,
  			.nid = nid,
cb731d6c6   Vladimir Davydov   vmscan: per memor...
640
  			.memcg = memcg,
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
641
  		};
ec97097bc   Vladimir Davydov   mm: vmscan: call ...
642

9b996468c   Kirill Tkhai   mm: add SHRINK_EM...
643
644
645
646
  		ret = do_shrink_slab(&sc, shrinker, priority);
  		if (ret == SHRINK_EMPTY)
  			ret = 0;
  		freed += ret;
e496612c5   Minchan Kim   mm: do not stall ...
647
648
  		/*
		 * Bail out if someone wants to register a new shrinker to
55b65a57c   Ethon Paul   mm/vmsan: fix som...
649
  		 * prevent the registration from being stalled for long periods
e496612c5   Minchan Kim   mm: do not stall ...
650
651
652
653
654
655
  		 * by parallel ongoing shrinking.
  		 */
  		if (rwsem_is_contended(&shrinker_rwsem)) {
  			freed = freed ? : 1;
  			break;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
656
  	}
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
657

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
658
  	up_read(&shrinker_rwsem);
f06590bd7   Minchan Kim   mm: vmscan: corre...
659
660
  out:
  	cond_resched();
24f7c6b98   Dave Chinner   mm: new shrinker API
661
  	return freed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
662
  }
cb731d6c6   Vladimir Davydov   vmscan: per memor...
663
664
665
666
667
668
  void drop_slab_node(int nid)
  {
  	unsigned long freed;
  
  	do {
  		struct mem_cgroup *memcg = NULL;
069c411de   Chunxin Zang   mm/vmscan: fix in...
669
670
  		if (fatal_signal_pending(current))
  			return;
cb731d6c6   Vladimir Davydov   vmscan: per memor...
671
  		freed = 0;
aeed1d325   Vladimir Davydov   mm/vmscan.c: gene...
672
  		memcg = mem_cgroup_iter(NULL, NULL, NULL);
cb731d6c6   Vladimir Davydov   vmscan: per memor...
673
  		do {
9092c71bb   Josef Bacik   mm: use sc->prior...
674
  			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
cb731d6c6   Vladimir Davydov   vmscan: per memor...
675
676
677
678
679
680
681
682
683
684
685
  		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
  	} while (freed > 10);
  }
  
  void drop_slab(void)
  {
  	int nid;
  
  	for_each_online_node(nid)
  		drop_slab_node(nid);
  }
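
drop_slab() is also reachable from userspace through the vm.drop_caches sysctl; a simplified sketch of that wiring (the real handler lives in fs/drop_caches.c and is not part of this file):

	/* echo 2 > /proc/sys/vm/drop_caches   -> slab caches only      */
	/* echo 3 > /proc/sys/vm/drop_caches   -> page cache + slab     */
	if (sysctl_drop_caches & 2)
		drop_slab();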
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
686
687
  static inline int is_page_cache_freeable(struct page *page)
  {
ceddc3a52   Johannes Weiner   mm: document is_p...
688
689
  	/*
  	 * A freeable page cache page is referenced only by the caller
67891ffff   Matthew Wilcox   mm: Convert is_pa...
690
691
  	 * that isolated the page, the page cache and optional buffer
  	 * heads at page->private.
ceddc3a52   Johannes Weiner   mm: document is_p...
692
  	 */
3efe62e46   Matthew Wilcox (Oracle)   mm/vmscan: allow ...
693
  	int page_cache_pins = thp_nr_pages(page);
67891ffff   Matthew Wilcox   mm: Convert is_pa...
694
  	return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
695
  }
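
A worked case, derived directly from the test above:

	/*
	 * Order-0 page, no buffer heads, just isolated by the caller:
	 *   page_count()       == 2	(isolation ref + page cache ref)
	 *   page_has_private() == 0
	 *   thp_nr_pages()     == 1
	 *   2 - 0 == 1 + 1  ->  freeable
	 * Any additional reference (get_user_pages(), another pin, ...)
	 * makes the test fail and the page is kept.
	 */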
cb16556d9   Yang Shi   mm/vmscan.c: remo...
696
  static int may_write_to_inode(struct inode *inode)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
697
  {
930d91525   Christoph Lameter   [PATCH] Swap Migr...
698
  	if (current->flags & PF_SWAPWRITE)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
699
  		return 1;
703c27088   Tejun Heo   writeback: implem...
700
  	if (!inode_write_congested(inode))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
701
  		return 1;
703c27088   Tejun Heo   writeback: implem...
702
  	if (inode_to_bdi(inode) == current->backing_dev_info)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
  		return 1;
  	return 0;
  }
  
  /*
   * We detected a synchronous write error writing a page out.  Probably
   * -ENOSPC.  We need to propagate that into the address_space for a subsequent
   * fsync(), msync() or close().
   *
   * The tricky part is that after writepage we cannot touch the mapping: nothing
   * prevents it from being freed up.  But we have a ref on the page and once
   * that page is locked, the mapping is pinned.
   *
   * We're allowed to run sleeping lock_page() here because we know the caller has
   * __GFP_FS.
   */
  static void handle_write_error(struct address_space *mapping,
  				struct page *page, int error)
  {
7eaceacca   Jens Axboe   block: remove per...
722
  	lock_page(page);
3e9f45bd1   Guillaume Chazarain   Factor outstandin...
723
724
  	if (page_mapping(page) == mapping)
  		mapping_set_error(mapping, error);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
725
726
  	unlock_page(page);
  }
04e62a29b   Christoph Lameter   [PATCH] More page...
727
728
729
730
731
732
733
734
735
736
737
  /* possible outcome of pageout() */
  typedef enum {
  	/* failed to write page out, page is locked */
  	PAGE_KEEP,
  	/* move page to the active list, page is locked */
  	PAGE_ACTIVATE,
  	/* page has been sent to the disk successfully, page is unlocked */
  	PAGE_SUCCESS,
  	/* page is clean and locked */
  	PAGE_CLEAN,
  } pageout_t;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
738
  /*
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
739
740
   * pageout is called by shrink_page_list() for each dirty page.
   * Calls ->writepage().
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
741
   */
cb16556d9   Yang Shi   mm/vmscan.c: remo...
742
  static pageout_t pageout(struct page *page, struct address_space *mapping)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
743
744
745
746
747
748
749
750
  {
  	/*
  	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking, to prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
  	 * stalls if we need to run get_block().  We could test
  	 * PagePrivate for that.
  	 *
8174202b3   Al Viro   write_iter varian...
751
  	 * If this process is currently in __generic_file_write_iter() against
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
752
753
754
755
756
757
758
  	 * this page's queue, we can perform writeback even if that
  	 * will block.
  	 *
  	 * If the page is swapcache, write it back even if that would
  	 * block, for some throttling. This happens by accident, because
  	 * swap_backing_dev_info is bust: it doesn't reflect the
  	 * congestion state of the swapdevs.  Easy to fix, if needed.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
759
760
761
762
763
764
765
766
  	 */
  	if (!is_page_cache_freeable(page))
  		return PAGE_KEEP;
  	if (!mapping) {
  		/*
  		 * Some data journaling orphaned pages can have
  		 * page->mapping == NULL while being dirty with clean buffers.
  		 */
266cf658e   David Howells   FS-Cache: Recruit...
767
  		if (page_has_private(page)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
768
769
  			if (try_to_free_buffers(page)) {
  				ClearPageDirty(page);
b1de0d139   Mitchel Humpherys   mm: convert some ...
770
771
  				pr_info("%s: orphaned page
  ", __func__);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
772
773
774
775
776
777
778
  				return PAGE_CLEAN;
  			}
  		}
  		return PAGE_KEEP;
  	}
  	if (mapping->a_ops->writepage == NULL)
  		return PAGE_ACTIVATE;
cb16556d9   Yang Shi   mm/vmscan.c: remo...
779
  	if (!may_write_to_inode(mapping->host))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
780
781
782
783
784
785
786
  		return PAGE_KEEP;
  
  	if (clear_page_dirty_for_io(page)) {
  		int res;
  		struct writeback_control wbc = {
  			.sync_mode = WB_SYNC_NONE,
  			.nr_to_write = SWAP_CLUSTER_MAX,
111ebb6e6   OGAWA Hirofumi   [PATCH] writeback...
787
788
  			.range_start = 0,
  			.range_end = LLONG_MAX,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
789
790
791
792
793
794
795
  			.for_reclaim = 1,
  		};
  
  		SetPageReclaim(page);
  		res = mapping->a_ops->writepage(page, &wbc);
  		if (res < 0)
  			handle_write_error(mapping, page, res);
994fc28c7   Zach Brown   [PATCH] add AOP_T...
796
  		if (res == AOP_WRITEPAGE_ACTIVATE) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
797
798
799
  			ClearPageReclaim(page);
  			return PAGE_ACTIVATE;
  		}
c661b078f   Andy Whitcroft   synchronous lumpy...
800

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
801
802
803
804
  		if (!PageWriteback(page)) {
  			/* synchronous write or broken a_ops? */
  			ClearPageReclaim(page);
  		}
3aa238511   yalin wang   mm/vmscan.c: chan...
805
  		trace_mm_vmscan_writepage(page);
c4a25635b   Mel Gorman   mm: move vmscan w...
806
  		inc_node_page_state(page, NR_VMSCAN_WRITE);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
807
808
809
810
811
  		return PAGE_SUCCESS;
  	}
  
  	return PAGE_CLEAN;
  }
a649fd927   Andrew Morton   [PATCH] invalidat...
812
  /*
e286781d5   Nick Piggin   mm: speculative p...
813
814
   * Same as remove_mapping, but if the page is removed from the mapping, it
   * gets returned with a refcount of 0.
a649fd927   Andrew Morton   [PATCH] invalidat...
815
   */
a528910e1   Johannes Weiner   mm: thrash detect...
816
  static int __remove_mapping(struct address_space *mapping, struct page *page,
b910718a9   Johannes Weiner   mm: vmscan: detec...
817
  			    bool reclaimed, struct mem_cgroup *target_memcg)
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
818
  {
c4843a759   Greg Thelen   memcg: add per cg...
819
  	unsigned long flags;
bd4c82c22   Huang Ying   mm, THP, swap: de...
820
  	int refcount;
aae466b00   Joonsoo Kim   mm/swap: implemen...
821
  	void *shadow = NULL;
c4843a759   Greg Thelen   memcg: add per cg...
822

28e4d965e   Nick Piggin   [PATCH] mm: remov...
823
824
  	BUG_ON(!PageLocked(page));
  	BUG_ON(mapping != page_mapping(page));
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
825

b93b01631   Matthew Wilcox   page cache: use x...
826
  	xa_lock_irqsave(&mapping->i_pages, flags);
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
827
  	/*
0fd0e6b05   Nick Piggin   [PATCH] page inva...
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
  	 * The non racy check for a busy page.
  	 *
  	 * Must be careful with the order of the tests. When someone has
  	 * a ref to the page, it may be possible that they dirty it then
  	 * drop the reference. So if PageDirty is tested before page_count
  	 * here, then the following race may occur:
  	 *
  	 * get_user_pages(&page);
  	 * [user mapping goes away]
  	 * write_to(page);
  	 *				!PageDirty(page)    [good]
  	 * SetPageDirty(page);
  	 * put_page(page);
  	 *				!page_count(page)   [good, discard it]
  	 *
  	 * [oops, our write_to data is lost]
  	 *
  	 * Reversing the order of the tests ensures such a situation cannot
  	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
0139aa7b7   Joonsoo Kim   mm: rename _count...
847
  	 * load is not satisfied before that of page->_refcount.
0fd0e6b05   Nick Piggin   [PATCH] page inva...
848
849
  	 *
  	 * Note that if SetPageDirty is always performed via set_page_dirty,
b93b01631   Matthew Wilcox   page cache: use x...
850
  	 * and thus under the i_pages lock, then this ordering is not required.
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
851
  	 */
906d278d7   William Kucharski   mm/vmscan.c: supp...
852
  	refcount = 1 + compound_nr(page);
bd4c82c22   Huang Ying   mm, THP, swap: de...
853
  	if (!page_ref_freeze(page, refcount))
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
854
  		goto cannot_free;
1c4c3b99c   Jiang Biao   mm: fix page_free...
855
  	/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
e286781d5   Nick Piggin   mm: speculative p...
856
  	if (unlikely(PageDirty(page))) {
bd4c82c22   Huang Ying   mm, THP, swap: de...
857
  		page_ref_unfreeze(page, refcount);
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
858
  		goto cannot_free;
e286781d5   Nick Piggin   mm: speculative p...
859
  	}
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
860
861
862
  
  	if (PageSwapCache(page)) {
  		swp_entry_t swap = { .val = page_private(page) };
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
863
  		mem_cgroup_swapout(page, swap);
aae466b00   Joonsoo Kim   mm/swap: implemen...
864
865
866
  		if (reclaimed && !mapping_exiting(mapping))
  			shadow = workingset_eviction(page, target_memcg);
  		__delete_from_swap_cache(page, swap, shadow);
b93b01631   Matthew Wilcox   page cache: use x...
867
  		xa_unlock_irqrestore(&mapping->i_pages, flags);
75f6d6d29   Minchan Kim   mm, THP, swap: un...
868
  		put_swap_page(page, swap);
e286781d5   Nick Piggin   mm: speculative p...
869
  	} else {
6072d13c4   Linus Torvalds   Call the filesyst...
870
871
872
  		void (*freepage)(struct page *);
  
  		freepage = mapping->a_ops->freepage;
a528910e1   Johannes Weiner   mm: thrash detect...
873
874
875
876
877
  		/*
  		 * Remember a shadow entry for reclaimed file cache in
  		 * order to detect refaults, thus thrashing, later on.
  		 *
  		 * But don't store shadows in an address space that is
238c30468   dylan-meiners   mm/vmscan.c: fix ...
878
  		 * already exiting.  This is not just an optimization,
a528910e1   Johannes Weiner   mm: thrash detect...
879
880
881
  		 * inode reclaim needs to empty out the radix tree or
  		 * the nodes are lost.  Don't plant shadows behind its
  		 * back.
f9fe48bec   Ross Zwisler   dax: support dirt...
882
883
884
885
886
  		 *
  		 * We also don't store shadows for DAX mappings because the
  		 * only page cache pages found in these are zero pages
  		 * covering holes, and because we don't want to mix DAX
  		 * exceptional entries and shadow exceptional entries in the
b93b01631   Matthew Wilcox   page cache: use x...
887
  		 * same address_space.
a528910e1   Johannes Weiner   mm: thrash detect...
888
  		 */
9de4f22a6   Huang Ying   mm: code cleanup ...
889
  		if (reclaimed && page_is_file_lru(page) &&
f9fe48bec   Ross Zwisler   dax: support dirt...
890
  		    !mapping_exiting(mapping) && !dax_mapping(mapping))
b910718a9   Johannes Weiner   mm: vmscan: detec...
891
  			shadow = workingset_eviction(page, target_memcg);
62cccb8c8   Johannes Weiner   mm: simplify lock...
892
  		__delete_from_page_cache(page, shadow);
b93b01631   Matthew Wilcox   page cache: use x...
893
  		xa_unlock_irqrestore(&mapping->i_pages, flags);
6072d13c4   Linus Torvalds   Call the filesyst...
894
895
896
  
  		if (freepage != NULL)
  			freepage(page);
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
897
  	}
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
898
899
900
  	return 1;
  
  cannot_free:
b93b01631   Matthew Wilcox   page cache: use x...
901
  	xa_unlock_irqrestore(&mapping->i_pages, flags);
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
902
903
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
904
  /*
e286781d5   Nick Piggin   mm: speculative p...
905
906
907
908
909
910
911
   * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
   * someone else has a ref on the page, abort and return 0.  If it was
   * successfully detached, return 1.  Assumes the caller has a single ref on
   * this page.
   */
  int remove_mapping(struct address_space *mapping, struct page *page)
  {
b910718a9   Johannes Weiner   mm: vmscan: detec...
912
  	if (__remove_mapping(mapping, page, false, NULL)) {
e286781d5   Nick Piggin   mm: speculative p...
913
914
915
916
917
  		/*
  		 * Unfreezing the refcount with 1 rather than 2 effectively
  		 * drops the pagecache ref for us without requiring another
  		 * atomic operation.
  		 */
fe896d187   Joonsoo Kim   mm: introduce pag...
918
  		page_ref_unfreeze(page, 1);
e286781d5   Nick Piggin   mm: speculative p...
919
920
921
922
  		return 1;
  	}
  	return 0;
  }
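
A hedged sketch of the caller contract (variable handling illustrative): the caller must hold the page lock and exactly one reference of its own; on success the page has been detached and that single reference is the only one left.

	/* Illustrative caller: */
	lock_page(page);			/* caller already holds one ref */
	mapping = page_mapping(page);
	if (mapping && remove_mapping(mapping, page)) {
		/* no longer in the page/swap cache; our ref is the last one */
	}
	unlock_page(page);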
894bc3104   Lee Schermerhorn   Unevictable LRU I...
923
924
925
926
927
928
929
930
931
  /**
   * putback_lru_page - put previously isolated page onto appropriate LRU list
   * @page: page to be put back to appropriate lru list
   *
   * Add previously isolated @page to appropriate LRU list.
   * Page may still be unevictable for other reasons.
   *
   * lru_lock must not be held, interrupts must be enabled.
   */
894bc3104   Lee Schermerhorn   Unevictable LRU I...
932
933
  void putback_lru_page(struct page *page)
  {
9c4e6b1a7   Shakeel Butt   mm, mlock, vmscan...
934
  	lru_cache_add(page);
894bc3104   Lee Schermerhorn   Unevictable LRU I...
935
936
  	put_page(page);		/* drop ref from isolate */
  }
dfc8d636c   Johannes Weiner   vmscan: factor ou...
937
938
939
  enum page_references {
  	PAGEREF_RECLAIM,
  	PAGEREF_RECLAIM_CLEAN,
645747462   Johannes Weiner   vmscan: detect ma...
940
  	PAGEREF_KEEP,
dfc8d636c   Johannes Weiner   vmscan: factor ou...
941
942
943
944
945
946
  	PAGEREF_ACTIVATE,
  };
  
  static enum page_references page_check_references(struct page *page,
  						  struct scan_control *sc)
  {
645747462   Johannes Weiner   vmscan: detect ma...
947
  	int referenced_ptes, referenced_page;
dfc8d636c   Johannes Weiner   vmscan: factor ou...
948
  	unsigned long vm_flags;
dfc8d636c   Johannes Weiner   vmscan: factor ou...
949

c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
950
951
  	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
  					  &vm_flags);
645747462   Johannes Weiner   vmscan: detect ma...
952
  	referenced_page = TestClearPageReferenced(page);
dfc8d636c   Johannes Weiner   vmscan: factor ou...
953

dfc8d636c   Johannes Weiner   vmscan: factor ou...
954
955
956
957
958
959
  	/*
  	 * Mlock lost the isolation race with us.  Let try_to_unmap()
  	 * move the page to the unevictable list.
  	 */
  	if (vm_flags & VM_LOCKED)
  		return PAGEREF_RECLAIM;
645747462   Johannes Weiner   vmscan: detect ma...
960
  	if (referenced_ptes) {
645747462   Johannes Weiner   vmscan: detect ma...
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
  		/*
  		 * All mapped pages start out with page table
  		 * references from the instantiating fault, so we need
  		 * to look twice if a mapped file page is used more
  		 * than once.
  		 *
  		 * Mark it and spare it for another trip around the
  		 * inactive list.  Another page table reference will
  		 * lead to its activation.
  		 *
  		 * Note: the mark is set for activated pages as well
  		 * so that recently deactivated but used pages are
  		 * quickly recovered.
  		 */
  		SetPageReferenced(page);
34dbc67a6   Konstantin Khlebnikov   vmscan: promote s...
976
  		if (referenced_page || referenced_ptes > 1)
645747462   Johannes Weiner   vmscan: detect ma...
977
  			return PAGEREF_ACTIVATE;
c909e9936   Konstantin Khlebnikov   vmscan: activate ...
978
979
980
  		/*
  		 * Activate file-backed executable pages after first usage.
  		 */
b518154e5   Joonsoo Kim   mm/vmscan: protec...
981
  		if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
c909e9936   Konstantin Khlebnikov   vmscan: activate ...
982
  			return PAGEREF_ACTIVATE;
645747462   Johannes Weiner   vmscan: detect ma...
983
984
  		return PAGEREF_KEEP;
  	}
dfc8d636c   Johannes Weiner   vmscan: factor ou...
985
986
  
  	/* Reclaim if clean, defer dirty pages to writeback */
2e30244a7   KOSAKI Motohiro   vmscan,tmpfs: tre...
987
  	if (referenced_page && !PageSwapBacked(page))
645747462   Johannes Weiner   vmscan: detect ma...
988
989
990
  		return PAGEREF_RECLAIM_CLEAN;
  
  	return PAGEREF_RECLAIM;
dfc8d636c   Johannes Weiner   vmscan: factor ou...
991
  }
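
Summarising the decision above, as derived from the code:

	/*
	 * VM_LOCKED vma found                      -> PAGEREF_RECLAIM
	 *   (try_to_unmap() will park it on the unevictable list)
	 * referenced ptes > 1, or pte + PG_referenced
	 *                                          -> PAGEREF_ACTIVATE
	 * referenced pte on an executable file page (VM_EXEC, !PageSwapBacked)
	 *                                          -> PAGEREF_ACTIVATE
	 * any other referenced pte                 -> PAGEREF_KEEP
	 *   (PG_referenced is set for the next trip around)
	 * no referenced ptes, PG_referenced set, !PageSwapBacked
	 *                                          -> PAGEREF_RECLAIM_CLEAN
	 * everything else                          -> PAGEREF_RECLAIM
	 */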
e2be15f6c   Mel Gorman   mm: vmscan: stall...
992
993
994
995
  /* Check if a page is dirty or under writeback */
  static void page_check_dirty_writeback(struct page *page,
  				       bool *dirty, bool *writeback)
  {
b45972265   Mel Gorman   mm: vmscan: take ...
996
  	struct address_space *mapping;
e2be15f6c   Mel Gorman   mm: vmscan: stall...
997
998
999
1000
  	/*
  	 * Anonymous pages are not handled by flushers and must be written
  	 * from reclaim context. Do not stall reclaim based on them
  	 */
9de4f22a6   Huang Ying   mm: code cleanup ...
1001
  	if (!page_is_file_lru(page) ||
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1002
  	    (PageAnon(page) && !PageSwapBacked(page))) {
e2be15f6c   Mel Gorman   mm: vmscan: stall...
1003
1004
1005
1006
1007
1008
1009
1010
  		*dirty = false;
  		*writeback = false;
  		return;
  	}
  
  	/* By default assume that the page flags are accurate */
  	*dirty = PageDirty(page);
  	*writeback = PageWriteback(page);
b45972265   Mel Gorman   mm: vmscan: take ...
1011
1012
1013
1014
1015
1016
1017
1018
  
  	/* Verify dirty/writeback state if the filesystem supports it */
  	if (!page_has_private(page))
  		return;
  
  	mapping = page_mapping(page);
  	if (mapping && mapping->a_ops->is_dirty_writeback)
  		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
e2be15f6c   Mel Gorman   mm: vmscan: stall...
1019
  }
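
Filesystems that track dirty and writeback state away from the page flags can supply the optional hook called above; a hypothetical implementation, with only the signature taken from the call site and everything else (demo_page_private and its fields) illustrative:

static void demo_is_dirty_writeback(struct page *page,
				    bool *dirty, bool *writeback)
{
	struct demo_page_private *priv = (void *)page_private(page);

	/* report the filesystem's own bookkeeping, not the possibly
	 * stale PageDirty/PageWriteback bits */
	*dirty = priv && priv->pending_write;
	*writeback = priv && priv->io_in_flight;
}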
e286781d5   Nick Piggin   mm: speculative p...
1020
  /*
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
1021
   * shrink_page_list() returns the number of reclaimed pages
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1022
   */
730ec8c01   Maninder Singh   mm/vmscan.c: chan...
1023
1024
1025
  static unsigned int shrink_page_list(struct list_head *page_list,
  				     struct pglist_data *pgdat,
  				     struct scan_control *sc,
730ec8c01   Maninder Singh   mm/vmscan.c: chan...
1026
1027
  				     struct reclaim_stat *stat,
  				     bool ignore_references)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1028
1029
  {
  	LIST_HEAD(ret_pages);
abe4c3b50   Mel Gorman   vmscan: set up pa...
1030
  	LIST_HEAD(free_pages);
730ec8c01   Maninder Singh   mm/vmscan.c: chan...
1031
1032
  	unsigned int nr_reclaimed = 0;
  	unsigned int pgactivate = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1033

060f005f0   Kirill Tkhai   mm/vmscan.c: do n...
1034
  	memset(stat, 0, sizeof(*stat));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1035
  	cond_resched();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1036
1037
1038
  	while (!list_empty(page_list)) {
  		struct address_space *mapping;
  		struct page *page;
8940b34a4   Minchan Kim   mm: change PAGERE...
1039
  		enum page_references references = PAGEREF_RECLAIM;
4b7930626   Kirill Tkhai   mm/vmscan.c: make...
1040
  		bool dirty, writeback, may_enter_fs;
98879b3b9   Yang Shi   mm: vmscan: corre...
1041
  		unsigned int nr_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1042
1043
1044
1045
1046
  
  		cond_resched();
  
  		page = lru_to_page(page_list);
  		list_del(&page->lru);
529ae9aaa   Nick Piggin   mm: rename page t...
1047
  		if (!trylock_page(page))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1048
  			goto keep;
309381fea   Sasha Levin   mm: dump page whe...
1049
  		VM_BUG_ON_PAGE(PageActive(page), page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1050

d8c6546b1   Matthew Wilcox (Oracle)   mm: introduce com...
1051
  		nr_pages = compound_nr(page);
98879b3b9   Yang Shi   mm: vmscan: corre...
1052
1053
1054
  
  		/* Account the number of base pages even though THP */
  		sc->nr_scanned += nr_pages;
80e434260   Christoph Lameter   [PATCH] zone recl...
1055

39b5f29ac   Hugh Dickins   mm: remove vma ar...
1056
  		if (unlikely(!page_evictable(page)))
ad6b67041   Minchan Kim   mm: remove SWAP_M...
1057
  			goto activate_locked;
894bc3104   Lee Schermerhorn   Unevictable LRU I...
1058

a6dc60f89   Johannes Weiner   vmscan: rename sc...
1059
  		if (!sc->may_unmap && page_mapped(page))
80e434260   Christoph Lameter   [PATCH] zone recl...
1060
  			goto keep_locked;
c661b078f   Andy Whitcroft   synchronous lumpy...
1061
1062
  		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
  			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
283aba9f9   Mel Gorman   mm: vmscan: block...
1063
  		/*
894befec4   Andrey Ryabinin   mm/vmscan: update...
1064
  		 * The number of dirty pages determines if a node is marked
e2be15f6c   Mel Gorman   mm: vmscan: stall...
1065
1066
1067
1068
1069
1070
  		 * reclaim_congested which affects wait_iff_congested. kswapd
  		 * will stall and start writing pages if the tail of the LRU
  		 * is all dirty unqueued pages.
  		 */
  		page_check_dirty_writeback(page, &dirty, &writeback);
  		if (dirty || writeback)
060f005f0   Kirill Tkhai   mm/vmscan.c: do n...
1071
  			stat->nr_dirty++;
e2be15f6c   Mel Gorman   mm: vmscan: stall...
1072
1073
  
  		if (dirty && !writeback)
060f005f0   Kirill Tkhai   mm/vmscan.c: do n...
1074
  			stat->nr_unqueued_dirty++;
e2be15f6c   Mel Gorman   mm: vmscan: stall...
1075

d04e8acd0   Mel Gorman   mm: vmscan: treat...
1076
1077
1078
1079
1080
1081
  		/*
  		 * Treat this page as congested if the underlying BDI is or if
  		 * pages are cycling through the LRU so quickly that the
  		 * pages marked for immediate reclaim are making it to the
  		 * end of the LRU a second time.
  		 */
e2be15f6c   Mel Gorman   mm: vmscan: stall...
1082
  		mapping = page_mapping(page);
1da58ee2a   Jamie Liu   mm: vmscan: count...
1083
  		if (((dirty || writeback) && mapping &&
703c27088   Tejun Heo   writeback: implem...
1084
  		     inode_write_congested(mapping->host)) ||
d04e8acd0   Mel Gorman   mm: vmscan: treat...
1085
  		    (writeback && PageReclaim(page)))
060f005f0   Kirill Tkhai   mm/vmscan.c: do n...
1086
  			stat->nr_congested++;
e2be15f6c   Mel Gorman   mm: vmscan: stall...
1087
1088
  
  		/*
283aba9f9   Mel Gorman   mm: vmscan: block...
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
  		 * If a page at the tail of the LRU is under writeback, there
  		 * are three cases to consider.
  		 *
  		 * 1) If reclaim is encountering an excessive number of pages
  		 *    under writeback and this page is both under writeback and
  		 *    PageReclaim then it indicates that pages are being queued
  		 *    for IO but are being recycled through the LRU before the
  		 *    IO can complete. Waiting on the page itself risks an
  		 *    indefinite stall if it is impossible to write back the
  		 *    page due to an IO error or disconnected storage, so instead
b1a6f21e3   Mel Gorman   mm: vmscan: stall...
1099
1100
  		 *    note that the LRU is being scanned too quickly and the
  		 *    caller can stall after the page list has been processed.
283aba9f9   Mel Gorman   mm: vmscan: block...
1101
  		 *
97c9341f7   Tejun Heo   mm: vmscan: disab...
1102
  		 * 2) Global or new memcg reclaim encounters a page that is
ecf5fc6e9   Michal Hocko   mm, vmscan: Do no...
1103
1104
1105
  		 *    not marked for immediate reclaim, or the caller does not
  		 *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
  		 *    not to fs). In this case mark the page for immediate
97c9341f7   Tejun Heo   mm: vmscan: disab...
1106
  		 *    reclaim and continue scanning.
283aba9f9   Mel Gorman   mm: vmscan: block...
1107
  		 *
ecf5fc6e9   Michal Hocko   mm, vmscan: Do no...
1108
1109
  		 *    Require may_enter_fs because we would wait on fs, which
  		 *    may not have submitted IO yet. And the loop driver might
283aba9f9   Mel Gorman   mm: vmscan: block...
1110
1111
1112
1113
1114
  		 *    enter reclaim, and deadlock if it waits on a page for
  		 *    which it is needed to do the write (loop masks off
  		 *    __GFP_IO|__GFP_FS for this reason); but more thought
  		 *    would probably show more reasons.
  		 *
7fadc8202   Hugh Dickins   mm, vmscan: unloc...
1115
  		 * 3) Legacy memcg encounters a page that is already marked
283aba9f9   Mel Gorman   mm: vmscan: block...
1116
1117
1118
1119
  		 *    PageReclaim. memcg does not have any dirty page
  		 *    throttling, so we could easily OOM just because too many
  		 *    pages are in writeback and there is nothing else to
  		 *    reclaim. Wait for the writeback to complete.
c55e8d035   Johannes Weiner   mm: vmscan: move ...
1120
1121
1122
1123
1124
1125
1126
1127
1128
  		 *
  		 * In cases 1) and 2) we activate the pages to get them out of
  		 * the way while we continue scanning for clean pages on the
  		 * inactive list and refilling from the active list. The
  		 * observation here is that waiting for disk writes is more
  		 * expensive than potentially causing reloads down the line.
  		 * Since they're marked for immediate reclaim, they won't put
  		 * memory pressure on the cache working set any longer than it
  		 * takes to write them to disk.
283aba9f9   Mel Gorman   mm: vmscan: block...
1129
  		 */
c661b078f   Andy Whitcroft   synchronous lumpy...
1130
  		if (PageWriteback(page)) {
283aba9f9   Mel Gorman   mm: vmscan: block...
1131
1132
1133
  			/* Case 1 above */
  			if (current_is_kswapd() &&
  			    PageReclaim(page) &&
599d0c954   Mel Gorman   mm, vmscan: move ...
1134
  			    test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
060f005f0   Kirill Tkhai   mm/vmscan.c: do n...
1135
  				stat->nr_immediate++;
c55e8d035   Johannes Weiner   mm: vmscan: move ...
1136
  				goto activate_locked;
283aba9f9   Mel Gorman   mm: vmscan: block...
1137
1138
  
  			/* Case 2 above */
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
1139
  			} else if (writeback_throttling_sane(sc) ||
ecf5fc6e9   Michal Hocko   mm, vmscan: Do no...
1140
  			    !PageReclaim(page) || !may_enter_fs) {
c3b94f44f   Hugh Dickins   memcg: further pr...
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
  				/*
  				 * This is slightly racy - end_page_writeback()
  				 * might have just cleared PageReclaim, then
  				 * setting PageReclaim here ends up being interpreted
  				 * as PageReadahead - but that does not matter
  				 * enough to care.  What we do want is for this
  				 * page to have PageReclaim set next time memcg
  				 * reclaim reaches the tests above, so it will
  				 * then wait_on_page_writeback() to avoid OOM;
  				 * and it's also appropriate in global reclaim.
  				 */
  				SetPageReclaim(page);
060f005f0   Kirill Tkhai   mm/vmscan.c: do n...
1153
  				stat->nr_writeback++;
c55e8d035   Johannes Weiner   mm: vmscan: move ...
1154
  				goto activate_locked;
283aba9f9   Mel Gorman   mm: vmscan: block...
1155
1156
1157
  
  			/* Case 3 above */
  			} else {
7fadc8202   Hugh Dickins   mm, vmscan: unloc...
1158
  				unlock_page(page);
283aba9f9   Mel Gorman   mm: vmscan: block...
1159
  				wait_on_page_writeback(page);
7fadc8202   Hugh Dickins   mm, vmscan: unloc...
1160
1161
1162
  				/* then go back and try same page again */
  				list_add_tail(&page->lru, page_list);
  				continue;
e62e384e9   Michal Hocko   memcg: prevent OO...
1163
  			}
c661b078f   Andy Whitcroft   synchronous lumpy...
1164
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1165

8940b34a4   Minchan Kim   mm: change PAGERE...
1166
  		if (!ignore_references)
02c6de8d7   Minchan Kim   mm: cma: discard ...
1167
  			references = page_check_references(page, sc);
dfc8d636c   Johannes Weiner   vmscan: factor ou...
1168
1169
  		switch (references) {
  		case PAGEREF_ACTIVATE:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1170
  			goto activate_locked;
645747462   Johannes Weiner   vmscan: detect ma...
1171
  		case PAGEREF_KEEP:
98879b3b9   Yang Shi   mm: vmscan: corre...
1172
  			stat->nr_ref_keep += nr_pages;
645747462   Johannes Weiner   vmscan: detect ma...
1173
  			goto keep_locked;
dfc8d636c   Johannes Weiner   vmscan: factor ou...
1174
1175
1176
1177
  		case PAGEREF_RECLAIM:
  		case PAGEREF_RECLAIM_CLEAN:
  			; /* try to reclaim the page below */
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1178

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1179
1180
1181
  		/*
  		 * Anonymous process memory has backing store?
  		 * Try to allocate it some swap space here.
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1182
  		 * A lazyfree page could be freed directly
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1183
  		 */
bd4c82c22   Huang Ying   mm, THP, swap: de...
1184
1185
1186
1187
  		if (PageAnon(page) && PageSwapBacked(page)) {
  			if (!PageSwapCache(page)) {
  				if (!(sc->gfp_mask & __GFP_IO))
  					goto keep_locked;
72c5ce894   Linus Torvalds   mm: don't put pin...
1188
1189
  				if (page_maybe_dma_pinned(page))
  					goto keep_locked;
bd4c82c22   Huang Ying   mm, THP, swap: de...
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
  				if (PageTransHuge(page)) {
  					/* cannot split THP, skip it */
  					if (!can_split_huge_page(page, NULL))
  						goto activate_locked;
  					/*
  					 * Split pages without a PMD map right
  					 * away. Chances are some or all of the
  					 * tail pages can be freed without IO.
  					 */
  					if (!compound_mapcount(page) &&
  					    split_huge_page_to_list(page,
  								    page_list))
  						goto activate_locked;
  				}
  				if (!add_to_swap(page)) {
  					if (!PageTransHuge(page))
98879b3b9   Yang Shi   mm: vmscan: corre...
1206
  						goto activate_locked_split;
bd4c82c22   Huang Ying   mm, THP, swap: de...
1207
1208
1209
1210
  					/* Fallback to swap normal pages */
  					if (split_huge_page_to_list(page,
  								    page_list))
  						goto activate_locked;
fe490cc0f   Huang Ying   mm, THP, swap: ad...
1211
1212
1213
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  					count_vm_event(THP_SWPOUT_FALLBACK);
  #endif
bd4c82c22   Huang Ying   mm, THP, swap: de...
1214
  					if (!add_to_swap(page))
98879b3b9   Yang Shi   mm: vmscan: corre...
1215
  						goto activate_locked_split;
bd4c82c22   Huang Ying   mm, THP, swap: de...
1216
  				}
0f0746589   Minchan Kim   mm, THP, swap: mo...
1217

4b7930626   Kirill Tkhai   mm/vmscan.c: make...
1218
  				may_enter_fs = true;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1219

bd4c82c22   Huang Ying   mm, THP, swap: de...
1220
1221
1222
  				/* Adding to swap updated mapping */
  				mapping = page_mapping(page);
  			}
7751b2da6   Kirill A. Shutemov   vmscan: split fil...
1223
1224
1225
1226
  		} else if (unlikely(PageTransHuge(page))) {
  			/* Split file THP */
  			if (split_huge_page_to_list(page, page_list))
  				goto keep_locked;
e2be15f6c   Mel Gorman   mm: vmscan: stall...
1227
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1228
1229
  
  		/*
98879b3b9   Yang Shi   mm: vmscan: corre...
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
  		 * THP may get split above, so we need to subtract the tail
  		 * pages and update nr_pages to avoid accounting tail pages
  		 * twice.
  		 *
  		 * The tail pages that were successfully added to the swap
  		 * cache reach here.
  		 */
  		if ((nr_pages > 1) && !PageTransHuge(page)) {
  			sc->nr_scanned -= (nr_pages - 1);
  			nr_pages = 1;
  		}
  
  		/*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1242
1243
1244
  		 * The page is mapped into the page tables of one or more
  		 * processes. Try to unmap it here.
  		 */
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1245
  		if (page_mapped(page)) {
dd156e3fc   Shakeel Butt   mm/rmap: always d...
1246
  			enum ttu_flags flags = TTU_BATCH_FLUSH;
1f318a9b0   Jaewon Kim   mm/vmscan: count ...
1247
  			bool was_swapbacked = PageSwapBacked(page);
bd4c82c22   Huang Ying   mm, THP, swap: de...
1248
1249
1250
  
  			if (unlikely(PageTransHuge(page)))
  				flags |= TTU_SPLIT_HUGE_PMD;
1f318a9b0   Jaewon Kim   mm/vmscan: count ...
1251

bd4c82c22   Huang Ying   mm, THP, swap: de...
1252
  			if (!try_to_unmap(page, flags)) {
98879b3b9   Yang Shi   mm: vmscan: corre...
1253
  				stat->nr_unmap_fail += nr_pages;
1f318a9b0   Jaewon Kim   mm/vmscan: count ...
1254
1255
  				if (!was_swapbacked && PageSwapBacked(page))
  					stat->nr_lazyfree_fail += nr_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1256
  				goto activate_locked;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1257
1258
1259
1260
  			}
  		}
  
  		if (PageDirty(page)) {
ee72886d8   Mel Gorman   mm: vmscan: do no...
1261
  			/*
4eda48235   Johannes Weiner   mm: vmscan: only ...
1262
1263
1264
1265
1266
1267
1268
1269
  			 * Only kswapd can write back filesystem pages
  			 * to avoid the risk of stack overflow. But avoid
  			 * injecting inefficient single-page IO into
  			 * flusher writeback as much as possible: only
  			 * write pages when we've encountered many
  			 * dirty pages, and when we've already scanned
  			 * the rest of the LRU for clean pages and see
  			 * the same dirty pages again (PageReclaim).
ee72886d8   Mel Gorman   mm: vmscan: do no...
1270
  			 */
9de4f22a6   Huang Ying   mm: code cleanup ...
1271
  			if (page_is_file_lru(page) &&
4eda48235   Johannes Weiner   mm: vmscan: only ...
1272
1273
  			    (!current_is_kswapd() || !PageReclaim(page) ||
  			     !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
49ea7eb65   Mel Gorman   mm: vmscan: immed...
1274
1275
1276
1277
1278
1279
  				/*
  				 * Immediately reclaim when written back.
  				 * Similar in principle to deactivate_page(),
  				 * except we already have the page isolated
  				 * and know it's dirty.
  				 */
c4a25635b   Mel Gorman   mm: move vmscan w...
1280
  				inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
49ea7eb65   Mel Gorman   mm: vmscan: immed...
1281
  				SetPageReclaim(page);
c55e8d035   Johannes Weiner   mm: vmscan: move ...
1282
  				goto activate_locked;
ee72886d8   Mel Gorman   mm: vmscan: do no...
1283
  			}
dfc8d636c   Johannes Weiner   vmscan: factor ou...
1284
  			if (references == PAGEREF_RECLAIM_CLEAN)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1285
  				goto keep_locked;
4dd4b9202   Andrew Morton   revert "kswapd sh...
1286
  			if (!may_enter_fs)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1287
  				goto keep_locked;
52a8363ea   Christoph Lameter   [PATCH] mm: impro...
1288
  			if (!sc->may_writepage)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1289
  				goto keep_locked;
d950c9477   Mel Gorman   mm: defer flush o...
1290
1291
1292
1293
1294
1295
  			/*
  			 * Page is dirty. Flush the TLB if a writable entry
  			 * potentially exists to avoid CPU writes after IO
  			 * starts and then write it out here.
  			 */
  			try_to_unmap_flush_dirty();
cb16556d9   Yang Shi   mm/vmscan.c: remo...
1296
  			switch (pageout(page, mapping)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1297
1298
1299
1300
1301
  			case PAGE_KEEP:
  				goto keep_locked;
  			case PAGE_ACTIVATE:
  				goto activate_locked;
  			case PAGE_SUCCESS:
6c357848b   Matthew Wilcox (Oracle)   mm: replace hpage...
1302
  				stat->nr_pageout += thp_nr_pages(page);
96f8bf4fb   Johannes Weiner   mm: vmscan: recla...
1303

7d3579e8e   KOSAKI Motohiro   vmscan: narrow th...
1304
  				if (PageWriteback(page))
41ac1999c   Mel Gorman   mm: vmscan: do no...
1305
  					goto keep;
7d3579e8e   KOSAKI Motohiro   vmscan: narrow th...
1306
  				if (PageDirty(page))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1307
  					goto keep;
7d3579e8e   KOSAKI Motohiro   vmscan: narrow th...
1308

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1309
1310
1311
1312
  				/*
  				 * A synchronous write - probably a ramdisk.  Go
  				 * ahead and try to reclaim the page.
  				 */
529ae9aaa   Nick Piggin   mm: rename page t...
1313
  				if (!trylock_page(page))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
  					goto keep;
  				if (PageDirty(page) || PageWriteback(page))
  					goto keep_locked;
  				mapping = page_mapping(page);
  			case PAGE_CLEAN:
  				; /* try to free the page below */
  			}
  		}
  
  		/*
  		 * If the page has buffers, try to free the buffer mappings
  		 * associated with this page. If we succeed we try to free
  		 * the page as well.
  		 *
  		 * We do this even if the page is PageDirty().
  		 * try_to_release_page() does not perform I/O, but it is
  		 * possible for a page to have PageDirty set, but it is actually
  		 * clean (all its buffers are clean).  This happens if the
  		 * buffers were written out directly, with submit_bh(). ext3
894bc3104   Lee Schermerhorn   Unevictable LRU I...
1333
  		 * will do this, as well as the blockdev mapping.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
  		 * try_to_release_page() will discover that cleanness and will
  		 * drop the buffers and mark the page clean - it can be freed.
  		 *
  		 * Rarely, pages can have buffers and no ->mapping.  These are
  		 * the pages which were not successfully invalidated in
  		 * truncate_complete_page().  We try to drop those buffers here
  		 * and if that worked, and the page is no longer mapped into
  		 * process address space (page_count == 1) it can be freed.
  		 * Otherwise, leave the page on the LRU so it is swappable.
  		 */
266cf658e   David Howells   FS-Cache: Recruit...
1344
  		if (page_has_private(page)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1345
1346
  			if (!try_to_release_page(page, sc->gfp_mask))
  				goto activate_locked;
e286781d5   Nick Piggin   mm: speculative p...
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
  			if (!mapping && page_count(page) == 1) {
  				unlock_page(page);
  				if (put_page_testzero(page))
  					goto free_it;
  				else {
  					/*
  					 * rare race with speculative reference.
  					 * the speculative reference will free
  					 * this page shortly, so we may
  					 * increment nr_reclaimed here (and
  					 * leave it off the LRU).
  					 */
  					nr_reclaimed++;
  					continue;
  				}
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1363
  		}
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1364
1365
1366
1367
1368
1369
1370
1371
  		if (PageAnon(page) && !PageSwapBacked(page)) {
  			/* follow __remove_mapping for reference */
  			if (!page_ref_freeze(page, 1))
  				goto keep_locked;
  			if (PageDirty(page)) {
  				page_ref_unfreeze(page, 1);
  				goto keep_locked;
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1372

802a3a92a   Shaohua Li   mm: reclaim MADV_...
1373
  			count_vm_event(PGLAZYFREED);
2262185c5   Roman Gushchin   mm: per-cgroup me...
1374
  			count_memcg_page_event(page, PGLAZYFREED);
b910718a9   Johannes Weiner   mm: vmscan: detec...
1375
1376
  		} else if (!mapping || !__remove_mapping(mapping, page, true,
  							 sc->target_mem_cgroup))
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1377
  			goto keep_locked;
9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1378
1379
  
  		unlock_page(page);
e286781d5   Nick Piggin   mm: speculative p...
1380
  free_it:
98879b3b9   Yang Shi   mm: vmscan: corre...
1381
1382
1383
1384
1385
  		/*
  		 * A THP may get swapped out as a whole, so we need to
  		 * account for all of its base pages.
  		 */
  		nr_reclaimed += nr_pages;
abe4c3b50   Mel Gorman   vmscan: set up pa...
1386
1387
1388
1389
1390
  
  		/*
  		 * Is there a need to periodically free_page_list? It would
  		 * appear not, as the counts should be low.
  		 */
7ae88534c   Yang Shi   mm: move mem_cgro...
1391
  		if (unlikely(PageTransHuge(page)))
ff45fc3ca   Matthew Wilcox (Oracle)   mm: simplify call...
1392
  			destroy_compound_page(page);
7ae88534c   Yang Shi   mm: move mem_cgro...
1393
  		else
bd4c82c22   Huang Ying   mm, THP, swap: de...
1394
  			list_add(&page->lru, &free_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1395
  		continue;
98879b3b9   Yang Shi   mm: vmscan: corre...
1396
1397
1398
1399
1400
1401
1402
1403
1404
  activate_locked_split:
  		/*
  		 * The tail pages that failed to be added to the swap cache
  		 * reach here.  Fix up nr_scanned and nr_pages.
  		 */
  		if (nr_pages > 1) {
  			sc->nr_scanned -= (nr_pages - 1);
  			nr_pages = 1;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1405
  activate_locked:
68a22394c   Rik van Riel   vmscan: free swap...
1406
  		/* Not a candidate for swapping, so reclaim swap space. */
ad6b67041   Minchan Kim   mm: remove SWAP_M...
1407
1408
  		if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
  						PageMlocked(page)))
a2c43eed8   Hugh Dickins   mm: try_to_free_s...
1409
  			try_to_free_swap(page);
309381fea   Sasha Levin   mm: dump page whe...
1410
  		VM_BUG_ON_PAGE(PageActive(page), page);
ad6b67041   Minchan Kim   mm: remove SWAP_M...
1411
  		if (!PageMlocked(page)) {
9de4f22a6   Huang Ying   mm: code cleanup ...
1412
  			int type = page_is_file_lru(page);
ad6b67041   Minchan Kim   mm: remove SWAP_M...
1413
  			SetPageActive(page);
98879b3b9   Yang Shi   mm: vmscan: corre...
1414
  			stat->nr_activate[type] += nr_pages;
2262185c5   Roman Gushchin   mm: per-cgroup me...
1415
  			count_memcg_page_event(page, PGACTIVATE);
ad6b67041   Minchan Kim   mm: remove SWAP_M...
1416
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1417
1418
1419
1420
  keep_locked:
  		unlock_page(page);
  keep:
  		list_add(&page->lru, &ret_pages);
309381fea   Sasha Levin   mm: dump page whe...
1421
  		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1422
  	}
abe4c3b50   Mel Gorman   vmscan: set up pa...
1423

98879b3b9   Yang Shi   mm: vmscan: corre...
1424
  	pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
747db954c   Johannes Weiner   mm: memcontrol: u...
1425
  	mem_cgroup_uncharge_list(&free_pages);
72b252aed   Mel Gorman   mm: send one IPI ...
1426
  	try_to_unmap_flush();
2d4894b5d   Mel Gorman   mm: remove cold p...
1427
  	free_unref_page_list(&free_pages);
abe4c3b50   Mel Gorman   vmscan: set up pa...
1428

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1429
  	list_splice(&ret_pages, page_list);
886cf1901   Kirill Tkhai   mm: move recent_r...
1430
  	count_vm_events(PGACTIVATE, pgactivate);
060f005f0   Kirill Tkhai   mm/vmscan.c: do n...
1431

05ff51376   Andrew Morton   [PATCH] vmscan re...
1432
  	return nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1433
  }
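  /*
   * Illustrative sketch, not part of vmscan.c: the three PageWriteback()
   * cases handled in shrink_page_list() above, reduced to a standalone
   * decision helper over plain booleans.  The helper and its parameters
   * are hypothetical stand-ins for the page/scan state tested above.
   */
  enum wb_action { WB_STALL_KSWAPD, WB_MARK_RECLAIM, WB_WAIT_FOR_WRITEBACK };
  
  static enum wb_action writeback_case(bool is_kswapd, bool page_reclaim,
  				     bool pgdat_writeback, bool throttling_sane,
  				     bool may_enter_fs)
  {
  	/* Case 1: kswapd sees pages recycling through the LRU while under IO */
  	if (is_kswapd && page_reclaim && pgdat_writeback)
  		return WB_STALL_KSWAPD;		/* nr_immediate++, activate */
  
  	/* Case 2: sane writeback throttling, or no fs context to wait in */
  	if (throttling_sane || !page_reclaim || !may_enter_fs)
  		return WB_MARK_RECLAIM;		/* SetPageReclaim(), activate */
  
  	/* Case 3: legacy memcg with no dirty throttling: wait for the IO */
  	return WB_WAIT_FOR_WRITEBACK;
  }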
730ec8c01   Maninder Singh   mm/vmscan.c: chan...
1434
  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
02c6de8d7   Minchan Kim   mm: cma: discard ...
1435
1436
1437
1438
1439
1440
1441
  					    struct list_head *page_list)
  {
  	struct scan_control sc = {
  		.gfp_mask = GFP_KERNEL,
  		.priority = DEF_PRIORITY,
  		.may_unmap = 1,
  	};
1f318a9b0   Jaewon Kim   mm/vmscan: count ...
1442
  	struct reclaim_stat stat;
730ec8c01   Maninder Singh   mm/vmscan.c: chan...
1443
  	unsigned int nr_reclaimed;
02c6de8d7   Minchan Kim   mm: cma: discard ...
1444
1445
1446
1447
  	struct page *page, *next;
  	LIST_HEAD(clean_pages);
  
  	list_for_each_entry_safe(page, next, page_list, lru) {
9de4f22a6   Huang Ying   mm: code cleanup ...
1448
  		if (page_is_file_lru(page) && !PageDirty(page) &&
a58f2cef2   Minchan Kim   mm/vmscan.c: fix ...
1449
  		    !__PageMovable(page) && !PageUnevictable(page)) {
02c6de8d7   Minchan Kim   mm: cma: discard ...
1450
1451
1452
1453
  			ClearPageActive(page);
  			list_move(&page->lru, &clean_pages);
  		}
  	}
1f318a9b0   Jaewon Kim   mm/vmscan: count ...
1454
  	nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
dd156e3fc   Shakeel Butt   mm/rmap: always d...
1455
  					&stat, true);
02c6de8d7   Minchan Kim   mm: cma: discard ...
1456
  	list_splice(&clean_pages, page_list);
2da9f6305   Nicholas Piggin   mm/vmscan: fix NR...
1457
1458
  	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
  			    -(long)nr_reclaimed);
1f318a9b0   Jaewon Kim   mm/vmscan: count ...
1459
1460
1461
1462
1463
1464
1465
1466
1467
  	/*
  	 * Since lazyfree pages are isolated from the file LRU from the
  	 * beginning, they will rotate back to the anonymous LRU in the end
  	 * if the discard fails, so the isolated counts will be mismatched.
  	 * Compensate the isolated counts for both LRU lists.
  	 */
  	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
  			    stat.nr_lazyfree_fail);
  	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
2da9f6305   Nicholas Piggin   mm/vmscan: fix NR...
1468
  			    -(long)stat.nr_lazyfree_fail);
1f318a9b0   Jaewon Kim   mm/vmscan: count ...
1469
  	return nr_reclaimed;
02c6de8d7   Minchan Kim   mm: cma: discard ...
1470
  }
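  /*
   * Illustrative sketch, not part of vmscan.c: the isolated-page accounting
   * done at the end of reclaim_clean_pages_from_list() above, written as
   * plain arithmetic.  struct isolated_counts and account_clean_reclaim()
   * are hypothetical stand-ins for the node's NR_ISOLATED_FILE and
   * NR_ISOLATED_ANON vmstat counters.
   */
  struct isolated_counts { long file; long anon; };
  
  static void account_clean_reclaim(struct isolated_counts *c,
  				  long nr_reclaimed, long nr_lazyfree_fail)
  {
  	/* Reclaimed pages are gone; they no longer count as isolated file pages. */
  	c->file -= nr_reclaimed;
  
  	/*
  	 * Lazyfree pages that failed to be discarded rotate back to the
  	 * anonymous LRU although they were isolated as file pages, so the
  	 * count moves from the file side to the anon side.
  	 */
  	c->anon += nr_lazyfree_fail;
  	c->file -= nr_lazyfree_fail;
  }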
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
  /*
   * Attempt to remove the specified page from its LRU.  Only take this page
   * if it is of the appropriate PageActive status.  Pages which are being
   * freed elsewhere are also ignored.
   *
   * page:	page to consider
   * mode:	one of the LRU isolation modes defined above
   *
   * returns 0 on success, -ve errno on failure.
   */
f3fd4a619   Konstantin Khlebnikov   mm: remove lru ty...
1481
  int __isolate_lru_page(struct page *page, isolate_mode_t mode)
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1482
1483
1484
1485
1486
1487
  {
  	int ret = -EINVAL;
  
  	/* Only take pages on the LRU. */
  	if (!PageLRU(page))
  		return ret;
e46a28790   Minchan Kim   CMA: migrate mloc...
1488
1489
  	/* Compaction should not handle unevictable pages but CMA can do so */
  	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
894bc3104   Lee Schermerhorn   Unevictable LRU I...
1490
  		return ret;
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1491
  	ret = -EBUSY;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1492

c82449352   Mel Gorman   mm: compaction: m...
1493
1494
1495
1496
1497
  	/*
  	 * To minimise LRU disruption, the caller can indicate that it only
  	 * wants to isolate pages it will be able to operate on without
  	 * blocking - clean pages for the most part.
  	 *
c82449352   Mel Gorman   mm: compaction: m...
1498
1499
1500
  	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to
  	 * isolate pages that can be migrated without blocking.
  	 */
1276ad68e   Johannes Weiner   mm: vmscan: scan ...
1501
  	if (mode & ISOLATE_ASYNC_MIGRATE) {
c82449352   Mel Gorman   mm: compaction: m...
1502
1503
1504
1505
1506
1507
  		/* All the caller can do on PageWriteback is block */
  		if (PageWriteback(page))
  			return ret;
  
  		if (PageDirty(page)) {
  			struct address_space *mapping;
69d763fc6   Mel Gorman   mm: pin address_s...
1508
  			bool migrate_dirty;
c82449352   Mel Gorman   mm: compaction: m...
1509

c82449352   Mel Gorman   mm: compaction: m...
1510
1511
1512
  			/*
  			 * Only pages without mappings or that have a
  			 * ->migratepage callback are possible to migrate
69d763fc6   Mel Gorman   mm: pin address_s...
1513
1514
1515
1516
1517
  			 * without blocking. However, we can be racing with
  			 * truncation so it's necessary to lock the page
  			 * to stabilise the mapping as truncation holds
  			 * the page lock until after the page is removed
  			 * from the page cache.
c82449352   Mel Gorman   mm: compaction: m...
1518
  			 */
69d763fc6   Mel Gorman   mm: pin address_s...
1519
1520
  			if (!trylock_page(page))
  				return ret;
c82449352   Mel Gorman   mm: compaction: m...
1521
  			mapping = page_mapping(page);
145e1a71e   Hugh Dickins   mm: fix the NULL ...
1522
  			migrate_dirty = !mapping || mapping->a_ops->migratepage;
69d763fc6   Mel Gorman   mm: pin address_s...
1523
1524
  			unlock_page(page);
  			if (!migrate_dirty)
c82449352   Mel Gorman   mm: compaction: m...
1525
1526
1527
  				return ret;
  		}
  	}
39deaf858   Minchan Kim   mm: compaction: m...
1528

f80c06736   Minchan Kim   mm: zone_reclaim:...
1529
1530
  	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
  		return ret;
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
  	if (likely(get_page_unless_zero(page))) {
  		/*
  		 * Be careful not to clear PageLRU until after we're
  		 * sure the page is not being freed elsewhere -- the
  		 * page release code relies on it.
  		 */
  		ClearPageLRU(page);
  		ret = 0;
  	}
  
  	return ret;
  }
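  /*
   * Illustrative sketch, not part of vmscan.c: how a hypothetical caller
   * might build an isolate_mode_t for __isolate_lru_page() from its own
   * constraints, using the flag semantics documented above.  Only the
   * ISOLATE_* flags are real; the helper and its booleans are stand-ins.
   */
  static isolate_mode_t pick_isolate_mode(bool may_block, bool skip_mapped,
  					bool allow_unevictable)
  {
  	isolate_mode_t mode = 0;
  
  	if (!may_block)			/* async migration: only take pages */
  		mode |= ISOLATE_ASYNC_MIGRATE;	/* migratable without blocking */
  	if (skip_mapped)		/* mirrors sc->may_unmap == 0 */
  		mode |= ISOLATE_UNMAPPED;
  	if (allow_unevictable)		/* e.g. CMA may take unevictable pages */
  		mode |= ISOLATE_UNEVICTABLE;
  
  	return mode;
  }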
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1543
1544
1545
  
  /*
   * Update LRU sizes after isolating pages. The LRU size updates must
55b65a57c   Ethon Paul   mm/vmsan: fix som...
1546
   * be complete before mem_cgroup_update_lru_size due to a sanity check.
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1547
1548
   */
  static __always_inline void update_lru_sizes(struct lruvec *lruvec,
b4536f0c8   Michal Hocko   mm, memcg: fix th...
1549
  			enum lru_list lru, unsigned long *nr_zone_taken)
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1550
  {
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1551
  	int zid;
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1552
1553
1554
  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  		if (!nr_zone_taken[zid])
  			continue;
a892cb6b9   Wei Yang   mm/vmscan.c: use ...
1555
  		update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
b4536f0c8   Michal Hocko   mm, memcg: fix th...
1556
  	}
7ee36a14f   Mel Gorman   mm, vmscan: Updat...
1557
  }
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
1558
1559
  /**
   * pgdat->lru_lock is heavily contended.  Some of the functions that
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1560
1561
1562
1563
1564
1565
1566
1567
   * shrink the lists perform better by taking out a batch of pages
   * and working on them outside the LRU lock.
   *
   * For pagecache intensive workloads, this function is the hottest
   * spot in the kernel (apart from copy_*_user functions).
   *
   * Appropriate locks must be held before calling this function.
   *
791b48b64   Minchan Kim   mm: vmscan: scan ...
1568
   * @nr_to_scan:	The number of eligible pages to look through on the list.
5dc35979e   Konstantin Khlebnikov   mm/vmscan: push l...
1569
   * @lruvec:	The LRU vector to pull pages from.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1570
   * @dst:	The temp list to put pages on to.
f626012db   Hugh Dickins   mm: remove isolat...
1571
   * @nr_scanned:	The number of pages that were scanned.
fe2c2a106   Rik van Riel   vmscan: reclaim a...
1572
   * @sc:		The scan_control struct for this reclaim session
3cb994517   Konstantin Khlebnikov   mm: push lru inde...
1573
   * @lru:	LRU list id for isolating
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1574
1575
1576
   *
   * returns how many pages were moved onto *@dst.
   */
69e05944a   Andrew Morton   [PATCH] vmscan: u...
1577
  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
5dc35979e   Konstantin Khlebnikov   mm/vmscan: push l...
1578
  		struct lruvec *lruvec, struct list_head *dst,
fe2c2a106   Rik van Riel   vmscan: reclaim a...
1579
  		unsigned long *nr_scanned, struct scan_control *sc,
a9e7c39fa   Kirill Tkhai   mm/vmscan.c: remo...
1580
  		enum lru_list lru)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1581
  {
75b00af77   Hugh Dickins   mm: trivial clean...
1582
  	struct list_head *src = &lruvec->lists[lru];
69e05944a   Andrew Morton   [PATCH] vmscan: u...
1583
  	unsigned long nr_taken = 0;
599d0c954   Mel Gorman   mm, vmscan: move ...
1584
  	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
7cc30fcfd   Mel Gorman   mm: vmstat: accou...
1585
  	unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
3db65812d   Johannes Weiner   Revert "mm, vmsca...
1586
  	unsigned long skipped = 0;
791b48b64   Minchan Kim   mm: vmscan: scan ...
1587
  	unsigned long scan, total_scan, nr_pages;
b2e18757f   Mel Gorman   mm, vmscan: begin...
1588
  	LIST_HEAD(pages_skipped);
a9e7c39fa   Kirill Tkhai   mm/vmscan.c: remo...
1589
  	isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1590

98879b3b9   Yang Shi   mm: vmscan: corre...
1591
  	total_scan = 0;
791b48b64   Minchan Kim   mm: vmscan: scan ...
1592
  	scan = 0;
98879b3b9   Yang Shi   mm: vmscan: corre...
1593
  	while (scan < nr_to_scan && !list_empty(src)) {
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1594
  		struct page *page;
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1595

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1596
1597
  		page = lru_to_page(src);
  		prefetchw_prev_lru_page(page, src, flags);
309381fea   Sasha Levin   mm: dump page whe...
1598
  		VM_BUG_ON_PAGE(!PageLRU(page), page);
8d438f96d   Nick Piggin   [PATCH] mm: PageL...
1599

d8c6546b1   Matthew Wilcox (Oracle)   mm: introduce com...
1600
  		nr_pages = compound_nr(page);
98879b3b9   Yang Shi   mm: vmscan: corre...
1601
  		total_scan += nr_pages;
b2e18757f   Mel Gorman   mm, vmscan: begin...
1602
1603
  		if (page_zonenum(page) > sc->reclaim_idx) {
  			list_move(&page->lru, &pages_skipped);
98879b3b9   Yang Shi   mm: vmscan: corre...
1604
  			nr_skipped[page_zonenum(page)] += nr_pages;
b2e18757f   Mel Gorman   mm, vmscan: begin...
1605
1606
  			continue;
  		}
791b48b64   Minchan Kim   mm: vmscan: scan ...
1607
1608
1609
1610
1611
  		/*
  		 * Do not count skipped pages because that makes the function
  		 * return with no isolated pages if the LRU mostly contains
  		 * ineligible pages.  This causes the VM to not reclaim any
  		 * pages, triggering a premature OOM.
98879b3b9   Yang Shi   mm: vmscan: corre...
1612
1613
1614
1615
  		 *
  		 * Account all tail pages of THP.  This would not cause
  		 * premature OOM since __isolate_lru_page() returns -EBUSY
  		 * only when the page is being freed somewhere else.
791b48b64   Minchan Kim   mm: vmscan: scan ...
1616
  		 */
98879b3b9   Yang Shi   mm: vmscan: corre...
1617
  		scan += nr_pages;
f3fd4a619   Konstantin Khlebnikov   mm: remove lru ty...
1618
  		switch (__isolate_lru_page(page, mode)) {
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1619
  		case 0:
599d0c954   Mel Gorman   mm, vmscan: move ...
1620
1621
  			nr_taken += nr_pages;
  			nr_zone_taken[page_zonenum(page)] += nr_pages;
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1622
  			list_move(&page->lru, dst);
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1623
1624
1625
1626
1627
1628
  			break;
  
  		case -EBUSY:
  			/* else it is being freed elsewhere */
  			list_move(&page->lru, src);
  			continue;
46453a6e1   Nick Piggin   [PATCH] mm: never...
1629

5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1630
1631
1632
  		default:
  			BUG();
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1633
  	}
b2e18757f   Mel Gorman   mm, vmscan: begin...
1634
1635
1636
1637
1638
1639
1640
  	/*
  	 * Splice any skipped pages to the start of the LRU list. Note that
  	 * this disrupts the LRU order when reclaiming for lower zones but
  	 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
  	 * scanning would soon rescan the same pages to skip and put the
  	 * system at risk of premature OOM.
  	 */
7cc30fcfd   Mel Gorman   mm: vmstat: accou...
1641
1642
  	if (!list_empty(&pages_skipped)) {
  		int zid;
3db65812d   Johannes Weiner   Revert "mm, vmsca...
1643
  		list_splice(&pages_skipped, src);
7cc30fcfd   Mel Gorman   mm: vmstat: accou...
1644
1645
1646
1647
1648
  		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  			if (!nr_skipped[zid])
  				continue;
  
  			__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1265e3a69   Michal Hocko   mm, vmscan: show ...
1649
  			skipped += nr_skipped[zid];
7cc30fcfd   Mel Gorman   mm: vmstat: accou...
1650
1651
  		}
  	}
791b48b64   Minchan Kim   mm: vmscan: scan ...
1652
  	*nr_scanned = total_scan;
1265e3a69   Michal Hocko   mm, vmscan: show ...
1653
  	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
791b48b64   Minchan Kim   mm: vmscan: scan ...
1654
  				    total_scan, skipped, nr_taken, mode, lru);
b4536f0c8   Michal Hocko   mm, memcg: fix th...
1655
  	update_lru_sizes(lruvec, lru, nr_zone_taken);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1656
1657
  	return nr_taken;
  }
62695a84e   Nick Piggin   vmscan: move isol...
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
  /**
   * isolate_lru_page - tries to isolate a page from its LRU list
   * @page: page to isolate from its LRU list
   *
   * Isolates a @page from an LRU list, clears PageLRU and adjusts the
   * vmstat statistic corresponding to whatever LRU list the page was on.
   *
   * Returns 0 if the page was removed from an LRU list.
   * Returns -EBUSY if the page was not on an LRU list.
   *
   * The returned page will have PageLRU() cleared.  If it was found on
894bc3104   Lee Schermerhorn   Unevictable LRU I...
1669
1670
1671
   * the active list, it will have PageActive set.  If it was found on
   * the unevictable list, it will have the PageUnevictable bit set. That flag
   * may need to be cleared by the caller before letting the page go.
62695a84e   Nick Piggin   vmscan: move isol...
1672
1673
1674
1675
1676
   *
   * The vmstat statistic corresponding to the list on which the page was
   * found will be decremented.
   *
   * Restrictions:
a5d09bed7   Mike Rapoport   mm: docs: add bla...
1677
   *
62695a84e   Nick Piggin   vmscan: move isol...
1678
   * (1) Must be called with an elevated refcount on the page. This is a
01c4776ba   Hui Su   mm/vmscan: fix co...
1679
   *     fundamental difference from isolate_lru_pages (which is called
62695a84e   Nick Piggin   vmscan: move isol...
1680
1681
1682
1683
1684
1685
1686
   *     without a stable reference).
   * (2) the lru_lock must not be held.
   * (3) interrupts must be enabled.
   */
  int isolate_lru_page(struct page *page)
  {
  	int ret = -EBUSY;
309381fea   Sasha Levin   mm: dump page whe...
1687
  	VM_BUG_ON_PAGE(!page_count(page), page);
cf2a82ee4   Kirill A. Shutemov   mm: downgrade VM_...
1688
  	WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
0c917313a   Konstantin Khlebnikov   mm: strictly requ...
1689

62695a84e   Nick Piggin   vmscan: move isol...
1690
  	if (PageLRU(page)) {
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
1691
  		pg_data_t *pgdat = page_pgdat(page);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1692
  		struct lruvec *lruvec;
62695a84e   Nick Piggin   vmscan: move isol...
1693

f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
1694
1695
  		spin_lock_irq(&pgdat->lru_lock);
  		lruvec = mem_cgroup_page_lruvec(page, pgdat);
0c917313a   Konstantin Khlebnikov   mm: strictly requ...
1696
  		if (PageLRU(page)) {
894bc3104   Lee Schermerhorn   Unevictable LRU I...
1697
  			int lru = page_lru(page);
0c917313a   Konstantin Khlebnikov   mm: strictly requ...
1698
  			get_page(page);
62695a84e   Nick Piggin   vmscan: move isol...
1699
  			ClearPageLRU(page);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1700
1701
  			del_page_from_lru_list(page, lruvec, lru);
  			ret = 0;
62695a84e   Nick Piggin   vmscan: move isol...
1702
  		}
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
1703
  		spin_unlock_irq(&pgdat->lru_lock);
62695a84e   Nick Piggin   vmscan: move isol...
1704
1705
1706
  	}
  	return ret;
  }
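  /*
   * Illustrative sketch, not part of vmscan.c: minimal use of
   * isolate_lru_page()/putback_lru_page() under the restrictions in the
   * kernel-doc above (the caller already holds a reference on the page,
   * lru_lock is not held, interrupts are enabled).
   * example_with_isolated_page() is a hypothetical helper.
   */
  static int example_with_isolated_page(struct page *page)
  {
  	int ret = isolate_lru_page(page);	/* clears PageLRU on success */
  
  	if (ret)
  		return ret;		/* -EBUSY: the page was not on an LRU */
  
  	/* ... operate on the page while it is off the LRU ... */
  
  	putback_lru_page(page);		/* return it to the appropriate LRU */
  	return 0;
  }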
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1707
  /*
d37dd5dcb   Fengguang Wu   vmscan: comment t...
1708
   * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
178821b89   Xianting Tian   mm/vmscan.c: fix ...
1709
   * then get rescheduled. When there is a massive number of tasks doing page
d37dd5dcb   Fengguang Wu   vmscan: comment t...
1710
1711
1712
   * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
   * the LRU list will go small and be scanned faster than necessary, leading to
   * unnecessary swapping, thrashing and OOM.
35cd78156   Rik van Riel   vmscan: throttle ...
1713
   */
599d0c954   Mel Gorman   mm, vmscan: move ...
1714
  static int too_many_isolated(struct pglist_data *pgdat, int file,
35cd78156   Rik van Riel   vmscan: throttle ...
1715
1716
1717
1718
1719
1720
  		struct scan_control *sc)
  {
  	unsigned long inactive, isolated;
  
  	if (current_is_kswapd())
  		return 0;
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
1721
  	if (!writeback_throttling_sane(sc))
35cd78156   Rik van Riel   vmscan: throttle ...
1722
1723
1724
  		return 0;
  
  	if (file) {
599d0c954   Mel Gorman   mm, vmscan: move ...
1725
1726
  		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
  		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
35cd78156   Rik van Riel   vmscan: throttle ...
1727
  	} else {
599d0c954   Mel Gorman   mm, vmscan: move ...
1728
1729
  		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
  		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
35cd78156   Rik van Riel   vmscan: throttle ...
1730
  	}
3cf23841b   Fengguang Wu   mm/vmscan.c: avoi...
1731
1732
1733
1734
1735
  	/*
  	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
  	 * won't get blocked by normal direct-reclaimers, forming a circular
  	 * deadlock.
  	 */
d0164adc8   Mel Gorman   mm, page_alloc: d...
1736
  	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
3cf23841b   Fengguang Wu   mm/vmscan.c: avoi...
1737
  		inactive >>= 3;
35cd78156   Rik van Riel   vmscan: throttle ...
1738
1739
  	return isolated > inactive;
  }
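  /*
   * Illustrative sketch, not part of vmscan.c: the throttling test made by
   * too_many_isolated() above, as a standalone helper.  A direct reclaimer
   * waits once more pages are isolated than the threshold derived from the
   * matching inactive list; callers that may enter both IO and FS are
   * throttled at roughly 1/8 of the inactive list so that GFP_NOIO/GFP_NOFS
   * reclaimers never queue up behind them.
   */
  static bool isolation_over_limit(unsigned long inactive,
  				 unsigned long isolated,
  				 bool may_enter_io_and_fs)
  {
  	if (may_enter_io_and_fs)
  		inactive >>= 3;		/* throttle normal reclaimers earlier */
  
  	return isolated > inactive;
  }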
a222f3415   Kirill Tkhai   mm: generalize pu...
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
  /*
   * This moves pages from @list to the corresponding LRU lists.
   *
   * We move them the other way if the page is referenced by one or more
   * processes, from rmap.
   *
   * If the pages are mostly unmapped, the processing is fast and it is
   * appropriate to hold zone_lru_lock across the whole operation.  But if
   * the pages are mapped, the processing is slow (page_referenced()) so we
   * should drop zone_lru_lock around each page.  It's impossible to balance
   * this, so instead we remove the pages from the LRU while processing them.
   * It is safe to rely on PG_active against the non-LRU pages in here because
   * nobody will play with that bit on a non-LRU page.
   *
   * The downside is that we have to touch page->_refcount against each page.
   * But we had to alter page->flags anyway.
   *
   * Returns the number of pages moved to the given lruvec.
   */
  
  static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
  						     struct list_head *list)
666356297   Mel Gorman   vmscan: set up pa...
1762
  {
599d0c954   Mel Gorman   mm, vmscan: move ...
1763
  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
a222f3415   Kirill Tkhai   mm: generalize pu...
1764
  	int nr_pages, nr_moved = 0;
3f79768f2   Hugh Dickins   mm: rearrange put...
1765
  	LIST_HEAD(pages_to_free);
a222f3415   Kirill Tkhai   mm: generalize pu...
1766
1767
  	struct page *page;
  	enum lru_list lru;
666356297   Mel Gorman   vmscan: set up pa...
1768

a222f3415   Kirill Tkhai   mm: generalize pu...
1769
1770
  	while (!list_empty(list)) {
  		page = lru_to_page(list);
309381fea   Sasha Levin   mm: dump page whe...
1771
  		VM_BUG_ON_PAGE(PageLRU(page), page);
39b5f29ac   Hugh Dickins   mm: remove vma ar...
1772
  		if (unlikely(!page_evictable(page))) {
a222f3415   Kirill Tkhai   mm: generalize pu...
1773
  			list_del(&page->lru);
599d0c954   Mel Gorman   mm, vmscan: move ...
1774
  			spin_unlock_irq(&pgdat->lru_lock);
666356297   Mel Gorman   vmscan: set up pa...
1775
  			putback_lru_page(page);
599d0c954   Mel Gorman   mm, vmscan: move ...
1776
  			spin_lock_irq(&pgdat->lru_lock);
666356297   Mel Gorman   vmscan: set up pa...
1777
1778
  			continue;
  		}
599d0c954   Mel Gorman   mm, vmscan: move ...
1779
  		lruvec = mem_cgroup_page_lruvec(page, pgdat);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1780

7a608572a   Linus Torvalds   Revert "mm: batch...
1781
  		SetPageLRU(page);
666356297   Mel Gorman   vmscan: set up pa...
1782
  		lru = page_lru(page);
a222f3415   Kirill Tkhai   mm: generalize pu...
1783

6c357848b   Matthew Wilcox (Oracle)   mm: replace hpage...
1784
  		nr_pages = thp_nr_pages(page);
a222f3415   Kirill Tkhai   mm: generalize pu...
1785
1786
  		update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
  		list_move(&page->lru, &lruvec->lists[lru]);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1787

2bcf88796   Hugh Dickins   mm: take pagevecs...
1788
1789
1790
  		if (put_page_testzero(page)) {
  			__ClearPageLRU(page);
  			__ClearPageActive(page);
fa9add641   Hugh Dickins   mm/memcg: apply a...
1791
  			del_page_from_lru_list(page, lruvec, lru);
2bcf88796   Hugh Dickins   mm: take pagevecs...
1792
1793
  
  			if (unlikely(PageCompound(page))) {
599d0c954   Mel Gorman   mm, vmscan: move ...
1794
  				spin_unlock_irq(&pgdat->lru_lock);
ff45fc3ca   Matthew Wilcox (Oracle)   mm: simplify call...
1795
  				destroy_compound_page(page);
599d0c954   Mel Gorman   mm, vmscan: move ...
1796
  				spin_lock_irq(&pgdat->lru_lock);
2bcf88796   Hugh Dickins   mm: take pagevecs...
1797
1798
  			} else
  				list_add(&page->lru, &pages_to_free);
a222f3415   Kirill Tkhai   mm: generalize pu...
1799
1800
  		} else {
  			nr_moved += nr_pages;
31d8fcac0   Johannes Weiner   mm: workingset: a...
1801
1802
  			if (PageActive(page))
  				workingset_age_nonresident(lruvec, nr_pages);
666356297   Mel Gorman   vmscan: set up pa...
1803
1804
  		}
  	}
666356297   Mel Gorman   vmscan: set up pa...
1805

3f79768f2   Hugh Dickins   mm: rearrange put...
1806
1807
1808
  	/*
  	 * To save our caller's stack, now use input list for pages to free.
  	 */
a222f3415   Kirill Tkhai   mm: generalize pu...
1809
1810
1811
  	list_splice(&pages_to_free, list);
  
  	return nr_moved;
666356297   Mel Gorman   vmscan: set up pa...
1812
1813
1814
  }
  
  /*
399ba0b95   NeilBrown   mm/vmscan.c: avoi...
1815
   * If a kernel thread (such as nfsd for loop-back mounts) services
a37b0715d   NeilBrown   mm/writeback: rep...
1816
   * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
399ba0b95   NeilBrown   mm/vmscan.c: avoi...
1817
1818
1819
1820
1821
   * In that case we should only throttle if the backing device it is
   * writing to is congested.  In other cases it is safe to throttle.
   */
  static int current_may_throttle(void)
  {
a37b0715d   NeilBrown   mm/writeback: rep...
1822
  	return !(current->flags & PF_LOCAL_THROTTLE) ||
399ba0b95   NeilBrown   mm/vmscan.c: avoi...
1823
1824
1825
1826
1827
  		current->backing_dev_info == NULL ||
  		bdi_write_congested(current->backing_dev_info);
  }
  
  /*
b2e18757f   Mel Gorman   mm, vmscan: begin...
1828
   * shrink_inactive_list() is a helper for shrink_node().  It returns the number
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
1829
   * of reclaimed pages
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1830
   */
666356297   Mel Gorman   vmscan: set up pa...
1831
  static noinline_for_stack unsigned long
1a93be0e7   Konstantin Khlebnikov   mm/vmscan: push l...
1832
  shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
1833
  		     struct scan_control *sc, enum lru_list lru)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1834
1835
  {
  	LIST_HEAD(page_list);
e247dbce5   KOSAKI Motohiro   vmscan: simplify ...
1836
  	unsigned long nr_scanned;
730ec8c01   Maninder Singh   mm/vmscan.c: chan...
1837
  	unsigned int nr_reclaimed = 0;
e247dbce5   KOSAKI Motohiro   vmscan: simplify ...
1838
  	unsigned long nr_taken;
060f005f0   Kirill Tkhai   mm/vmscan.c: do n...
1839
  	struct reclaim_stat stat;
497a6c1b0   Johannes Weiner   mm: keep separate...
1840
  	bool file = is_file_lru(lru);
f46b79120   Kirill Tkhai   mm/vmscan.c: simp...
1841
  	enum vm_event_item item;
599d0c954   Mel Gorman   mm, vmscan: move ...
1842
  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
db73ee0d4   Michal Hocko   mm, vmscan: do no...
1843
  	bool stalled = false;
78dc583d3   KOSAKI Motohiro   vmscan: low order...
1844

599d0c954   Mel Gorman   mm, vmscan: move ...
1845
  	while (unlikely(too_many_isolated(pgdat, file, sc))) {
db73ee0d4   Michal Hocko   mm, vmscan: do no...
1846
1847
1848
1849
1850
1851
  		if (stalled)
  			return 0;
  
  		/* wait a bit for the reclaimer. */
  		msleep(100);
  		stalled = true;
35cd78156   Rik van Riel   vmscan: throttle ...
1852
1853
1854
1855
1856
  
  		/* We are about to die and free our memory. Return now. */
  		if (fatal_signal_pending(current))
  			return SWAP_CLUSTER_MAX;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1857
  	lru_add_drain();
f80c06736   Minchan Kim   mm: zone_reclaim:...
1858

599d0c954   Mel Gorman   mm, vmscan: move ...
1859
  	spin_lock_irq(&pgdat->lru_lock);
b35ea17b7   KOSAKI Motohiro   mm: shrink_inacti...
1860

5dc35979e   Konstantin Khlebnikov   mm/vmscan: push l...
1861
  	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
a9e7c39fa   Kirill Tkhai   mm/vmscan.c: remo...
1862
  				     &nr_scanned, sc, lru);
95d918fc0   Konstantin Khlebnikov   mm/vmscan: remove...
1863

599d0c954   Mel Gorman   mm, vmscan: move ...
1864
  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
f46b79120   Kirill Tkhai   mm/vmscan.c: simp...
1865
  	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
1866
  	if (!cgroup_reclaim(sc))
f46b79120   Kirill Tkhai   mm/vmscan.c: simp...
1867
1868
  		__count_vm_events(item, nr_scanned);
  	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
497a6c1b0   Johannes Weiner   mm: keep separate...
1869
  	__count_vm_events(PGSCAN_ANON + file, nr_scanned);
599d0c954   Mel Gorman   mm, vmscan: move ...
1870
  	spin_unlock_irq(&pgdat->lru_lock);
b35ea17b7   KOSAKI Motohiro   mm: shrink_inacti...
1871

d563c0501   Hillf Danton   vmscan: handle is...
1872
  	if (nr_taken == 0)
666356297   Mel Gorman   vmscan: set up pa...
1873
  		return 0;
5ad333eb6   Andy Whitcroft   Lumpy Reclaim V4
1874

dd156e3fc   Shakeel Butt   mm/rmap: always d...
1875
  	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
c661b078f   Andy Whitcroft   synchronous lumpy...
1876

599d0c954   Mel Gorman   mm, vmscan: move ...
1877
  	spin_lock_irq(&pgdat->lru_lock);
3f79768f2   Hugh Dickins   mm: rearrange put...
1878

497a6c1b0   Johannes Weiner   mm: keep separate...
1879
1880
1881
  	move_pages_to_lru(lruvec, &page_list);
  
  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
96f8bf4fb   Johannes Weiner   mm: vmscan: recla...
1882
  	lru_note_cost(lruvec, file, stat.nr_pageout);
f46b79120   Kirill Tkhai   mm/vmscan.c: simp...
1883
  	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
1884
  	if (!cgroup_reclaim(sc))
f46b79120   Kirill Tkhai   mm/vmscan.c: simp...
1885
1886
  		__count_vm_events(item, nr_reclaimed);
  	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
497a6c1b0   Johannes Weiner   mm: keep separate...
1887
  	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
3f79768f2   Hugh Dickins   mm: rearrange put...
1888

599d0c954   Mel Gorman   mm, vmscan: move ...
1889
  	spin_unlock_irq(&pgdat->lru_lock);
3f79768f2   Hugh Dickins   mm: rearrange put...
1890

747db954c   Johannes Weiner   mm: memcontrol: u...
1891
  	mem_cgroup_uncharge_list(&page_list);
2d4894b5d   Mel Gorman   mm: remove cold p...
1892
  	free_unref_page_list(&page_list);
e11da5b4f   Mel Gorman   tracing, vmscan: ...
1893

92df3a723   Mel Gorman   mm: vmscan: throt...
1894
  	/*
1c610d5f9   Andrey Ryabinin   mm/vmscan: wake u...
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
  	 * If dirty pages are scanned that are not queued for IO, it
  	 * implies that flushers are not doing their job. This can
  	 * happen when memory pressure pushes dirty pages to the end of
  	 * the LRU before the dirty limits are breached and the dirty
  	 * data has expired. It can also happen when the proportion of
  	 * dirty pages grows not through writes but through memory
  	 * pressure reclaiming all the clean cache. And in some cases,
  	 * the flushers simply cannot keep up with the allocation
  	 * rate. Nudge the flusher threads in case they are asleep.
  	 */
  	if (stat.nr_unqueued_dirty == nr_taken)
  		wakeup_flusher_threads(WB_REASON_VMSCAN);
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
1907
1908
1909
1910
1911
1912
1913
1914
  	sc->nr.dirty += stat.nr_dirty;
  	sc->nr.congested += stat.nr_congested;
  	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
  	sc->nr.writeback += stat.nr_writeback;
  	sc->nr.immediate += stat.nr_immediate;
  	sc->nr.taken += nr_taken;
  	if (file)
  		sc->nr.file_taken += nr_taken;
8e9502828   Mel Gorman   mm: vmscan: move ...
1915

599d0c954   Mel Gorman   mm, vmscan: move ...
1916
  	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
d51d1e645   Steven Rostedt   mm, vmscan, traci...
1917
  			nr_scanned, nr_reclaimed, &stat, sc->priority, file);
05ff51376   Andrew Morton   [PATCH] vmscan re...
1918
  	return nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1919
  }
f626012db   Hugh Dickins   mm: remove isolat...
1920
  static void shrink_active_list(unsigned long nr_to_scan,
1a93be0e7   Konstantin Khlebnikov   mm/vmscan: push l...
1921
  			       struct lruvec *lruvec,
f16015fbf   Johannes Weiner   mm: vmscan: disti...
1922
  			       struct scan_control *sc,
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
1923
  			       enum lru_list lru)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1924
  {
44c241f16   KOSAKI Motohiro   mm: rename pgmove...
1925
  	unsigned long nr_taken;
f626012db   Hugh Dickins   mm: remove isolat...
1926
  	unsigned long nr_scanned;
6fe6b7e35   Wu Fengguang   vmscan: report vm...
1927
  	unsigned long vm_flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1928
  	LIST_HEAD(l_hold);	/* The pages which were snipped off */
8cab4754d   Wu Fengguang   vmscan: make mapp...
1929
  	LIST_HEAD(l_active);
b69408e88   Christoph Lameter   vmscan: Use an in...
1930
  	LIST_HEAD(l_inactive);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1931
  	struct page *page;
9d998b4f1   Michal Hocko   mm, vmscan: add a...
1932
1933
  	unsigned nr_deactivate, nr_activate;
  	unsigned nr_rotated = 0;
3cb994517   Konstantin Khlebnikov   mm: push lru inde...
1934
  	int file = is_file_lru(lru);
599d0c954   Mel Gorman   mm, vmscan: move ...
1935
  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1936
1937
  
  	lru_add_drain();
f80c06736   Minchan Kim   mm: zone_reclaim:...
1938

599d0c954   Mel Gorman   mm, vmscan: move ...
1939
  	spin_lock_irq(&pgdat->lru_lock);
925b7673c   Johannes Weiner   mm: make per-memc...
1940

5dc35979e   Konstantin Khlebnikov   mm/vmscan: push l...
1941
  	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
a9e7c39fa   Kirill Tkhai   mm/vmscan.c: remo...
1942
  				     &nr_scanned, sc, lru);
89b5fae53   Johannes Weiner   mm: vmscan: disti...
1943

599d0c954   Mel Gorman   mm, vmscan: move ...
1944
  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
1945

912c05720   Shakeel Butt   mm: vmscan: consi...
1946
1947
  	if (!cgroup_reclaim(sc))
  		__count_vm_events(PGREFILL, nr_scanned);
2fa2690ca   Yafang Shao   mm/vmscan.c: don'...
1948
  	__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
9d5e6a9f2   Hugh Dickins   mm: update_lru_si...
1949

599d0c954   Mel Gorman   mm, vmscan: move ...
1950
  	spin_unlock_irq(&pgdat->lru_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1951

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1952
1953
1954
1955
  	while (!list_empty(&l_hold)) {
  		cond_resched();
  		page = lru_to_page(&l_hold);
  		list_del(&page->lru);
7e9cd4842   Rik van Riel   vmscan: fix pagec...
1956

39b5f29ac   Hugh Dickins   mm: remove vma ar...
1957
  		if (unlikely(!page_evictable(page))) {
894bc3104   Lee Schermerhorn   Unevictable LRU I...
1958
1959
1960
  			putback_lru_page(page);
  			continue;
  		}
cc715d99e   Mel Gorman   mm: vmscan: forci...
1961
1962
1963
1964
1965
1966
1967
  		if (unlikely(buffer_heads_over_limit)) {
  			if (page_has_private(page) && trylock_page(page)) {
  				if (page_has_private(page))
  					try_to_release_page(page, 0);
  				unlock_page(page);
  			}
  		}
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
1968
1969
  		if (page_referenced(page, 0, sc->target_mem_cgroup,
  				    &vm_flags)) {
8cab4754d   Wu Fengguang   vmscan: make mapp...
1970
1971
1972
1973
1974
1975
1976
1977
1978
  			/*
  			 * Identify referenced, file-backed active pages and
  			 * give them one more trip around the active list, so
  			 * that executable code gets a better chance to stay in
  			 * memory under moderate memory pressure.  Anon pages
  			 * are not likely to be evicted by use-once streaming
  			 * IO, plus JVM can create lots of anon VM_EXEC pages,
  			 * so we ignore them here.
  			 */
9de4f22a6   Huang Ying   mm: code cleanup ...
1979
  			if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
6c357848b   Matthew Wilcox (Oracle)   mm: replace hpage...
1980
  				nr_rotated += thp_nr_pages(page);
8cab4754d   Wu Fengguang   vmscan: make mapp...
1981
1982
1983
1984
  				list_add(&page->lru, &l_active);
  				continue;
  			}
  		}
7e9cd4842   Rik van Riel   vmscan: fix pagec...
1985

5205e56ee   KOSAKI Motohiro   vmscan: move Clea...
1986
  		ClearPageActive(page);	/* we are de-activating */
1899ad18c   Johannes Weiner   mm: workingset: t...
1987
  		SetPageWorkingset(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1988
1989
  		list_add(&page->lru, &l_inactive);
  	}
b555749aa   Andrew Morton   vmscan: shrink_ac...
1990
  	/*
8cab4754d   Wu Fengguang   vmscan: make mapp...
1991
  	 * Move pages back to the lru list.
b555749aa   Andrew Morton   vmscan: shrink_ac...
1992
  	 */
599d0c954   Mel Gorman   mm, vmscan: move ...
1993
  	spin_lock_irq(&pgdat->lru_lock);
556adecba   Rik van Riel   vmscan: second ch...
1994

a222f3415   Kirill Tkhai   mm: generalize pu...
1995
1996
  	nr_activate = move_pages_to_lru(lruvec, &l_active);
  	nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
f372d89e5   Kirill Tkhai   mm: remove pages_...
1997
1998
  	/* Keep all free pages in l_active list */
  	list_splice(&l_inactive, &l_active);
9851ac135   Kirill Tkhai   mm: move nr_deact...
1999
2000
2001
  
  	__count_vm_events(PGDEACTIVATE, nr_deactivate);
  	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
599d0c954   Mel Gorman   mm, vmscan: move ...
2002
2003
  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
  	spin_unlock_irq(&pgdat->lru_lock);
2bcf88796   Hugh Dickins   mm: take pagevecs...
2004

f372d89e5   Kirill Tkhai   mm: remove pages_...
2005
2006
  	mem_cgroup_uncharge_list(&l_active);
  	free_unref_page_list(&l_active);
9d998b4f1   Michal Hocko   mm, vmscan: add a...
2007
2008
  	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
  			nr_deactivate, nr_rotated, sc->priority, file);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2009
  }
1a4e58cce   Minchan Kim   mm: introduce MAD...
2010
2011
  unsigned long reclaim_pages(struct list_head *page_list)
  {
f661d007f   Yang Shi   mm: vmscan: repla...
2012
  	int nid = NUMA_NO_NODE;
730ec8c01   Maninder Singh   mm/vmscan.c: chan...
2013
  	unsigned int nr_reclaimed = 0;
1a4e58cce   Minchan Kim   mm: introduce MAD...
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
  	LIST_HEAD(node_page_list);
  	struct reclaim_stat dummy_stat;
  	struct page *page;
  	struct scan_control sc = {
  		.gfp_mask = GFP_KERNEL,
  		.priority = DEF_PRIORITY,
  		.may_writepage = 1,
  		.may_unmap = 1,
  		.may_swap = 1,
  	};
  
  	while (!list_empty(page_list)) {
  		page = lru_to_page(page_list);
f661d007f   Yang Shi   mm: vmscan: repla...
2027
  		if (nid == NUMA_NO_NODE) {
1a4e58cce   Minchan Kim   mm: introduce MAD...
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
  			nid = page_to_nid(page);
  			INIT_LIST_HEAD(&node_page_list);
  		}
  
  		if (nid == page_to_nid(page)) {
  			ClearPageActive(page);
  			list_move(&page->lru, &node_page_list);
  			continue;
  		}
  
  		nr_reclaimed += shrink_page_list(&node_page_list,
  						NODE_DATA(nid),
dd156e3fc   Shakeel Butt   mm/rmap: always d...
2040
  						&sc, &dummy_stat, false);
1a4e58cce   Minchan Kim   mm: introduce MAD...
2041
2042
2043
2044
2045
  		while (!list_empty(&node_page_list)) {
  			page = lru_to_page(&node_page_list);
  			list_del(&page->lru);
  			putback_lru_page(page);
  		}
f661d007f   Yang Shi   mm: vmscan: repla...
2046
  		nid = NUMA_NO_NODE;
1a4e58cce   Minchan Kim   mm: introduce MAD...
2047
2048
2049
2050
2051
  	}
  
  	if (!list_empty(&node_page_list)) {
  		nr_reclaimed += shrink_page_list(&node_page_list,
  						NODE_DATA(nid),
dd156e3fc   Shakeel Butt   mm/rmap: always d...
2052
  						&sc, &dummy_stat, false);
1a4e58cce   Minchan Kim   mm: introduce MAD...
2053
2054
2055
2056
2057
2058
2059
2060
2061
  		while (!list_empty(&node_page_list)) {
  			page = lru_to_page(&node_page_list);
  			list_del(&page->lru);
  			putback_lru_page(page);
  		}
  	}
  
  	return nr_reclaimed;
  }
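
reclaim_pages() above is the reclaim backend behind the MADV_PAGEOUT madvise hint (the "mm: introduce MAD..." entries in the blame column): the madvise path isolates the hinted pages onto a list and hands them here in per-node batches. Below is a minimal, hypothetical userspace trigger, not part of this file; the fallback MADV_PAGEOUT value is the uapi constant and the buffer size is arbitrary.

/*
 * Standalone userspace sketch: hint an anonymous buffer with
 * MADV_PAGEOUT so the kernel isolates its pages and feeds them to
 * reclaim_pages().  The buffer size and the fallback advice value are
 * illustrative / taken from the uapi headers, not from this file.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21		/* asm-generic/mman-common.h, Linux 5.4+ */
#endif

int main(void)
{
	size_t len = 16 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0xaa, len);		/* fault the pages in */

	/* pages hinted here end up on the list passed to reclaim_pages() */
	if (madvise(buf, len, MADV_PAGEOUT))
		perror("madvise(MADV_PAGEOUT)");

	munmap(buf, len);
	return 0;
}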
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
  static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
  				 struct lruvec *lruvec, struct scan_control *sc)
  {
  	if (is_active_lru(lru)) {
  		if (sc->may_deactivate & (1 << is_file_lru(lru)))
  			shrink_active_list(nr_to_scan, lruvec, sc, lru);
  		else
  			sc->skipped_deactivate = 1;
  		return 0;
  	}
  
  	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
  }
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2075
2076
2077
  /*
   * The inactive anon list should be small enough that the VM never has
   * to do too much work.
14797e236   KOSAKI Motohiro   memcg: add inacti...
2078
   *
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2079
2080
2081
   * The inactive file list should be small enough to leave most memory
   * to the established workingset on the scan-resistant active list,
   * but large enough to avoid thrashing the aggregate readahead window.
56e49d218   Rik van Riel   vmscan: evict use...
2082
   *
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2083
2084
   * Both inactive lists should also be large enough that each inactive
   * page has a chance to be referenced again before it is reclaimed.
56e49d218   Rik van Riel   vmscan: evict use...
2085
   *
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2086
2087
   * If that fails and refaulting is observed, the inactive list grows.
   *
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2088
   * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
3a50d14d0   Andrey Ryabinin   mm: remove unused...
2089
   * on this LRU, maintained by the pageout code. An inactive_ratio
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2090
   * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
56e49d218   Rik van Riel   vmscan: evict use...
2091
   *
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
   * total     target    max
   * memory    ratio     inactive
   * -------------------------------------
   *   10MB       1         5MB
   *  100MB       1        50MB
   *    1GB       3       250MB
   *   10GB      10       0.9GB
   *  100GB      31         3GB
   *    1TB     101        10GB
   *   10TB     320        32GB
56e49d218   Rik van Riel   vmscan: evict use...
2102
   */
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2103
  static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
56e49d218   Rik van Riel   vmscan: evict use...
2104
  {
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2105
  	enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2106
2107
  	unsigned long inactive, active;
  	unsigned long inactive_ratio;
59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2108
  	unsigned long gb;
e3790144c   Johannes Weiner   mm: refactor inac...
2109

b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2110
2111
  	inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
  	active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
f8d1a3116   Mel Gorman   mm: consider whet...
2112

b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2113
  	gb = (inactive + active) >> (30 - PAGE_SHIFT);
4002570c5   Joonsoo Kim   mm/vmscan: restor...
2114
  	if (gb)
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2115
2116
2117
  		inactive_ratio = int_sqrt(10 * gb);
  	else
  		inactive_ratio = 1;
fd5388037   Michal Hocko   mm, vmscan: clean...
2118

59dc76b0d   Rik van Riel   mm: vmscan: reduc...
2119
  	return inactive * inactive_ratio < active;
b39415b27   Rik van Riel   vmscan: do not ev...
2120
  }
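
The table in the comment above can be reproduced with the same int_sqrt(10 * gb) formula. A minimal userspace sketch follows, using a plain integer square root in place of the kernel's int_sqrt() and expressing the totals in GiB (1TB = 1024GiB, which is why that row comes out as 101 rather than 100).

/*
 * Userspace sketch reproducing the inactive_ratio table above.
 * isqrt() is a stand-in for the kernel's int_sqrt(); the sizes are the
 * LRU totals from the table, expressed in GiB.
 */
#include <stdio.h>

static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* 10MB and 100MB round down to 0 GiB; 1TB is 1024 GiB, etc. */
	unsigned long sizes_gb[] = { 0, 0, 1, 10, 100, 1024, 10240 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
		unsigned long gb = sizes_gb[i];
		unsigned long ratio = gb ? isqrt(10 * gb) : 1;

		printf("%5lu GiB -> inactive_ratio %lu\n", gb, ratio);
	}
	return 0;
}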
9a2651140   Johannes Weiner   mm: vmscan: clean...
2121
2122
2123
2124
2125
2126
  enum scan_balance {
  	SCAN_EQUAL,
  	SCAN_FRACT,
  	SCAN_ANON,
  	SCAN_FILE,
  };
4f98a2fee   Rik van Riel   vmscan: split LRU...
2127
2128
2129
2130
2131
2132
  /*
   * Determine how aggressively the anon and file LRU lists should be
   * scanned.  The relative value of each set of LRU lists is determined
   * by looking at the fraction of the pages scanned we did rotate back
   * onto the active list instead of evict.
   *
be7bd59db   Wanpeng Li   mm: fix page recl...
2133
2134
   * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
   * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
4f98a2fee   Rik van Riel   vmscan: split LRU...
2135
   */
afaf07a65   Johannes Weiner   mm: vmscan: turn ...
2136
2137
  static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
  			   unsigned long *nr)
4f98a2fee   Rik van Riel   vmscan: split LRU...
2138
  {
afaf07a65   Johannes Weiner   mm: vmscan: turn ...
2139
  	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
d483a5dd0   Johannes Weiner   mm: vmscan: limit...
2140
  	unsigned long anon_cost, file_cost, total_cost;
333776785   Vladimir Davydov   mm: vmscan: pass ...
2141
  	int swappiness = mem_cgroup_swappiness(memcg);
ed0173733   Yu Zhao   mm: use self-expl...
2142
  	u64 fraction[ANON_AND_FILE];
9a2651140   Johannes Weiner   mm: vmscan: clean...
2143
  	u64 denominator = 0;	/* gcc */
9a2651140   Johannes Weiner   mm: vmscan: clean...
2144
  	enum scan_balance scan_balance;
4f98a2fee   Rik van Riel   vmscan: split LRU...
2145
  	unsigned long ap, fp;
4111304da   Hugh Dickins   mm: enum lru_list...
2146
  	enum lru_list lru;
76a33fc38   Shaohua Li   vmscan: prevent g...
2147
2148
  
  	/* If we have no swap space, do not bother scanning anon pages. */
d8b38438a   Vladimir Davydov   mm: vmscan: do no...
2149
  	if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
9a2651140   Johannes Weiner   mm: vmscan: clean...
2150
  		scan_balance = SCAN_FILE;
76a33fc38   Shaohua Li   vmscan: prevent g...
2151
2152
  		goto out;
  	}
4f98a2fee   Rik van Riel   vmscan: split LRU...
2153

10316b313   Johannes Weiner   mm: vmscan: clari...
2154
2155
2156
2157
2158
2159
2160
  	/*
  	 * Global reclaim will swap to prevent OOM even with no
  	 * swappiness, but memcg users want to use this knob to
  	 * disable swapping for individual groups completely when
  	 * using the memory controller's swap limit feature would be
  	 * too expensive.
  	 */
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
2161
  	if (cgroup_reclaim(sc) && !swappiness) {
9a2651140   Johannes Weiner   mm: vmscan: clean...
2162
  		scan_balance = SCAN_FILE;
10316b313   Johannes Weiner   mm: vmscan: clari...
2163
2164
2165
2166
2167
2168
2169
2170
  		goto out;
  	}
  
  	/*
  	 * Do not apply any pressure balancing cleverness when the
  	 * system is close to OOM, scan both anon and file equally
  	 * (unless the swappiness setting disagrees with swapping).
  	 */
02695175c   Johannes Weiner   mm: vmscan: move ...
2171
  	if (!sc->priority && swappiness) {
9a2651140   Johannes Weiner   mm: vmscan: clean...
2172
  		scan_balance = SCAN_EQUAL;
10316b313   Johannes Weiner   mm: vmscan: clari...
2173
2174
  		goto out;
  	}
11d16c25b   Johannes Weiner   mm: vmscan: impro...
2175
  	/*
53138cea7   Johannes Weiner   mm: vmscan: move ...
2176
  	 * If the system is almost out of file pages, force-scan anon.
623762517   Johannes Weiner   revert "mm: vmsca...
2177
  	 */
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2178
  	if (sc->file_is_tiny) {
53138cea7   Johannes Weiner   mm: vmscan: move ...
2179
2180
  		scan_balance = SCAN_ANON;
  		goto out;
623762517   Johannes Weiner   revert "mm: vmsca...
2181
2182
2183
  	}
  
  	/*
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2184
2185
  	 * If there is enough inactive page cache, we do not reclaim
  	 * anything from the anonymous working set right now.
7c5bd705d   Johannes Weiner   mm: memcg: only e...
2186
  	 */
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2187
  	if (sc->cache_trim_mode) {
9a2651140   Johannes Weiner   mm: vmscan: clean...
2188
  		scan_balance = SCAN_FILE;
7c5bd705d   Johannes Weiner   mm: memcg: only e...
2189
2190
  		goto out;
  	}
9a2651140   Johannes Weiner   mm: vmscan: clean...
2191
  	scan_balance = SCAN_FRACT;
7c5bd705d   Johannes Weiner   mm: memcg: only e...
2192
  	/*
314b57fb0   Johannes Weiner   mm: balance LRU l...
2193
2194
2195
2196
2197
2198
2199
2200
  	 * Calculate the pressure balance between anon and file pages.
  	 *
  	 * The amount of pressure we put on each LRU is inversely
  	 * proportional to the cost of reclaiming each list, as
  	 * determined by the share of pages that are refaulting, times
  	 * the relative IO cost of bringing back a swapped out
  	 * anonymous page vs reloading a filesystem page (swappiness).
  	 *
d483a5dd0   Johannes Weiner   mm: vmscan: limit...
2201
2202
2203
2204
  	 * Although we limit that influence to ensure no list gets
  	 * left behind completely: at least a third of the pressure is
  	 * applied, before swappiness.
  	 *
314b57fb0   Johannes Weiner   mm: balance LRU l...
2205
  	 * With swappiness at 100, anon and file have equal IO cost.
58c37f6e0   KOSAKI Motohiro   vmscan: protect r...
2206
  	 */
d483a5dd0   Johannes Weiner   mm: vmscan: limit...
2207
2208
2209
2210
  	total_cost = sc->anon_cost + sc->file_cost;
  	anon_cost = total_cost + sc->anon_cost;
  	file_cost = total_cost + sc->file_cost;
  	total_cost = anon_cost + file_cost;
58c37f6e0   KOSAKI Motohiro   vmscan: protect r...
2211

d483a5dd0   Johannes Weiner   mm: vmscan: limit...
2212
2213
  	ap = swappiness * (total_cost + 1);
  	ap /= anon_cost + 1;
4f98a2fee   Rik van Riel   vmscan: split LRU...
2214

d483a5dd0   Johannes Weiner   mm: vmscan: limit...
2215
2216
  	fp = (200 - swappiness) * (total_cost + 1);
  	fp /= file_cost + 1;
4f98a2fee   Rik van Riel   vmscan: split LRU...
2217

76a33fc38   Shaohua Li   vmscan: prevent g...
2218
2219
  	fraction[0] = ap;
  	fraction[1] = fp;
a4fe1631f   Johannes Weiner   mm: vmscan: drop ...
2220
  	denominator = ap + fp;
76a33fc38   Shaohua Li   vmscan: prevent g...
2221
  out:
688035f72   Johannes Weiner   mm: don't avoid h...
2222
2223
  	for_each_evictable_lru(lru) {
  		int file = is_file_lru(lru);
9783aa991   Chris Down   mm, memcg: propor...
2224
  		unsigned long lruvec_size;
688035f72   Johannes Weiner   mm: don't avoid h...
2225
  		unsigned long scan;
1bc63fb12   Chris Down   mm, memcg: make s...
2226
  		unsigned long protection;
9783aa991   Chris Down   mm, memcg: propor...
2227
2228
  
  		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
22f7496f0   Yafang Shao   mm, memcg: avoid ...
2229
2230
  		protection = mem_cgroup_protection(sc->target_mem_cgroup,
  						   memcg,
1bc63fb12   Chris Down   mm, memcg: make s...
2231
  						   sc->memcg_low_reclaim);
9783aa991   Chris Down   mm, memcg: propor...
2232

1bc63fb12   Chris Down   mm, memcg: make s...
2233
  		if (protection) {
9783aa991   Chris Down   mm, memcg: propor...
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
  			/*
  			 * Scale a cgroup's reclaim pressure by proportioning
  			 * its current usage to its memory.low or memory.min
  			 * setting.
  			 *
  			 * This is important, as otherwise scanning aggression
  			 * becomes extremely binary -- from nothing as we
  			 * approach the memory protection threshold, to totally
  			 * nominal as we exceed it.  This results in requiring
  			 * setting extremely liberal protection thresholds. It
  			 * also means we simply get no protection at all if we
  			 * set it too low, which is not ideal.
1bc63fb12   Chris Down   mm, memcg: make s...
2246
2247
2248
2249
  			 *
  			 * If there is any protection in place, we reduce scan
  			 * pressure by how much of the total memory used is
  			 * within protection thresholds.
9783aa991   Chris Down   mm, memcg: propor...
2250
  			 *
9de7ca46a   Chris Down   mm, memcg: make m...
2251
2252
2253
2254
2255
2256
2257
2258
  			 * There is one special case: in the first reclaim pass,
  			 * we skip over all groups that are within their low
  			 * protection. If that fails to reclaim enough pages to
  			 * satisfy the reclaim goal, we come back and override
  			 * the best-effort low protection. However, we still
  			 * ideally want to honor how well-behaved groups are in
  			 * that case instead of simply punishing them all
  			 * equally. As such, we reclaim them based on how much
1bc63fb12   Chris Down   mm, memcg: make s...
2259
2260
2261
  			 * memory they are using, reducing the scan pressure
  			 * again by how much of the total memory used is under
  			 * hard protection.
9783aa991   Chris Down   mm, memcg: propor...
2262
  			 */
1bc63fb12   Chris Down   mm, memcg: make s...
2263
2264
2265
2266
2267
2268
2269
  			unsigned long cgroup_size = mem_cgroup_size(memcg);
  
  			/* Avoid TOCTOU with earlier protection check */
  			cgroup_size = max(cgroup_size, protection);
  
  			scan = lruvec_size - lruvec_size * protection /
  				cgroup_size;
9783aa991   Chris Down   mm, memcg: propor...
2270
2271
  
  			/*
1bc63fb12   Chris Down   mm, memcg: make s...
2272
  			 * Minimally target SWAP_CLUSTER_MAX pages to keep
55b65a57c   Ethon Paul   mm/vmsan: fix som...
2273
  			 * reclaim moving forwards, avoiding decrementing
9de7ca46a   Chris Down   mm, memcg: make m...
2274
  			 * sc->priority further than desirable.
9783aa991   Chris Down   mm, memcg: propor...
2275
  			 */
1bc63fb12   Chris Down   mm, memcg: make s...
2276
  			scan = max(scan, SWAP_CLUSTER_MAX);
9783aa991   Chris Down   mm, memcg: propor...
2277
2278
2279
2280
2281
  		} else {
  			scan = lruvec_size;
  		}
  
  		scan >>= sc->priority;
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
2282

688035f72   Johannes Weiner   mm: don't avoid h...
2283
2284
2285
2286
2287
  		/*
  		 * If the cgroup's already been deleted, make sure to
  		 * scrape out the remaining cache.
  		 */
  		if (!scan && !mem_cgroup_online(memcg))
9783aa991   Chris Down   mm, memcg: propor...
2288
  			scan = min(lruvec_size, SWAP_CLUSTER_MAX);
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
2289

688035f72   Johannes Weiner   mm: don't avoid h...
2290
2291
2292
2293
2294
  		switch (scan_balance) {
  		case SCAN_EQUAL:
  			/* Scan lists relative to size */
  			break;
  		case SCAN_FRACT:
9a2651140   Johannes Weiner   mm: vmscan: clean...
2295
  			/*
688035f72   Johannes Weiner   mm: don't avoid h...
2296
2297
  			 * Scan types proportional to swappiness and
  			 * their relative recent reclaim efficiency.
76073c646   Gavin Shan   mm/vmscan.c: don'...
2298
2299
2300
  			 * Make sure we don't miss the last page on
  			 * the offlined memory cgroups because of a
  			 * round-off error.
9a2651140   Johannes Weiner   mm: vmscan: clean...
2301
  			 */
76073c646   Gavin Shan   mm/vmscan.c: don'...
2302
2303
2304
  			scan = mem_cgroup_online(memcg) ?
  			       div64_u64(scan * fraction[file], denominator) :
  			       DIV64_U64_ROUND_UP(scan * fraction[file],
68600f623   Roman Gushchin   mm: don't miss th...
2305
  						  denominator);
688035f72   Johannes Weiner   mm: don't avoid h...
2306
2307
2308
2309
  			break;
  		case SCAN_FILE:
  		case SCAN_ANON:
  			/* Scan one type exclusively */
e072bff60   Mateusz Nosek   mm/vmscan.c: clea...
2310
  			if ((scan_balance == SCAN_FILE) != file)
688035f72   Johannes Weiner   mm: don't avoid h...
2311
  				scan = 0;
688035f72   Johannes Weiner   mm: don't avoid h...
2312
2313
2314
2315
  			break;
  		default:
  			/* Look ma, no brain */
  			BUG();
9a2651140   Johannes Weiner   mm: vmscan: clean...
2316
  		}
688035f72   Johannes Weiner   mm: don't avoid h...
2317

688035f72   Johannes Weiner   mm: don't avoid h...
2318
  		nr[lru] = scan;
76a33fc38   Shaohua Li   vmscan: prevent g...
2319
  	}
6e08a369e   Wu Fengguang   vmscan: cleanup t...
2320
  }
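
As a concrete illustration of the SCAN_FRACT arithmetic above, the sketch below runs the same cost biasing with made-up anon/file cost values; only swappiness and the formulas come from this function, everything else is illustrative.

/*
 * Userspace sketch of the SCAN_FRACT split above.  The cost values are
 * made up; the double counting (each side's cost includes the total)
 * is what keeps at least about a third of the pre-swappiness pressure
 * on the costlier list.
 */
#include <stdio.h>

int main(void)
{
	unsigned long swappiness = 60;		/* vm.swappiness default */
	unsigned long sc_anon_cost = 100;	/* illustrative */
	unsigned long sc_file_cost = 300;	/* illustrative */

	unsigned long total_cost = sc_anon_cost + sc_file_cost;
	unsigned long anon_cost = total_cost + sc_anon_cost;	/* 500 */
	unsigned long file_cost = total_cost + sc_file_cost;	/* 700 */

	total_cost = anon_cost + file_cost;			/* 1200 */

	unsigned long ap = swappiness * (total_cost + 1) / (anon_cost + 1);
	unsigned long fp = (200 - swappiness) * (total_cost + 1) / (file_cost + 1);

	/* ap:fp is the fraction[] ratio applied per LRU in the out: loop */
	printf("ap=%lu fp=%lu -> anon gets %.0f%% of the scan pressure\n",
	       ap, fp, 100.0 * ap / (ap + fp));
	return 0;
}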
4f98a2fee   Rik van Riel   vmscan: split LRU...
2321

afaf07a65   Johannes Weiner   mm: vmscan: turn ...
2322
  static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2323
2324
  {
  	unsigned long nr[NR_LRU_LISTS];
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2325
  	unsigned long targets[NR_LRU_LISTS];
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2326
2327
2328
2329
2330
  	unsigned long nr_to_scan;
  	enum lru_list lru;
  	unsigned long nr_reclaimed = 0;
  	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
  	struct blk_plug plug;
1a501907b   Mel Gorman   mm: vmscan: use p...
2331
  	bool scan_adjusted;
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2332

afaf07a65   Johannes Weiner   mm: vmscan: turn ...
2333
  	get_scan_count(lruvec, sc, nr);
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2334

e82e0561d   Mel Gorman   mm: vmscan: obey ...
2335
2336
  	/* Record the original scan target for proportional adjustments later */
  	memcpy(targets, nr, sizeof(nr));
1a501907b   Mel Gorman   mm: vmscan: use p...
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
  	/*
  	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
  	 * event that can occur when there is little memory pressure e.g.
  	 * multiple streaming readers/writers. Hence, we do not abort scanning
  	 * once the requested number of pages has been reclaimed when scanning
  	 * at DEF_PRIORITY, on the assumption that the fact we are direct
  	 * reclaiming implies that kswapd is not keeping up and it is best to
  	 * do a batch of work at once. For memcg reclaim one check is made to
  	 * abort proportional reclaim if either the file or anon lru has already
  	 * dropped to zero at the first pass.
  	 */
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
2348
  	scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
1a501907b   Mel Gorman   mm: vmscan: use p...
2349
  			 sc->priority == DEF_PRIORITY);
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2350
2351
2352
  	blk_start_plug(&plug);
  	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
  					nr[LRU_INACTIVE_FILE]) {
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2353
2354
  		unsigned long nr_anon, nr_file, percentage;
  		unsigned long nr_scanned;
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2355
2356
2357
2358
2359
2360
  		for_each_evictable_lru(lru) {
  			if (nr[lru]) {
  				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
  				nr[lru] -= nr_to_scan;
  
  				nr_reclaimed += shrink_list(lru, nr_to_scan,
3b991208b   Johannes Weiner   mm: fix inactive ...
2361
  							    lruvec, sc);
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2362
2363
  			}
  		}
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2364

bd041733c   Michal Hocko   mm, vmscan: add c...
2365
  		cond_resched();
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2366
2367
  		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
  			continue;
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2368
  		/*
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2369
  		 * For kswapd and memcg, reclaim at least the number of pages
1a501907b   Mel Gorman   mm: vmscan: use p...
2370
  		 * requested. Ensure that the anon and file LRUs are scanned
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2371
2372
2373
2374
2375
2376
  		 * proportionally to what was requested by get_scan_count(). We
  		 * stop reclaiming one LRU and reduce the amount of scanning
  		 * proportional to the original scan target.
  		 */
  		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
  		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
1a501907b   Mel Gorman   mm: vmscan: use p...
2377
2378
2379
2380
2381
2382
2383
2384
  		/*
  		 * It's just vindictive to attack the larger once the smaller
  		 * has gone to zero.  And given the way we stop scanning the
  		 * smaller below, this makes sure that we only make one nudge
  		 * towards proportionality once we've got nr_to_reclaim.
  		 */
  		if (!nr_file || !nr_anon)
  			break;
e82e0561d   Mel Gorman   mm: vmscan: obey ...
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
  		if (nr_file > nr_anon) {
  			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
  						targets[LRU_ACTIVE_ANON] + 1;
  			lru = LRU_BASE;
  			percentage = nr_anon * 100 / scan_target;
  		} else {
  			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
  						targets[LRU_ACTIVE_FILE] + 1;
  			lru = LRU_FILE;
  			percentage = nr_file * 100 / scan_target;
  		}
  
  		/* Stop scanning the smaller of the LRU */
  		nr[lru] = 0;
  		nr[lru + LRU_ACTIVE] = 0;
  
  		/*
  		 * Recalculate the other LRU scan count based on its original
  		 * scan target and the percentage scanning already complete
  		 */
  		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
  		nr_scanned = targets[lru] - nr[lru];
  		nr[lru] = targets[lru] * (100 - percentage) / 100;
  		nr[lru] -= min(nr[lru], nr_scanned);
  
  		lru += LRU_ACTIVE;
  		nr_scanned = targets[lru] - nr[lru];
  		nr[lru] = targets[lru] * (100 - percentage) / 100;
  		nr[lru] -= min(nr[lru], nr_scanned);
  
  		scan_adjusted = true;
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2416
2417
2418
2419
2420
2421
2422
2423
  	}
  	blk_finish_plug(&plug);
  	sc->nr_reclaimed += nr_reclaimed;
  
  	/*
  	 * Even if we did not try to evict anon pages at all, we want to
  	 * rebalance the anon lru active/inactive ratio.
  	 */
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2424
  	if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2425
2426
  		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
  				   sc, LRU_ACTIVE_ANON);
9b4f98cda   Johannes Weiner   mm: vmscan: compa...
2427
  }
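
The proportional-stop step in the loop above is easiest to see with numbers. The sketch below applies the same percentage rescaling to made-up targets; the kernel does this per inactive/active list, the sketch combines them for brevity.

/*
 * Userspace sketch of the rescaling done once nr_to_reclaim is met:
 * the smaller LRU stops, and the larger one is trimmed so that both
 * end up scanned to the same fraction of their original targets.
 * All numbers are made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned long target_anon = 400, target_file = 1600;	/* illustrative scan targets */
	unsigned long nr_anon = 100, nr_file = 1300;		/* work still queued */

	/* anon is the smaller list: how much of its target is left? */
	unsigned long percentage = nr_anon * 100 / (target_anon + 1);

	/* stop anon entirely, rescale file to anon's completion level */
	unsigned long scanned_file = target_file - nr_file;
	unsigned long new_file = target_file * (100 - percentage) / 100;

	new_file -= (new_file < scanned_file) ? new_file : scanned_file;

	printf("anon stopped at %lu%% done, file scans %lu more pages\n",
	       100 - percentage, new_file);
	return 0;
}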
23b9da55c   Mel Gorman   mm: vmscan: remov...
2428
  /* Use reclaim/compaction for costly allocs or under memory pressure */
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
2429
  static bool in_reclaim_compaction(struct scan_control *sc)
23b9da55c   Mel Gorman   mm: vmscan: remov...
2430
  {
d84da3f9e   Kirill A. Shutemov   mm: use IS_ENABLE...
2431
  	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
23b9da55c   Mel Gorman   mm: vmscan: remov...
2432
  			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
2433
  			 sc->priority < DEF_PRIORITY - 2))
23b9da55c   Mel Gorman   mm: vmscan: remov...
2434
2435
2436
2437
  		return true;
  
  	return false;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2438
  /*
23b9da55c   Mel Gorman   mm: vmscan: remov...
2439
2440
2441
   * Reclaim/compaction is used for high-order allocation requests. It reclaims
   * order-0 pages before compacting the zone. should_continue_reclaim() returns
   * true if more pages should be reclaimed such that when the page allocator
df3a45f9d   Qiwu Chen   mm/vmscan: update...
2442
   * calls try_to_compact_pages(), it will have enough free pages to succeed.
23b9da55c   Mel Gorman   mm: vmscan: remov...
2443
   * It will give up earlier than that if there is difficulty reclaiming pages.
3e7d34497   Mel Gorman   mm: vmscan: recla...
2444
   */
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2445
  static inline bool should_continue_reclaim(struct pglist_data *pgdat,
3e7d34497   Mel Gorman   mm: vmscan: recla...
2446
  					unsigned long nr_reclaimed,
3e7d34497   Mel Gorman   mm: vmscan: recla...
2447
2448
2449
2450
  					struct scan_control *sc)
  {
  	unsigned long pages_for_compaction;
  	unsigned long inactive_lru_pages;
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2451
  	int z;
3e7d34497   Mel Gorman   mm: vmscan: recla...
2452
2453
  
  	/* If not in reclaim/compaction mode, stop */
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
2454
  	if (!in_reclaim_compaction(sc))
3e7d34497   Mel Gorman   mm: vmscan: recla...
2455
  		return false;
5ee04716c   Vlastimil Babka   mm, reclaim: clea...
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
  	/*
  	 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
  	 * number of pages that were scanned. This returns to the caller with
  	 * the risk that reclaim/compaction and the resulting allocation
  	 * attempt fail. In the past we have tried harder for __GFP_RETRY_MAYFAIL
  	 * allocations through requiring that the full LRU list has been scanned
  	 * first, by assuming that zero delta of sc->nr_scanned means full LRU
  	 * scan, but that approximation was wrong, and there were corner cases
  	 * where a non-zero number of pages was always scanned.
  	 */
  	if (!nr_reclaimed)
  		return false;
3e7d34497   Mel Gorman   mm: vmscan: recla...
2468

3e7d34497   Mel Gorman   mm: vmscan: recla...
2469
  	/* If compaction would go ahead or the allocation would succeed, stop */
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2470
2471
  	for (z = 0; z <= sc->reclaim_idx; z++) {
  		struct zone *zone = &pgdat->node_zones[z];
6aa303def   Mel Gorman   mm, vmscan: only ...
2472
  		if (!managed_zone(zone))
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2473
2474
2475
  			continue;
  
  		switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
cf378319d   Vlastimil Babka   mm, compaction: r...
2476
  		case COMPACT_SUCCESS:
a9dd0a831   Mel Gorman   mm, vmscan: make ...
2477
2478
2479
2480
2481
2482
  		case COMPACT_CONTINUE:
  			return false;
  		default:
  			/* check next zone */
  			;
  		}
3e7d34497   Mel Gorman   mm: vmscan: recla...
2483
  	}
1c6c15971   Hillf Danton   mm, reclaim: make...
2484
2485
2486
2487
2488
2489
2490
2491
2492
  
  	/*
  	 * If we have not reclaimed enough pages for compaction and the
  	 * inactive lists are large enough, continue reclaiming
  	 */
  	pages_for_compaction = compact_gap(sc->order);
  	inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
  	if (get_nr_swap_pages() > 0)
  		inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
5ee04716c   Vlastimil Babka   mm, reclaim: clea...
2493
  	return inactive_lru_pages > pages_for_compaction;
3e7d34497   Mel Gorman   mm: vmscan: recla...
2494
  }
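
For scale: compact_gap() in this era of the tree resolves to twice the request size (see include/linux/compaction.h), so an order-9 THP request keeps reclaim going until roughly 1024 inactive pages are available. A tiny sketch of that threshold, with an assumed inactive count:

/*
 * Sketch of the "enough inactive pages for compaction?" check above,
 * assuming compact_gap(order) == 2UL << order as in this era's
 * include/linux/compaction.h.  The inactive count is made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned int order = 9;					/* 2MB THP with 4K pages */
	unsigned long pages_for_compaction = 2UL << order;	/* 1024 pages */
	unsigned long inactive_lru_pages = 800;			/* illustrative */

	printf("need %lu pages, have %lu -> %s reclaiming\n",
	       pages_for_compaction, inactive_lru_pages,
	       inactive_lru_pages > pages_for_compaction ? "continue" : "stop");
	return 0;
}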
0f6a5cff4   Johannes Weiner   mm: vmscan: split...
2495
  static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2496
  {
0f6a5cff4   Johannes Weiner   mm: vmscan: split...
2497
  	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
d2af33970   Johannes Weiner   mm: vmscan: repla...
2498
  	struct mem_cgroup *memcg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2499

0f6a5cff4   Johannes Weiner   mm: vmscan: split...
2500
  	memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
d2af33970   Johannes Weiner   mm: vmscan: repla...
2501
  	do {
afaf07a65   Johannes Weiner   mm: vmscan: turn ...
2502
  		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
d2af33970   Johannes Weiner   mm: vmscan: repla...
2503
2504
  		unsigned long reclaimed;
  		unsigned long scanned;
5660048cc   Johannes Weiner   mm: move memcg hi...
2505

e3336cab2   Xunlei Pang   mm: memcg: fix me...
2506
2507
2508
2509
2510
2511
2512
  		/*
  		 * This loop can become CPU-bound when target memcgs
  		 * aren't eligible for reclaim - either because they
  		 * don't have any reclaimable pages, or because their
  		 * memory is explicitly protected. Avoid soft lockups.
  		 */
  		cond_resched();
45c7f7e1e   Chris Down   mm, memcg: decoup...
2513
2514
2515
  		mem_cgroup_calculate_protection(target_memcg, memcg);
  
  		if (mem_cgroup_below_min(memcg)) {
d2af33970   Johannes Weiner   mm: vmscan: repla...
2516
2517
2518
2519
2520
  			/*
  			 * Hard protection.
  			 * If there is no reclaimable memory, OOM.
  			 */
  			continue;
45c7f7e1e   Chris Down   mm, memcg: decoup...
2521
  		} else if (mem_cgroup_below_low(memcg)) {
d2af33970   Johannes Weiner   mm: vmscan: repla...
2522
2523
2524
2525
2526
2527
2528
2529
  			/*
  			 * Soft protection.
  			 * Respect the protection only as long as
  			 * there is an unprotected supply
  			 * of reclaimable memory from other cgroups.
  			 */
  			if (!sc->memcg_low_reclaim) {
  				sc->memcg_low_skipped = 1;
bf8d5d52f   Roman Gushchin   memcg: introduce ...
2530
  				continue;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2531
  			}
d2af33970   Johannes Weiner   mm: vmscan: repla...
2532
  			memcg_memory_event(memcg, MEMCG_LOW);
d2af33970   Johannes Weiner   mm: vmscan: repla...
2533
  		}
241994ed8   Johannes Weiner   mm: memcontrol: d...
2534

d2af33970   Johannes Weiner   mm: vmscan: repla...
2535
2536
  		reclaimed = sc->nr_reclaimed;
  		scanned = sc->nr_scanned;
afaf07a65   Johannes Weiner   mm: vmscan: turn ...
2537
2538
  
  		shrink_lruvec(lruvec, sc);
70ddf637e   Anton Vorontsov   memcg: add memory...
2539

d2af33970   Johannes Weiner   mm: vmscan: repla...
2540
2541
  		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
  			    sc->priority);
6b4f7799c   Johannes Weiner   mm: vmscan: invok...
2542

d2af33970   Johannes Weiner   mm: vmscan: repla...
2543
2544
2545
2546
  		/* Record the group's reclaim efficiency */
  		vmpressure(sc->gfp_mask, memcg, false,
  			   sc->nr_scanned - scanned,
  			   sc->nr_reclaimed - reclaimed);
70ddf637e   Anton Vorontsov   memcg: add memory...
2547

0f6a5cff4   Johannes Weiner   mm: vmscan: split...
2548
2549
  	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
  }
6c9e0907f   Liu Song   mm/vmscan.c: remo...
2550
  static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
0f6a5cff4   Johannes Weiner   mm: vmscan: split...
2551
2552
  {
  	struct reclaim_state *reclaim_state = current->reclaim_state;
0f6a5cff4   Johannes Weiner   mm: vmscan: split...
2553
  	unsigned long nr_reclaimed, nr_scanned;
1b05117df   Johannes Weiner   mm: vmscan: harmo...
2554
  	struct lruvec *target_lruvec;
0f6a5cff4   Johannes Weiner   mm: vmscan: split...
2555
  	bool reclaimable = false;
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2556
  	unsigned long file;
0f6a5cff4   Johannes Weiner   mm: vmscan: split...
2557

1b05117df   Johannes Weiner   mm: vmscan: harmo...
2558
  	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
0f6a5cff4   Johannes Weiner   mm: vmscan: split...
2559
2560
2561
2562
2563
  again:
  	memset(&sc->nr, 0, sizeof(sc->nr));
  
  	nr_reclaimed = sc->nr_reclaimed;
  	nr_scanned = sc->nr_scanned;
53138cea7   Johannes Weiner   mm: vmscan: move ...
2564
  	/*
7cf111bc3   Johannes Weiner   mm: vmscan: deter...
2565
2566
2567
2568
2569
2570
2571
2572
  	 * Determine the scan balance between anon and file LRUs.
  	 */
  	spin_lock_irq(&pgdat->lru_lock);
  	sc->anon_cost = target_lruvec->anon_cost;
  	sc->file_cost = target_lruvec->file_cost;
  	spin_unlock_irq(&pgdat->lru_lock);
  
  	/*
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2573
2574
2575
2576
2577
  	 * Target desirable inactive:active list ratios for the anon
  	 * and file LRU lists.
  	 */
  	if (!sc->force_deactivate) {
  		unsigned long refaults;
170b04b7a   Joonsoo Kim   mm/workingset: pr...
2578
2579
2580
2581
  		refaults = lruvec_page_state(target_lruvec,
  				WORKINGSET_ACTIVATE_ANON);
  		if (refaults != target_lruvec->refaults[0] ||
  			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
  			sc->may_deactivate |= DEACTIVATE_ANON;
  		else
  			sc->may_deactivate &= ~DEACTIVATE_ANON;
  
  		/*
  		 * When refaults are being observed, it means a new
  		 * workingset is being established. Deactivate to get
  		 * rid of any stale active pages quickly.
  		 */
  		refaults = lruvec_page_state(target_lruvec,
170b04b7a   Joonsoo Kim   mm/workingset: pr...
2592
2593
  				WORKINGSET_ACTIVATE_FILE);
  		if (refaults != target_lruvec->refaults[1] ||
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
  		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
  			sc->may_deactivate |= DEACTIVATE_FILE;
  		else
  			sc->may_deactivate &= ~DEACTIVATE_FILE;
  	} else
  		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
  
  	/*
  	 * If we have plenty of inactive file pages that aren't
  	 * thrashing, try to reclaim those first before touching
  	 * anonymous pages.
  	 */
  	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
  	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
  		sc->cache_trim_mode = 1;
  	else
  		sc->cache_trim_mode = 0;
  
  	/*
53138cea7   Johannes Weiner   mm: vmscan: move ...
2613
2614
2615
2616
2617
2618
2619
2620
2621
  	 * Prevent the reclaimer from falling into the cache trap: as
  	 * cache pages start out inactive, every cache fault will tip
  	 * the scan balance towards the file LRU.  And as the file LRU
  	 * shrinks, so does the window for rotation from references.
  	 * This means we have a runaway feedback loop where a tiny
  	 * thrashing file LRU becomes infinitely more attractive than
  	 * anon pages.  Try to detect this based on file LRU size.
  	 */
  	if (!cgroup_reclaim(sc)) {
53138cea7   Johannes Weiner   mm: vmscan: move ...
2622
  		unsigned long total_high_wmark = 0;
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2623
2624
  		unsigned long free, anon;
  		int z;
53138cea7   Johannes Weiner   mm: vmscan: move ...
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
  
  		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
  		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
  			   node_page_state(pgdat, NR_INACTIVE_FILE);
  
  		for (z = 0; z < MAX_NR_ZONES; z++) {
  			struct zone *zone = &pgdat->node_zones[z];
  			if (!managed_zone(zone))
  				continue;
  
  			total_high_wmark += high_wmark_pages(zone);
  		}
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
  		/*
  		 * Consider anon: if that's low too, this isn't a
  		 * runaway file reclaim problem, but rather just
  		 * extreme pressure. Reclaim as per usual then.
  		 */
  		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
  
  		sc->file_is_tiny =
  			file + free <= total_high_wmark &&
  			!(sc->may_deactivate & DEACTIVATE_ANON) &&
  			anon >> sc->priority;
53138cea7   Johannes Weiner   mm: vmscan: move ...
2648
  	}
0f6a5cff4   Johannes Weiner   mm: vmscan: split...
2649
  	shrink_node_memcgs(pgdat, sc);
2344d7e44   Johannes Weiner   mm: vmscan: remov...
2650

d2af33970   Johannes Weiner   mm: vmscan: repla...
2651
2652
2653
2654
  	if (reclaim_state) {
  		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
  		reclaim_state->reclaimed_slab = 0;
  	}
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
2655

d2af33970   Johannes Weiner   mm: vmscan: repla...
2656
  	/* Record the subtree's reclaim efficiency */
1b05117df   Johannes Weiner   mm: vmscan: harmo...
2657
  	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
d2af33970   Johannes Weiner   mm: vmscan: repla...
2658
2659
  		   sc->nr_scanned - nr_scanned,
  		   sc->nr_reclaimed - nr_reclaimed);
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
2660

d2af33970   Johannes Weiner   mm: vmscan: repla...
2661
2662
  	if (sc->nr_reclaimed - nr_reclaimed)
  		reclaimable = true;
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
2663

d2af33970   Johannes Weiner   mm: vmscan: repla...
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
  	if (current_is_kswapd()) {
  		/*
  		 * If reclaim is isolating dirty pages under writeback,
  		 * it implies that the long-lived page allocation rate
  		 * is exceeding the page laundering rate. Either the
  		 * global limits are not being effective at throttling
  		 * processes due to the page distribution throughout
  		 * zones or there is heavy usage of a slow backing
  		 * device. The only option is to throttle from reclaim
  		 * context which is not ideal as there is no guarantee
  		 * the dirtying process is throttled in the same way
  		 * balance_dirty_pages() manages.
  		 *
  		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
  		 * count the number of pages under writeback that are flagged
  		 * for immediate reclaim and stall if any are encountered in
  		 * the nr_immediate check below.
  		 */
  		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
  			set_bit(PGDAT_WRITEBACK, &pgdat->flags);
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
2684

d2af33970   Johannes Weiner   mm: vmscan: repla...
2685
2686
2687
  		/* Allow kswapd to start writing pages during reclaim. */
  		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
  			set_bit(PGDAT_DIRTY, &pgdat->flags);
e3c1ac586   Andrey Ryabinin   mm/vmscan: don't ...
2688
2689
  
  		/*
1eba09c15   Randy Dunlap   mm/vmscan.c: dele...
2690
  		 * If kswapd scans pages marked for immediate
d2af33970   Johannes Weiner   mm: vmscan: repla...
2691
2692
2693
  		 * reclaim and under writeback (nr_immediate), it
  		 * implies that pages are cycling through the LRU
  		 * faster than they are written so also forcibly stall.
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
2694
  		 */
d2af33970   Johannes Weiner   mm: vmscan: repla...
2695
2696
2697
2698
2699
  		if (sc->nr.immediate)
  			congestion_wait(BLK_RW_ASYNC, HZ/10);
  	}
  
  	/*
1b05117df   Johannes Weiner   mm: vmscan: harmo...
2700
2701
2702
2703
  	 * Tag a node/memcg as congested if all the dirty pages
  	 * scanned were backed by a congested BDI and
  	 * wait_iff_congested will stall.
  	 *
d2af33970   Johannes Weiner   mm: vmscan: repla...
2704
2705
2706
  	 * Legacy memcg will stall in page writeback so avoid forcibly
  	 * stalling in wait_iff_congested().
  	 */
1b05117df   Johannes Weiner   mm: vmscan: harmo...
2707
2708
  	if ((current_is_kswapd() ||
  	     (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
d2af33970   Johannes Weiner   mm: vmscan: repla...
2709
  	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
1b05117df   Johannes Weiner   mm: vmscan: harmo...
2710
  		set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
d2af33970   Johannes Weiner   mm: vmscan: repla...
2711
2712
2713
2714
2715
2716
2717
  
  	/*
  	 * Stall direct reclaim for IO completions if the underlying BDIs
  	 * and the node are congested. Allow kswapd to continue until it
  	 * starts encountering unqueued dirty pages or cycling through
  	 * the LRU too quickly.
  	 */
1b05117df   Johannes Weiner   mm: vmscan: harmo...
2718
2719
2720
  	if (!current_is_kswapd() && current_may_throttle() &&
  	    !sc->hibernation_mode &&
  	    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
d2af33970   Johannes Weiner   mm: vmscan: repla...
2721
  		wait_iff_congested(BLK_RW_ASYNC, HZ/10);
d108c7721   Andrey Ryabinin   mm/vmscan: don't ...
2722

d2af33970   Johannes Weiner   mm: vmscan: repla...
2723
2724
2725
  	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
  				    sc))
  		goto again;
2344d7e44   Johannes Weiner   mm: vmscan: remov...
2726

c73322d09   Johannes Weiner   mm: fix 100% CPU ...
2727
2728
2729
2730
2731
2732
2733
2734
  	/*
  	 * Kswapd gives up on balancing particular nodes after too
  	 * many failures to reclaim anything from them and goes to
  	 * sleep. On reclaim progress, reset the failure counter. A
  	 * successful direct reclaim run will revive a dormant kswapd.
  	 */
  	if (reclaimable)
  		pgdat->kswapd_failures = 0;
f16015fbf   Johannes Weiner   mm: vmscan: disti...
2735
  }
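
The cache_trim_mode test above boils down to a bit shift: with 4K pages and the default starting priority of 12, the node needs at least 2^12 = 4096 (~16MB) inactive file pages, and no pending file deactivation, before reclaim restricts itself to trimming cache. A sketch with assumed numbers:

/*
 * Userspace sketch of the cache_trim_mode threshold above.  Priority
 * and page size follow the defaults (DEF_PRIORITY == 12, 4K pages);
 * the inactive file count and the deactivation flag are made up.
 */
#include <stdio.h>

int main(void)
{
	int priority = 12;			/* DEF_PRIORITY */
	unsigned long inactive_file = 5000;	/* illustrative page count */
	int deactivate_file = 0;		/* no file refaults observed */

	int cache_trim_mode = (inactive_file >> priority) && !deactivate_file;

	printf("threshold %lu pages (~%lu MB): cache_trim_mode = %d\n",
	       1UL << priority, (1UL << priority) * 4096 >> 20, cache_trim_mode);
	return 0;
}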
53853e2d2   Vlastimil Babka   mm, compaction: d...
2736
  /*
fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2737
2738
2739
   * Returns true if compaction should go ahead for a costly-order request, or
   * the allocation would already succeed without compaction. Return false if we
   * should reclaim first.
53853e2d2   Vlastimil Babka   mm, compaction: d...
2740
   */
4f588331b   Mel Gorman   mm, vmscan: avoid...
2741
  static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
fe4b1b244   Mel Gorman   mm: vmscan: when ...
2742
  {
31483b6ad   Mel Gorman   mm, vmscan: remov...
2743
  	unsigned long watermark;
fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2744
  	enum compact_result suitable;
fe4b1b244   Mel Gorman   mm: vmscan: when ...
2745

fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2746
2747
2748
2749
2750
2751
2752
  	suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
  	if (suitable == COMPACT_SUCCESS)
  		/* Allocation should succeed already. Don't reclaim. */
  		return true;
  	if (suitable == COMPACT_SKIPPED)
  		/* Compaction cannot yet proceed. Do reclaim. */
  		return false;
fe4b1b244   Mel Gorman   mm: vmscan: when ...
2753

53853e2d2   Vlastimil Babka   mm, compaction: d...
2754
  	/*
fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2755
2756
2757
2758
2759
2760
2761
  	 * Compaction is already possible, but it takes time to run and there
  	 * are potentially other callers using the pages just freed. So proceed
  	 * with reclaim to make a buffer of free pages available to give
  	 * compaction a reasonable chance of completing and allocating the page.
  	 * Note that we won't actually reclaim the whole buffer in one attempt
  	 * as the target watermark in should_continue_reclaim() is lower. But if
  	 * we are already above the high+gap watermark, don't reclaim at all.
53853e2d2   Vlastimil Babka   mm, compaction: d...
2762
  	 */
fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2763
  	watermark = high_wmark_pages(zone) + compact_gap(sc->order);
fe4b1b244   Mel Gorman   mm: vmscan: when ...
2764

fdd4c6149   Vlastimil Babka   mm, vmscan: make ...
2765
  	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
fe4b1b244   Mel Gorman   mm: vmscan: when ...
2766
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2767
2768
2769
2770
2771
  /*
   * This is the direct reclaim path, for page-allocating processes.  We only
   * try to reclaim pages from zones which will satisfy the caller's allocation
   * request.
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2772
2773
2774
   * If a zone is deemed to be full of pinned pages then just give it a light
   * scan then give up on it.
   */
0a0337e0d   Michal Hocko   mm, oom: rework o...
2775
  static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2776
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
2777
  	struct zoneref *z;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2778
  	struct zone *zone;
0608f43da   Andrew Morton   revert "memcg, vm...
2779
2780
  	unsigned long nr_soft_reclaimed;
  	unsigned long nr_soft_scanned;
619d0d76c   Weijie Yang   mm/vmscan: restor...
2781
  	gfp_t orig_mask;
79dafcdca   Mel Gorman   mm, vmscan: by de...
2782
  	pg_data_t *last_pgdat = NULL;
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
2783

cc715d99e   Mel Gorman   mm: vmscan: forci...
2784
2785
2786
2787
2788
  	/*
  	 * If the number of buffer_heads in the machine exceeds the maximum
  	 * allowed level, force direct reclaim to scan the highmem zone as
  	 * highmem pages could be pinning lowmem pages storing buffer_heads
  	 */
619d0d76c   Weijie Yang   mm/vmscan: restor...
2789
  	orig_mask = sc->gfp_mask;
b2e18757f   Mel Gorman   mm, vmscan: begin...
2790
  	if (buffer_heads_over_limit) {
cc715d99e   Mel Gorman   mm: vmscan: forci...
2791
  		sc->gfp_mask |= __GFP_HIGHMEM;
4f588331b   Mel Gorman   mm, vmscan: avoid...
2792
  		sc->reclaim_idx = gfp_zone(sc->gfp_mask);
b2e18757f   Mel Gorman   mm, vmscan: begin...
2793
  	}
cc715d99e   Mel Gorman   mm: vmscan: forci...
2794

d4debc66d   Mel Gorman   vmscan: remove un...
2795
  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
b2e18757f   Mel Gorman   mm, vmscan: begin...
2796
  					sc->reclaim_idx, sc->nodemask) {
b2e18757f   Mel Gorman   mm, vmscan: begin...
2797
  		/*
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
2798
2799
2800
  		 * Take care that memory controller reclaiming has only a small
  		 * influence on the global LRU.
  		 */
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
2801
  		if (!cgroup_reclaim(sc)) {
344736f29   Vladimir Davydov   cpuset: simplify ...
2802
2803
  			if (!cpuset_zone_allowed(zone,
  						 GFP_KERNEL | __GFP_HARDWALL))
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
2804
  				continue;
65ec02cb9   Vladimir Davydov   mm: vmscan: move ...
2805

0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
  			/*
  			 * If we already have plenty of memory free for
  			 * compaction in this zone, don't free any more.
  			 * Even though compaction is invoked for any
  			 * non-zero order, only frequent costly order
  			 * reclamation is disruptive enough to become a
  			 * noticeable problem, like transparent huge
  			 * page allocations.
  			 */
  			if (IS_ENABLED(CONFIG_COMPACTION) &&
  			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
4f588331b   Mel Gorman   mm, vmscan: avoid...
2817
  			    compaction_ready(zone, sc)) {
0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2818
2819
  				sc->compaction_ready = true;
  				continue;
e0887c19b   Rik van Riel   vmscan: limit dir...
2820
  			}
0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2821

0608f43da   Andrew Morton   revert "memcg, vm...
2822
  			/*
79dafcdca   Mel Gorman   mm, vmscan: by de...
2823
2824
2825
2826
2827
2828
2829
2830
2831
  			 * Shrink each node in the zonelist once. If the
  			 * zonelist is ordered by zone (not the default) then a
  			 * node may be shrunk multiple times but in that case
  			 * the user prefers lower zones being preserved.
  			 */
  			if (zone->zone_pgdat == last_pgdat)
  				continue;
  
  			/*
0608f43da   Andrew Morton   revert "memcg, vm...
2832
2833
2834
2835
2836
2837
  			 * This steals pages from memory cgroups over softlimit
  			 * and returns the number of reclaimed pages and
  			 * scanned pages. This works for global memory pressure
  			 * and balancing, not for a memcg's limit.
  			 */
  			nr_soft_scanned = 0;
ef8f23279   Mel Gorman   mm, memcg: move m...
2838
  			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
0608f43da   Andrew Morton   revert "memcg, vm...
2839
2840
2841
2842
  						sc->order, sc->gfp_mask,
  						&nr_soft_scanned);
  			sc->nr_reclaimed += nr_soft_reclaimed;
  			sc->nr_scanned += nr_soft_scanned;
ac34a1a3c   KAMEZAWA Hiroyuki   memcg: fix direct...
2843
  			/* need some check to avoid more shrink_node() calls */
1cfb419b3   KAMEZAWA Hiroyuki   per-zone and recl...
2844
  		}
408d85441   Nick Piggin   [PATCH] oom: use ...
2845

79dafcdca   Mel Gorman   mm, vmscan: by de...
2846
2847
2848
2849
  		/* See comment about same check for global reclaim above */
  		if (zone->zone_pgdat == last_pgdat)
  			continue;
  		last_pgdat = zone->zone_pgdat;
970a39a36   Mel Gorman   mm, vmscan: avoid...
2850
  		shrink_node(zone->zone_pgdat, sc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2851
  	}
e0c23279c   Mel Gorman   vmscan: abort rec...
2852

65ec02cb9   Vladimir Davydov   mm: vmscan: move ...
2853
  	/*
619d0d76c   Weijie Yang   mm/vmscan: restor...
2854
2855
2856
2857
  	 * Restore the original mask to avoid the impact on the caller if we
  	 * promoted it to __GFP_HIGHMEM.
  	 */
  	sc->gfp_mask = orig_mask;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2858
  }
4f98a2fee   Rik van Riel   vmscan: split LRU...
2859

b910718a9   Johannes Weiner   mm: vmscan: detec...
2860
  static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2861
  {
b910718a9   Johannes Weiner   mm: vmscan: detec...
2862
2863
  	struct lruvec *target_lruvec;
  	unsigned long refaults;
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2864

b910718a9   Johannes Weiner   mm: vmscan: detec...
2865
  	target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
170b04b7a   Joonsoo Kim   mm/workingset: pr...
2866
2867
2868
2869
  	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
  	target_lruvec->refaults[0] = refaults;
  	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
  	target_lruvec->refaults[1] = refaults;
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2870
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2871
2872
2873
2874
2875
2876
2877
2878
  /*
   * This is the main entry point to direct page reclaim.
   *
   * If a full scan of the inactive list fails to free enough memory then we
   * are "out of memory" and something needs to be killed.
   *
   * If the caller is !__GFP_FS then the probability of a failure is reasonably
   * high - the zone may be full of dirty or under-writeback pages, which this
5b0830cb9   Jens Axboe   writeback: get ri...
2879
2880
2881
2882
   * caller can't do much about.  We kick the writeback threads and take explicit
   * naps in the hope that some of these pages can be written.  But if the
   * allocating task holds filesystem locks which prevent writeout this might not
   * work, and the allocation attempt will fail.
a41f24ea9   Nishanth Aravamudan   page allocator: s...
2883
2884
2885
   *
   * returns:	0, if no pages reclaimed
   * 		else, the number of pages reclaimed
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2886
   */
dac1d27bc   Mel Gorman   mm: use zonelists...
2887
  static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
3115cd914   Vladimir Davydov   mm: vmscan: remov...
2888
  					  struct scan_control *sc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2889
  {
241994ed8   Johannes Weiner   mm: memcontrol: d...
2890
  	int initial_priority = sc->priority;
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2891
2892
2893
  	pg_data_t *last_pgdat;
  	struct zoneref *z;
  	struct zone *zone;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2894
  retry:
873b47717   Keika Kobayashi   per-task-delay-ac...
2895
  	delayacct_freepages_start();
b5ead35e7   Johannes Weiner   mm: vmscan: namin...
2896
  	if (!cgroup_reclaim(sc))
7cc30fcfd   Mel Gorman   mm: vmstat: accou...
2897
  		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2898

9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
2899
  	do {
70ddf637e   Anton Vorontsov   memcg: add memory...
2900
2901
  		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
  				sc->priority);
66e1707bc   Balbir Singh   Memory controller...
2902
  		sc->nr_scanned = 0;
0a0337e0d   Michal Hocko   mm, oom: rework o...
2903
  		shrink_zones(zonelist, sc);
c6a8a8c58   KOSAKI Motohiro   vmscan: recalcula...
2904

bb21c7ce1   KOSAKI Motohiro   vmscan: fix do_tr...
2905
  		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2906
2907
2908
2909
  			break;
  
  		if (sc->compaction_ready)
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2910
2911
  
  		/*
0e50ce3b5   Minchan Kim   mm: use up free s...
2912
2913
2914
2915
2916
  		 * If we're getting trouble reclaiming, start doing
  		 * writepage even in laptop mode.
  		 */
  		if (sc->priority < DEF_PRIORITY - 2)
  			sc->may_writepage = 1;
0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2917
  	} while (--sc->priority >= 0);
bb21c7ce1   KOSAKI Motohiro   vmscan: fix do_tr...
2918

2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2919
2920
2921
2922
2923
2924
  	last_pgdat = NULL;
  	for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
  					sc->nodemask) {
  		if (zone->zone_pgdat == last_pgdat)
  			continue;
  		last_pgdat = zone->zone_pgdat;
1b05117df   Johannes Weiner   mm: vmscan: harmo...
2925

2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2926
  		snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
1b05117df   Johannes Weiner   mm: vmscan: harmo...
2927
2928
2929
2930
2931
2932
2933
2934
  
  		if (cgroup_reclaim(sc)) {
  			struct lruvec *lruvec;
  
  			lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
  						   zone->zone_pgdat);
  			clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
  		}
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
2935
  	}
873b47717   Keika Kobayashi   per-task-delay-ac...
2936
  	delayacct_freepages_end();
bb21c7ce1   KOSAKI Motohiro   vmscan: fix do_tr...
2937
2938
  	if (sc->nr_reclaimed)
  		return sc->nr_reclaimed;
0cee34fd7   Mel Gorman   mm: vmscan: check...
2939
  	/* Aborted reclaim to try compaction? don't OOM, then */
0b06496a3   Johannes Weiner   mm: vmscan: rewor...
2940
  	if (sc->compaction_ready)
7335084d4   Mel Gorman   mm: vmscan: do no...
2941
  		return 1;
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
  	/*
  	 * We make inactive:active ratio decisions based on the node's
  	 * composition of memory, but a restrictive reclaim_idx or a
  	 * memory.low cgroup setting can exempt large amounts of
  	 * memory from reclaim. Neither of these is very common, so
  	 * instead of doing costly eligibility calculations of the
  	 * entire cgroup subtree up front, we assume the estimates are
  	 * good, and retry with forcible deactivation if that fails.
  	 */
  	if (sc->skipped_deactivate) {
  		sc->priority = initial_priority;
  		sc->force_deactivate = 1;
  		sc->skipped_deactivate = 0;
  		goto retry;
  	}
241994ed8   Johannes Weiner   mm: memcontrol: d...
2957
  	/* Untapped cgroup reserves?  Don't OOM, retry. */
d6622f636   Yisheng Xie   mm/vmscan: more r...
2958
  	if (sc->memcg_low_skipped) {
241994ed8   Johannes Weiner   mm: memcontrol: d...
2959
  		sc->priority = initial_priority;
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
2960
  		sc->force_deactivate = 0;
d6622f636   Yisheng Xie   mm/vmscan: more r...
2961
2962
  		sc->memcg_low_reclaim = 1;
  		sc->memcg_low_skipped = 0;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2963
2964
  		goto retry;
  	}
bb21c7ce1   KOSAKI Motohiro   vmscan: fix do_tr...
2965
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2966
  }
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
2967
  static bool allow_direct_reclaim(pg_data_t *pgdat)
5515061d2   Mel Gorman   mm: throttle dire...
2968
2969
2970
2971
2972
2973
  {
  	struct zone *zone;
  	unsigned long pfmemalloc_reserve = 0;
  	unsigned long free_pages = 0;
  	int i;
  	bool wmark_ok;
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
2974
2975
  	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
  		return true;
5515061d2   Mel Gorman   mm: throttle dire...
2976
2977
  	for (i = 0; i <= ZONE_NORMAL; i++) {
  		zone = &pgdat->node_zones[i];
d450abd81   Johannes Weiner   mm: fix check for...
2978
2979
2980
2981
  		if (!managed_zone(zone))
  			continue;
  
  		if (!zone_reclaimable_pages(zone))
675becce1   Mel Gorman   mm: vmscan: do no...
2982
  			continue;
5515061d2   Mel Gorman   mm: throttle dire...
2983
2984
2985
  		pfmemalloc_reserve += min_wmark_pages(zone);
  		free_pages += zone_page_state(zone, NR_FREE_PAGES);
  	}
675becce1   Mel Gorman   mm: vmscan: do no...
2986
2987
2988
  	/* If there are no reserves (unexpected config) then do not throttle */
  	if (!pfmemalloc_reserve)
  		return true;
5515061d2   Mel Gorman   mm: throttle dire...
2989
2990
2991
2992
  	wmark_ok = free_pages > pfmemalloc_reserve / 2;
  
  	/* kswapd must be awake if processes are being throttled */
  	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
97a225e69   Joonsoo Kim   mm/page_alloc: in...
2993
2994
  		if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
  			WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
5644e1fbb   Qian Cai   mm/vmscan.c: fix ...
2995

5515061d2   Mel Gorman   mm: throttle dire...
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
  		wake_up_interruptible(&pgdat->kswapd_wait);
  	}
  
  	return wmark_ok;
  }
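
  /*
   * Worked example for the check above, with made-up numbers (not taken
   * from any real configuration): if ZONE_DMA32 has min_wmark_pages() of
   * 128 and ZONE_NORMAL has min_wmark_pages() of 1024, pfmemalloc_reserve
   * is 1152 pages, and direct reclaimers start being throttled once
   * NR_FREE_PAGES summed over those zones drops to 1152 / 2 = 576 pages
   * or below. If kswapd has already failed MAX_RECLAIM_RETRIES times on
   * this node, throttling is skipped entirely and reclaimers proceed.
   */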
  
  /*
   * Throttle direct reclaimers if backing storage is backed by the network
   * and the PFMEMALLOC reserve for the preferred node is getting dangerously
   * depleted. kswapd will continue to make progress and wake the processes
50694c28f   Mel Gorman   mm: vmscan: check...
3006
3007
3008
3009
   * when the low watermark is reached.
   *
   * Returns true if a fatal signal was delivered during throttling. If this
   * happens, the page allocator should not consider triggering the OOM killer.
5515061d2   Mel Gorman   mm: throttle dire...
3010
   */
50694c28f   Mel Gorman   mm: vmscan: check...
3011
  static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
5515061d2   Mel Gorman   mm: throttle dire...
3012
3013
  					nodemask_t *nodemask)
  {
675becce1   Mel Gorman   mm: vmscan: do no...
3014
  	struct zoneref *z;
5515061d2   Mel Gorman   mm: throttle dire...
3015
  	struct zone *zone;
675becce1   Mel Gorman   mm: vmscan: do no...
3016
  	pg_data_t *pgdat = NULL;
5515061d2   Mel Gorman   mm: throttle dire...
3017
3018
3019
3020
3021
3022
3023
3024
3025
  
  	/*
  	 * Kernel threads should not be throttled as they may be indirectly
  	 * responsible for cleaning pages necessary for reclaim to make forward
  	 * progress. kjournald for example may enter direct reclaim while
  	 * committing a transaction where throttling it could force other
  	 * processes to block on log_wait_commit().
  	 */
  	if (current->flags & PF_KTHREAD)
50694c28f   Mel Gorman   mm: vmscan: check...
3026
3027
3028
3029
3030
3031
3032
3033
  		goto out;
  
  	/*
  	 * If a fatal signal is pending, this process should not throttle.
  	 * It should return quickly so it can exit and free its memory
  	 */
  	if (fatal_signal_pending(current))
  		goto out;
5515061d2   Mel Gorman   mm: throttle dire...
3034

675becce1   Mel Gorman   mm: vmscan: do no...
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
  	/*
  	 * Check if the pfmemalloc reserves are ok by finding the first node
  	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
  	 * GFP_KERNEL will be required for allocating network buffers when
  	 * swapping over the network so ZONE_HIGHMEM is unusable.
  	 *
  	 * Throttling is based on the first usable node and throttled processes
  	 * wait on a queue until kswapd makes progress and wakes them. There
  	 * is an affinity then between processes waking up and where reclaim
  	 * progress has been made assuming the process wakes on the same node.
  	 * More importantly, processes running on remote nodes will not compete
  	 * for remote pfmemalloc reserves and processes on different nodes
  	 * should make reasonable progress.
  	 */
  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
17636faad   Michael S. Tsirkin   mm/vmscan: fix hi...
3050
  					gfp_zone(gfp_mask), nodemask) {
675becce1   Mel Gorman   mm: vmscan: do no...
3051
3052
3053
3054
3055
  		if (zone_idx(zone) > ZONE_NORMAL)
  			continue;
  
  		/* Throttle based on the first usable node */
  		pgdat = zone->zone_pgdat;
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3056
  		if (allow_direct_reclaim(pgdat))
675becce1   Mel Gorman   mm: vmscan: do no...
3057
3058
3059
3060
3061
3062
  			goto out;
  		break;
  	}
  
  	/* If no zone was usable by the allocation flags then do not throttle */
  	if (!pgdat)
50694c28f   Mel Gorman   mm: vmscan: check...
3063
  		goto out;
5515061d2   Mel Gorman   mm: throttle dire...
3064

68243e76e   Mel Gorman   mm: account for t...
3065
3066
  	/* Account for the throttling */
  	count_vm_event(PGSCAN_DIRECT_THROTTLE);
5515061d2   Mel Gorman   mm: throttle dire...
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
  	/*
  	 * If the caller cannot enter the filesystem, it's possible that it
  	 * is due to the caller holding an FS lock or performing a journal
  	 * transaction in the case of a filesystem like ext[3|4]. In this case,
  	 * it is not safe to block on pfmemalloc_wait as kswapd could be
  	 * blocked waiting on the same lock. Instead, throttle for up to a
  	 * second before continuing.
  	 */
  	if (!(gfp_mask & __GFP_FS)) {
  		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3077
  			allow_direct_reclaim(pgdat), HZ);
50694c28f   Mel Gorman   mm: vmscan: check...
3078
3079
  
  		goto check_pending;
5515061d2   Mel Gorman   mm: throttle dire...
3080
3081
3082
3083
  	}
  
  	/* Throttle until kswapd wakes the process */
  	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3084
  		allow_direct_reclaim(pgdat));
50694c28f   Mel Gorman   mm: vmscan: check...
3085
3086
3087
3088
3089
3090
3091
  
  check_pending:
  	if (fatal_signal_pending(current))
  		return true;
  
  out:
  	return false;
5515061d2   Mel Gorman   mm: throttle dire...
3092
  }
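
  /*
   * Condensed sketch of the decision tree above (an illustration of the
   * existing logic, not additional kernel code):
   *
   *   kernel thread or fatal signal pending          -> do not throttle
   *   first usable node passes allow_direct_reclaim() -> do not throttle
   *   !(gfp_mask & __GFP_FS)  -> sleep on pfmemalloc_wait for up to HZ
   *   otherwise               -> sleep until kswapd restores the reserves
   *
   * The only way this returns true is a fatal signal arriving while the
   * task was throttled, which the caller treats as "skip reclaim, do not
   * trigger the OOM killer".
   */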
dac1d27bc   Mel Gorman   mm: use zonelists...
3093
  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
327c0e968   KAMEZAWA Hiroyuki   vmscan: fix it to...
3094
  				gfp_t gfp_mask, nodemask_t *nodemask)
66e1707bc   Balbir Singh   Memory controller...
3095
  {
33906bc5c   Mel Gorman   vmscan: tracing: ...
3096
  	unsigned long nr_reclaimed;
66e1707bc   Balbir Singh   Memory controller...
3097
  	struct scan_control sc = {
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3098
  		.nr_to_reclaim = SWAP_CLUSTER_MAX,
f2f43e566   Nick Desaulniers   mm/vmscan.c: fix ...
3099
  		.gfp_mask = current_gfp_context(gfp_mask),
b2e18757f   Mel Gorman   mm, vmscan: begin...
3100
  		.reclaim_idx = gfp_zone(gfp_mask),
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3101
3102
3103
  		.order = order,
  		.nodemask = nodemask,
  		.priority = DEF_PRIORITY,
66e1707bc   Balbir Singh   Memory controller...
3104
  		.may_writepage = !laptop_mode,
a6dc60f89   Johannes Weiner   vmscan: rename sc...
3105
  		.may_unmap = 1,
2e2e42598   KOSAKI Motohiro   vmscan,memcg: rei...
3106
  		.may_swap = 1,
66e1707bc   Balbir Singh   Memory controller...
3107
  	};
5515061d2   Mel Gorman   mm: throttle dire...
3108
  	/*
bb451fdf3   Greg Thelen   mm/vmscan.c: cond...
3109
3110
3111
3112
3113
3114
3115
3116
  	 * scan_control uses s8 fields for order, priority, and reclaim_idx.
  	 * Confirm they are large enough for max values.
  	 */
  	BUILD_BUG_ON(MAX_ORDER > S8_MAX);
  	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
  	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
  
  	/*
50694c28f   Mel Gorman   mm: vmscan: check...
3117
3118
3119
  	 * Do not enter reclaim if a fatal signal was delivered while throttled.
  	 * 1 is returned so that the page allocator does not OOM kill at this
  	 * point.
5515061d2   Mel Gorman   mm: throttle dire...
3120
  	 */
f2f43e566   Nick Desaulniers   mm/vmscan.c: fix ...
3121
  	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
5515061d2   Mel Gorman   mm: throttle dire...
3122
  		return 1;
1732d2b01   Andrew Morton   mm/vmscan.c: add ...
3123
  	set_task_reclaim_state(current, &sc.reclaim_state);
3481c37ff   Yafang Shao   mm/vmscan: drop m...
3124
  	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
33906bc5c   Mel Gorman   vmscan: tracing: ...
3125

3115cd914   Vladimir Davydov   mm: vmscan: remov...
3126
  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
33906bc5c   Mel Gorman   vmscan: tracing: ...
3127
3128
  
  	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
1732d2b01   Andrew Morton   mm/vmscan.c: add ...
3129
  	set_task_reclaim_state(current, NULL);
33906bc5c   Mel Gorman   vmscan: tracing: ...
3130
3131
  
  	return nr_reclaimed;
66e1707bc   Balbir Singh   Memory controller...
3132
  }
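
  /*
   * Rough shape of how the return value is consumed (a sketch; the real
   * caller is the allocator slow path in mm/page_alloc.c):
   *
   *   progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
   *   if (progress)
   *           retry the allocation;
   *   else
   *           consider compaction and, eventually, the OOM killer;
   *
   * Returning 1 after a fatal signal therefore still counts as progress
   * and keeps the OOM killer out of the picture for this attempt.
   */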
c255a4580   Andrew Morton   memcg: rename con...
3133
  #ifdef CONFIG_MEMCG
66e1707bc   Balbir Singh   Memory controller...
3134

d2e5fb927   Michal Hocko   mm, memcg: do not...
3135
  /* Only used by soft limit reclaim. Do not reuse for anything else. */
a9dd0a831   Mel Gorman   mm, vmscan: make ...
3136
  unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
4e4169535   Balbir Singh   memory controller...
3137
  						gfp_t gfp_mask, bool noswap,
ef8f23279   Mel Gorman   mm, memcg: move m...
3138
  						pg_data_t *pgdat,
0ae5e89c6   Ying Han   memcg: count the ...
3139
  						unsigned long *nr_scanned)
4e4169535   Balbir Singh   memory controller...
3140
  {
afaf07a65   Johannes Weiner   mm: vmscan: turn ...
3141
  	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4e4169535   Balbir Singh   memory controller...
3142
  	struct scan_control sc = {
b8f5c5664   KOSAKI Motohiro   memcg: sc.nr_to_r...
3143
  		.nr_to_reclaim = SWAP_CLUSTER_MAX,
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3144
  		.target_mem_cgroup = memcg,
4e4169535   Balbir Singh   memory controller...
3145
3146
  		.may_writepage = !laptop_mode,
  		.may_unmap = 1,
b2e18757f   Mel Gorman   mm, vmscan: begin...
3147
  		.reclaim_idx = MAX_NR_ZONES - 1,
4e4169535   Balbir Singh   memory controller...
3148
  		.may_swap = !noswap,
4e4169535   Balbir Singh   memory controller...
3149
  	};
0ae5e89c6   Ying Han   memcg: count the ...
3150

d2e5fb927   Michal Hocko   mm, memcg: do not...
3151
  	WARN_ON_ONCE(!current->reclaim_state);
4e4169535   Balbir Singh   memory controller...
3152
3153
  	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
  			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3154

9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
3155
  	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3481c37ff   Yafang Shao   mm/vmscan: drop m...
3156
  						      sc.gfp_mask);
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3157

4e4169535   Balbir Singh   memory controller...
3158
3159
3160
  	/*
  	 * NOTE: Although we can get the priority field, using it
  	 * here is not a good idea, since it limits the pages we can scan.
a9dd0a831   Mel Gorman   mm, vmscan: make ...
3161
  	 * if we don't reclaim here, the shrink_node from balance_pgdat
4e4169535   Balbir Singh   memory controller...
3162
3163
3164
  	 * will pick up pages from other mem cgroup's as well. We hack
  	 * the priority and make it zero.
  	 */
afaf07a65   Johannes Weiner   mm: vmscan: turn ...
3165
  	shrink_lruvec(lruvec, &sc);
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3166
3167
  
  	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
0ae5e89c6   Ying Han   memcg: count the ...
3168
  	*nr_scanned = sc.nr_scanned;
0308f7cf1   Yafang Shao   mm/vmscan.c: calc...
3169

4e4169535   Balbir Singh   memory controller...
3170
3171
  	return sc.nr_reclaimed;
  }
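
  /*
   * Note on the gfp_mask construction above: the reclaim-behaviour bits
   * (GFP_RECLAIM_MASK) follow the caller, while the remaining bits are
   * taken from GFP_HIGHUSER_MOVABLE. Combined with reclaim_idx being
   * MAX_NR_ZONES - 1, the intent appears to be that soft limit reclaim
   * is as aggressive as the caller allows but is not restricted to the
   * zones of the triggering allocation.
   */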
72835c86c   Johannes Weiner   mm: unify remaini...
3172
  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
3173
  					   unsigned long nr_pages,
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3174
  					   gfp_t gfp_mask,
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
3175
  					   bool may_swap)
66e1707bc   Balbir Singh   Memory controller...
3176
  {
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3177
  	unsigned long nr_reclaimed;
499118e96   Vlastimil Babka   mm: introduce mem...
3178
  	unsigned int noreclaim_flag;
66e1707bc   Balbir Singh   Memory controller...
3179
  	struct scan_control sc = {
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
3180
  		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
7dea19f9e   Michal Hocko   mm: introduce mem...
3181
  		.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
a09ed5e00   Ying Han   vmscan: change sh...
3182
  				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
b2e18757f   Mel Gorman   mm, vmscan: begin...
3183
  		.reclaim_idx = MAX_NR_ZONES - 1,
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3184
3185
3186
3187
  		.target_mem_cgroup = memcg,
  		.priority = DEF_PRIORITY,
  		.may_writepage = !laptop_mode,
  		.may_unmap = 1,
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
3188
  		.may_swap = may_swap,
a09ed5e00   Ying Han   vmscan: change sh...
3189
  	};
889976dbc   Ying Han   memcg: reclaim me...
3190
  	/*
fa40d1ee9   Shakeel Butt   mm: vmscan: memco...
3191
3192
3193
  	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
  	 * equal pressure on all the nodes. This is based on the assumption that
  	 * the reclaim does not bail out early.
889976dbc   Ying Han   memcg: reclaim me...
3194
  	 */
fa40d1ee9   Shakeel Butt   mm: vmscan: memco...
3195
  	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
889976dbc   Ying Han   memcg: reclaim me...
3196

fa40d1ee9   Shakeel Butt   mm: vmscan: memco...
3197
  	set_task_reclaim_state(current, &sc.reclaim_state);
3481c37ff   Yafang Shao   mm/vmscan: drop m...
3198
  	trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
499118e96   Vlastimil Babka   mm: introduce mem...
3199
  	noreclaim_flag = memalloc_noreclaim_save();
eb414681d   Johannes Weiner   psi: pressure sta...
3200

3115cd914   Vladimir Davydov   mm: vmscan: remov...
3201
  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
eb414681d   Johannes Weiner   psi: pressure sta...
3202

499118e96   Vlastimil Babka   mm: introduce mem...
3203
  	memalloc_noreclaim_restore(noreclaim_flag);
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3204
  	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
1732d2b01   Andrew Morton   mm/vmscan.c: add ...
3205
  	set_task_reclaim_state(current, NULL);
bdce6d9eb   KOSAKI Motohiro   memcg, vmscan: ad...
3206
3207
  
  	return nr_reclaimed;
66e1707bc   Balbir Singh   Memory controller...
3208
3209
  }
  #endif
1d82de618   Mel Gorman   mm, vmscan: make ...
3210
  static void age_active_anon(struct pglist_data *pgdat,
ef8f23279   Mel Gorman   mm, memcg: move m...
3211
  				struct scan_control *sc)
f16015fbf   Johannes Weiner   mm: vmscan: disti...
3212
  {
b95a2f2d4   Johannes Weiner   mm: vmscan: conve...
3213
  	struct mem_cgroup *memcg;
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
3214
  	struct lruvec *lruvec;
f16015fbf   Johannes Weiner   mm: vmscan: disti...
3215

b95a2f2d4   Johannes Weiner   mm: vmscan: conve...
3216
3217
  	if (!total_swap_pages)
  		return;
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
3218
3219
3220
  	lruvec = mem_cgroup_lruvec(NULL, pgdat);
  	if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
  		return;
b95a2f2d4   Johannes Weiner   mm: vmscan: conve...
3221
3222
  	memcg = mem_cgroup_iter(NULL, NULL, NULL);
  	do {
b91ac3743   Johannes Weiner   mm: vmscan: enfor...
3223
3224
3225
  		lruvec = mem_cgroup_lruvec(memcg, pgdat);
  		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
  				   sc, LRU_ACTIVE_ANON);
b95a2f2d4   Johannes Weiner   mm: vmscan: conve...
3226
3227
  		memcg = mem_cgroup_iter(NULL, memcg, NULL);
  	} while (memcg);
f16015fbf   Johannes Weiner   mm: vmscan: disti...
3228
  }
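
  /*
   * The loop above is the standard whole-hierarchy walk:
   *
   *   memcg = mem_cgroup_iter(NULL, NULL, NULL);
   *   do {
   *           ... shrink_active_list() on mem_cgroup_lruvec(memcg, pgdat) ...
   *           memcg = mem_cgroup_iter(NULL, memcg, NULL);
   *   } while (memcg);
   *
   * so every cgroup on this node gets a little background deactivation,
   * not just the root -- but only while swap is configured and the
   * node-wide inactive anon list is low relative to the active one.
   */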
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3229
  static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
1c30844d2   Mel Gorman   mm: reclaim small...
3230
3231
3232
3233
3234
3235
3236
  {
  	int i;
  	struct zone *zone;
  
  	/*
  	 * Check for watermark boosts top-down as the higher zones
  	 * are more likely to be boosted. Both watermarks and boosts
1eba09c15   Randy Dunlap   mm/vmscan.c: dele...
3237
  	 * should not be checked at the same time as reclaim would
1c30844d2   Mel Gorman   mm: reclaim small...
3238
3239
3240
  	 * start prematurely when there is no boosting and a lower
  	 * zone is balanced.
  	 */
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3241
  	for (i = highest_zoneidx; i >= 0; i--) {
1c30844d2   Mel Gorman   mm: reclaim small...
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
  		zone = pgdat->node_zones + i;
  		if (!managed_zone(zone))
  			continue;
  
  		if (zone->watermark_boost)
  			return true;
  	}
  
  	return false;
  }
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3252
3253
  /*
   * Returns true if there is an eligible zone balanced for the request order
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3254
   * and highest_zoneidx
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3255
   */
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3256
  static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
60cefed48   Johannes Weiner   mm: vmscan: fix e...
3257
  {
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3258
3259
3260
  	int i;
  	unsigned long mark = -1;
  	struct zone *zone;
60cefed48   Johannes Weiner   mm: vmscan: fix e...
3261

1c30844d2   Mel Gorman   mm: reclaim small...
3262
3263
3264
3265
  	/*
  	 * Check watermarks bottom-up as lower zones are more likely to
  	 * meet watermarks.
  	 */
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3266
  	for (i = 0; i <= highest_zoneidx; i++) {
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3267
  		zone = pgdat->node_zones + i;
6256c6b49   Mel Gorman   mm, vmscan: remov...
3268

e716f2eb2   Mel Gorman   mm, vmscan: preve...
3269
3270
3271
3272
  		if (!managed_zone(zone))
  			continue;
  
  		mark = high_wmark_pages(zone);
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3273
  		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3274
3275
3276
3277
  			return true;
  	}
  
  	/*
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3278
  	 * If a node has no populated zone within highest_zoneidx, it does not
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3279
3280
3281
3282
3283
3284
3285
  	 * need balancing by definition. This can happen if a zone-restricted
  	 * allocation tries to wake a remote kswapd.
  	 */
  	if (mark == -1)
  		return true;
  
  	return false;
60cefed48   Johannes Weiner   mm: vmscan: fix e...
3286
  }
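
  /*
   * Example of what "balanced" means here, with hypothetical numbers:
   * for an order-3 wakeup with highest_zoneidx == ZONE_NORMAL, the node
   * counts as balanced as soon as *one* managed zone at or below
   * ZONE_NORMAL passes zone_watermark_ok_safe() against its high
   * watermark at order 3, even if other zones are still short. Only when
   * no managed zone exists in that range at all (mark stays at -1) is
   * the node reported balanced unconditionally.
   */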
631b6e083   Mel Gorman   mm, vmscan: only ...
3287
3288
3289
  /* Clear pgdat state for congested, dirty or under writeback. */
  static void clear_pgdat_congested(pg_data_t *pgdat)
  {
1b05117df   Johannes Weiner   mm: vmscan: harmo...
3290
3291
3292
  	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
  
  	clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
631b6e083   Mel Gorman   mm, vmscan: only ...
3293
3294
3295
  	clear_bit(PGDAT_DIRTY, &pgdat->flags);
  	clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
  }
1741c8775   Mel Gorman   mm: kswapd: keep ...
3296
  /*
5515061d2   Mel Gorman   mm: throttle dire...
3297
3298
3299
3300
3301
   * Prepare kswapd for sleeping. This verifies that there are no processes
   * waiting in throttle_direct_reclaim() and that watermarks have been met.
   *
   * Returns true if kswapd is ready to sleep
   */
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3302
3303
  static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
  				int highest_zoneidx)
f50de2d38   Mel Gorman   vmscan: have kswa...
3304
  {
5515061d2   Mel Gorman   mm: throttle dire...
3305
  	/*
9e5e36617   Vlastimil Babka   mm, vmscan: preve...
3306
  	 * The throttled processes are normally woken up in balance_pgdat() as
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3307
  	 * soon as allow_direct_reclaim() is true. But there is a potential
9e5e36617   Vlastimil Babka   mm, vmscan: preve...
3308
3309
3310
3311
3312
3313
3314
3315
3316
  	 * race between when kswapd checks the watermarks and a process gets
  	 * throttled. There is also a potential race if processes get
  	 * throttled, kswapd wakes, a large process exits thereby balancing the
  	 * zones, which causes kswapd to exit balance_pgdat() before reaching
  	 * the wake up checks. If kswapd is going to sleep, no process should
  	 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
  	 * the wake up is premature, processes will wake kswapd and get
  	 * throttled again. The difference from wake ups in balance_pgdat() is
  	 * that here we are under prepare_to_wait().
5515061d2   Mel Gorman   mm: throttle dire...
3317
  	 */
9e5e36617   Vlastimil Babka   mm, vmscan: preve...
3318
3319
  	if (waitqueue_active(&pgdat->pfmemalloc_wait))
  		wake_up_all(&pgdat->pfmemalloc_wait);
f50de2d38   Mel Gorman   vmscan: have kswa...
3320

c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3321
3322
3323
  	/* Hopeless node, leave it to direct reclaim */
  	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
  		return true;
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3324
  	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3325
3326
  		clear_pgdat_congested(pgdat);
  		return true;
1d82de618   Mel Gorman   mm, vmscan: make ...
3327
  	}
333b0a459   Shantanu Goel   mm, vmscan: fix z...
3328
  	return false;
f50de2d38   Mel Gorman   vmscan: have kswa...
3329
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3330
  /*
1d82de618   Mel Gorman   mm, vmscan: make ...
3331
3332
   * kswapd shrinks a node of pages that are at or below the highest usable
   * zone that is currently unbalanced.
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3333
3334
   *
   * Returns true if kswapd scanned at least the requested number of pages to
283aba9f9   Mel Gorman   mm: vmscan: block...
3335
3336
   * reclaim or if the lack of progress was due to pages under writeback.
   * This is used to determine if the scanning priority needs to be raised.
75485363c   Mel Gorman   mm: vmscan: limit...
3337
   */
1d82de618   Mel Gorman   mm, vmscan: make ...
3338
  static bool kswapd_shrink_node(pg_data_t *pgdat,
accf62422   Vlastimil Babka   mm, kswapd: repla...
3339
  			       struct scan_control *sc)
75485363c   Mel Gorman   mm: vmscan: limit...
3340
  {
1d82de618   Mel Gorman   mm, vmscan: make ...
3341
3342
  	struct zone *zone;
  	int z;
75485363c   Mel Gorman   mm: vmscan: limit...
3343

1d82de618   Mel Gorman   mm, vmscan: make ...
3344
3345
  	/* Reclaim a number of pages proportional to the number of zones */
  	sc->nr_to_reclaim = 0;
970a39a36   Mel Gorman   mm, vmscan: avoid...
3346
  	for (z = 0; z <= sc->reclaim_idx; z++) {
1d82de618   Mel Gorman   mm, vmscan: make ...
3347
  		zone = pgdat->node_zones + z;
6aa303def   Mel Gorman   mm, vmscan: only ...
3348
  		if (!managed_zone(zone))
1d82de618   Mel Gorman   mm, vmscan: make ...
3349
  			continue;
7c954f6de   Mel Gorman   mm: vmscan: move ...
3350

1d82de618   Mel Gorman   mm, vmscan: make ...
3351
3352
  		sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
  	}
7c954f6de   Mel Gorman   mm: vmscan: move ...
3353
3354
  
  	/*
1d82de618   Mel Gorman   mm, vmscan: make ...
3355
3356
  	 * Historically care was taken to put equal pressure on all zones but
  	 * now pressure is applied based on node LRU order.
7c954f6de   Mel Gorman   mm: vmscan: move ...
3357
  	 */
970a39a36   Mel Gorman   mm, vmscan: avoid...
3358
  	shrink_node(pgdat, sc);
283aba9f9   Mel Gorman   mm: vmscan: block...
3359

7c954f6de   Mel Gorman   mm: vmscan: move ...
3360
  	/*
1d82de618   Mel Gorman   mm, vmscan: make ...
3361
3362
3363
3364
3365
  	 * Fragmentation may mean that the system cannot be rebalanced for
  	 * high-order allocations. If twice the allocation size has been
  	 * reclaimed then recheck watermarks only at order-0 to prevent
  	 * excessive reclaim. Assume that a process that requested a high-order
  	 * allocation can direct reclaim/compact.
7c954f6de   Mel Gorman   mm: vmscan: move ...
3366
  	 */
9861a62c3   Vlastimil Babka   mm, compaction: c...
3367
  	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
1d82de618   Mel Gorman   mm, vmscan: make ...
3368
  		sc->order = 0;
7c954f6de   Mel Gorman   mm: vmscan: move ...
3369

b8e83b942   Mel Gorman   mm: vmscan: flatt...
3370
  	return sc->nr_scanned >= sc->nr_to_reclaim;
75485363c   Mel Gorman   mm: vmscan: limit...
3371
3372
3373
  }
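
  /*
   * Illustration of the reclaim target above (numbers are hypothetical):
   * with reclaim_idx == ZONE_NORMAL and high watermarks of 256 (DMA32)
   * and 2048 (NORMAL) pages, sc->nr_to_reclaim becomes 2304, i.e. kswapd
   * aims at the sum of the eligible zones' high watermarks rather than a
   * fixed SWAP_CLUSTER_MAX batch. For a high-order request, once roughly
   * twice the allocation size (compact_gap(order)) has been reclaimed,
   * sc->order is reset to 0 so further balancing is judged by order-0
   * watermarks only, as the comment above explains.
   */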
  
  /*
1d82de618   Mel Gorman   mm, vmscan: make ...
3374
3375
3376
   * For kswapd, balance_pgdat() will reclaim pages across a node from zones
   * that are eligible for use by the caller until at least one zone is
   * balanced.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3377
   *
1d82de618   Mel Gorman   mm, vmscan: make ...
3378
   * Returns the order kswapd finished reclaiming at.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3379
3380
   *
   * kswapd scans the zones in the highmem->normal->dma direction.  It skips
418589663   Mel Gorman   page allocator: u...
3381
   * zones which have free_pages > high_wmark_pages(zone), but once a zone is
8bb4e7a2e   Wei Yang   mm: fix some typo...
3382
   * found to have free_pages <= high_wmark_pages(zone), any page in that zone
1d82de618   Mel Gorman   mm, vmscan: make ...
3383
3384
   * or lower is eligible for reclaim until at least one usable zone is
   * balanced.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3385
   */
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3386
  static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3387
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3388
  	int i;
0608f43da   Andrew Morton   revert "memcg, vm...
3389
3390
  	unsigned long nr_soft_reclaimed;
  	unsigned long nr_soft_scanned;
eb414681d   Johannes Weiner   psi: pressure sta...
3391
  	unsigned long pflags;
1c30844d2   Mel Gorman   mm: reclaim small...
3392
3393
3394
  	unsigned long nr_boost_reclaim;
  	unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
  	bool boosted;
1d82de618   Mel Gorman   mm, vmscan: make ...
3395
  	struct zone *zone;
179e96395   Andrew Morton   [PATCH] vmscan: s...
3396
3397
  	struct scan_control sc = {
  		.gfp_mask = GFP_KERNEL,
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3398
  		.order = order,
a6dc60f89   Johannes Weiner   vmscan: rename sc...
3399
  		.may_unmap = 1,
179e96395   Andrew Morton   [PATCH] vmscan: s...
3400
  	};
93781325d   Omar Sandoval   lockdep: fix fs_r...
3401

1732d2b01   Andrew Morton   mm/vmscan.c: add ...
3402
  	set_task_reclaim_state(current, &sc.reclaim_state);
eb414681d   Johannes Weiner   psi: pressure sta...
3403
  	psi_memstall_enter(&pflags);
93781325d   Omar Sandoval   lockdep: fix fs_r...
3404
  	__fs_reclaim_acquire();
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
3405
  	count_vm_event(PAGEOUTRUN);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3406

1c30844d2   Mel Gorman   mm: reclaim small...
3407
3408
3409
3410
3411
3412
  	/*
  	 * Account for the reclaim boost. Note that the zone boost is left in
  	 * place so that parallel allocations that are near the watermark will
  	 * stall or direct reclaim until kswapd is finished.
  	 */
  	nr_boost_reclaim = 0;
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3413
  	for (i = 0; i <= highest_zoneidx; i++) {
1c30844d2   Mel Gorman   mm: reclaim small...
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
  		zone = pgdat->node_zones + i;
  		if (!managed_zone(zone))
  			continue;
  
  		nr_boost_reclaim += zone->watermark_boost;
  		zone_boosts[i] = zone->watermark_boost;
  	}
  	boosted = nr_boost_reclaim;
  
  restart:
  	sc.priority = DEF_PRIORITY;
9e3b2f8cd   Konstantin Khlebnikov   mm/vmscan: store ...
3425
  	do {
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3426
  		unsigned long nr_reclaimed = sc.nr_reclaimed;
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3427
  		bool raise_priority = true;
1c30844d2   Mel Gorman   mm: reclaim small...
3428
  		bool balanced;
93781325d   Omar Sandoval   lockdep: fix fs_r...
3429
  		bool ret;
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3430

97a225e69   Joonsoo Kim   mm/page_alloc: in...
3431
  		sc.reclaim_idx = highest_zoneidx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3432

86c79f6b5   Mel Gorman   mm: vmscan: do no...
3433
  		/*
84c7a7771   Mel Gorman   mm, vmscan: Have ...
3434
3435
3436
3437
3438
3439
3440
3441
  		 * If the number of buffer_heads exceeds the maximum allowed
  		 * then consider reclaiming from all zones. This has a dual
  		 * purpose -- on 64-bit systems it is expected that
  		 * buffer_heads are stripped during active rotation. On 32-bit
  		 * systems, highmem pages can pin lowmem memory and shrinking
  		 * buffers can relieve lowmem pressure. Reclaim may still not
  		 * go ahead if all eligible zones for the original allocation
  		 * request are balanced to avoid excessive reclaim from kswapd.
86c79f6b5   Mel Gorman   mm: vmscan: do no...
3442
3443
3444
3445
  		 */
  		if (buffer_heads_over_limit) {
  			for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
  				zone = pgdat->node_zones + i;
6aa303def   Mel Gorman   mm, vmscan: only ...
3446
  				if (!managed_zone(zone))
86c79f6b5   Mel Gorman   mm: vmscan: do no...
3447
  					continue;
cc715d99e   Mel Gorman   mm: vmscan: forci...
3448

970a39a36   Mel Gorman   mm, vmscan: avoid...
3449
  				sc.reclaim_idx = i;
e1dbeda60   Andrew Morton   [PATCH] balance_p...
3450
  				break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3451
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3452
  		}
dafcb73e3   Zlatko Calusic   mm: avoid calling...
3453

86c79f6b5   Mel Gorman   mm: vmscan: do no...
3454
  		/*
1c30844d2   Mel Gorman   mm: reclaim small...
3455
3456
3457
3458
3459
3460
  		 * If the pgdat is imbalanced then ignore boosting and preserve
  		 * the watermarks for a later time and restart. Note that the
  		 * zone watermarks will be still reset at the end of balancing
  		 * on the grounds that the normal reclaim should be enough to
  		 * re-evaluate if boosting is required when kswapd next wakes.
  		 */
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3461
  		balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
1c30844d2   Mel Gorman   mm: reclaim small...
3462
3463
3464
3465
3466
3467
3468
3469
3470
  		if (!balanced && nr_boost_reclaim) {
  			nr_boost_reclaim = 0;
  			goto restart;
  		}
  
  		/*
  		 * If boosting is not active then only reclaim if there are no
  		 * eligible zones. Note that sc.reclaim_idx is not used as
  		 * buffer_heads_over_limit may have adjusted it.
86c79f6b5   Mel Gorman   mm: vmscan: do no...
3471
  		 */
1c30844d2   Mel Gorman   mm: reclaim small...
3472
  		if (!nr_boost_reclaim && balanced)
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3473
  			goto out;
e1dbeda60   Andrew Morton   [PATCH] balance_p...
3474

1c30844d2   Mel Gorman   mm: reclaim small...
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
  		/* Limit the priority of boosting to avoid reclaim writeback */
  		if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
  			raise_priority = false;
  
  		/*
  		 * Do not writeback or swap pages for boosted reclaim. The
  		 * intent is to relieve pressure not issue sub-optimal IO
  		 * from reclaim context. If no pages are reclaimed, the
  		 * reclaim will be aborted.
  		 */
  		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
  		sc.may_swap = !nr_boost_reclaim;
1c30844d2   Mel Gorman   mm: reclaim small...
3487

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3488
  		/*
1d82de618   Mel Gorman   mm, vmscan: make ...
3489
3490
3491
3492
3493
  		 * Do some background aging of the anon list, to give
  		 * pages a chance to be referenced before reclaiming. All
  		 * pages are rotated regardless of classzone as this is
  		 * about consistent aging.
  		 */
ef8f23279   Mel Gorman   mm, memcg: move m...
3494
  		age_active_anon(pgdat, &sc);
1d82de618   Mel Gorman   mm, vmscan: make ...
3495
3496
  
  		/*
b7ea3c417   Mel Gorman   mm: vmscan: check...
3497
3498
3499
  		 * If we're having trouble reclaiming, start doing writepage
  		 * even in laptop mode.
  		 */
047d72c30   Johannes Weiner   mm: remove seemin...
3500
  		if (sc.priority < DEF_PRIORITY - 2)
b7ea3c417   Mel Gorman   mm: vmscan: check...
3501
  			sc.may_writepage = 1;
1d82de618   Mel Gorman   mm, vmscan: make ...
3502
3503
3504
  		/* Call soft limit reclaim before calling shrink_node. */
  		sc.nr_scanned = 0;
  		nr_soft_scanned = 0;
ef8f23279   Mel Gorman   mm, memcg: move m...
3505
  		nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
1d82de618   Mel Gorman   mm, vmscan: make ...
3506
3507
  						sc.gfp_mask, &nr_soft_scanned);
  		sc.nr_reclaimed += nr_soft_reclaimed;
b7ea3c417   Mel Gorman   mm: vmscan: check...
3508
  		/*
1d82de618   Mel Gorman   mm, vmscan: make ...
3509
3510
3511
  		 * There should be no need to raise the scanning priority if
  		 * enough pages are already being scanned that the high
  		 * watermark would be met at 100% efficiency.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3512
  		 */
970a39a36   Mel Gorman   mm, vmscan: avoid...
3513
  		if (kswapd_shrink_node(pgdat, &sc))
1d82de618   Mel Gorman   mm, vmscan: make ...
3514
  			raise_priority = false;
5515061d2   Mel Gorman   mm: throttle dire...
3515
3516
3517
3518
3519
3520
3521
  
  		/*
  		 * If the low watermark is met there is no need for processes
  		 * to be throttled on pfmemalloc_wait as they should now be
  		 * able to safely make forward progress. Wake them.
  		 */
  		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3522
  				allow_direct_reclaim(pgdat))
cfc511557   Vlastimil Babka   mm, vmscan: wake ...
3523
  			wake_up_all(&pgdat->pfmemalloc_wait);
5515061d2   Mel Gorman   mm: throttle dire...
3524

b8e83b942   Mel Gorman   mm: vmscan: flatt...
3525
  		/* Check if kswapd should be suspending */
93781325d   Omar Sandoval   lockdep: fix fs_r...
3526
3527
3528
3529
  		__fs_reclaim_release();
  		ret = try_to_freeze();
  		__fs_reclaim_acquire();
  		if (ret || kthread_should_stop())
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3530
  			break;
8357376d3   Rafael J. Wysocki   [PATCH] swsusp: I...
3531

73ce02e96   KOSAKI Motohiro   mm: stop kswapd's...
3532
  		/*
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3533
3534
  		 * Raise priority if scanning rate is too low or there was no
  		 * progress in reclaiming pages
73ce02e96   KOSAKI Motohiro   mm: stop kswapd's...
3535
  		 */
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3536
  		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
1c30844d2   Mel Gorman   mm: reclaim small...
3537
3538
3539
3540
3541
3542
3543
3544
3545
  		nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
  
  		/*
  		 * If reclaim made no progress for a boost, stop reclaim as
  		 * IO cannot be queued and it could be an infinite loop in
  		 * extreme circumstances.
  		 */
  		if (nr_boost_reclaim && !nr_reclaimed)
  			break;
c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3546
  		if (raise_priority || !nr_reclaimed)
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3547
  			sc.priority--;
1d82de618   Mel Gorman   mm, vmscan: make ...
3548
  	} while (sc.priority >= 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3549

c73322d09   Johannes Weiner   mm: fix 100% CPU ...
3550
3551
  	if (!sc.nr_reclaimed)
  		pgdat->kswapd_failures++;
b8e83b942   Mel Gorman   mm: vmscan: flatt...
3552
  out:
1c30844d2   Mel Gorman   mm: reclaim small...
3553
3554
3555
  	/* If reclaim was boosted, account for the reclaim done in this pass */
  	if (boosted) {
  		unsigned long flags;
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3556
  		for (i = 0; i <= highest_zoneidx; i++) {
1c30844d2   Mel Gorman   mm: reclaim small...
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
  			if (!zone_boosts[i])
  				continue;
  
  			/* Increments are under the zone lock */
  			zone = pgdat->node_zones + i;
  			spin_lock_irqsave(&zone->lock, flags);
  			zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
  			spin_unlock_irqrestore(&zone->lock, flags);
  		}
  
  		/*
  		 * As there is now likely space, wake up kcompactd to defragment
  		 * pageblocks.
  		 */
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3571
  		wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
1c30844d2   Mel Gorman   mm: reclaim small...
3572
  	}
2a2e48854   Johannes Weiner   mm: vmscan: fix I...
3573
  	snapshot_refaults(NULL, pgdat);
93781325d   Omar Sandoval   lockdep: fix fs_r...
3574
  	__fs_reclaim_release();
eb414681d   Johannes Weiner   psi: pressure sta...
3575
  	psi_memstall_leave(&pflags);
1732d2b01   Andrew Morton   mm/vmscan.c: add ...
3576
  	set_task_reclaim_state(current, NULL);
e5ca8071f   Yafang Shao   mm/vmscan.c: add ...
3577

0abdee2bd   Mel Gorman   mm: kswapd: use t...
3578
  	/*
1d82de618   Mel Gorman   mm, vmscan: make ...
3579
3580
3581
3582
  	 * Return the order kswapd stopped reclaiming at as
  	 * prepare_kswapd_sleep() takes it into account. If another caller
  	 * entered the allocator slow path while kswapd was awake, order will
  	 * remain at the higher level.
0abdee2bd   Mel Gorman   mm: kswapd: use t...
3583
  	 */
1d82de618   Mel Gorman   mm, vmscan: make ...
3584
  	return sc.order;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3585
  }
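
  /*
   * The priority loop above, reduced to a sketch:
   *
   *   restart:
   *           sc.priority = DEF_PRIORITY;
   *           do {
   *                   if (imbalanced while boosting)   cancel boost, restart;
   *                   if (balanced && no boost pending) goto out;
   *                   age_active_anon();
   *                   mem_cgroup_soft_limit_reclaim();
   *                   kswapd_shrink_node();            <- the actual reclaim
   *                   wake pfmemalloc_wait sleepers if reserves recovered;
   *                   lower sc.priority (scan harder) unless enough was
   *                   already scanned;
   *           } while (sc.priority >= 1);
   *           if nothing was reclaimed at any priority
   *                   pgdat->kswapd_failures++;
   *
   * Boost-driven passes run with may_writepage and may_swap disabled
   * and, per the comment in the loop, never push the priority beyond
   * DEF_PRIORITY - 2.
   */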
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3586
  /*
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3587
3588
3589
3590
3591
   * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
   * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
   * not a valid index then either kswapd runs for first time or kswapd couldn't
   * sleep after previous reclaim attempt (node is still unbalanced). In that
   * case return the zone index of the previous kswapd reclaim cycle.
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3592
   */
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3593
3594
  static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
  					   enum zone_type prev_highest_zoneidx)
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3595
  {
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3596
  	enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
5644e1fbb   Qian Cai   mm/vmscan.c: fix ...
3597

97a225e69   Joonsoo Kim   mm/page_alloc: in...
3598
  	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3599
  }
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3600
  static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3601
  				unsigned int highest_zoneidx)
f0bc0a60b   KOSAKI Motohiro   vmscan: factor ou...
3602
3603
3604
3605
3606
3607
3608
3609
  {
  	long remaining = 0;
  	DEFINE_WAIT(wait);
  
  	if (freezing(current) || kthread_should_stop())
  		return;
  
  	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
333b0a459   Shantanu Goel   mm, vmscan: fix z...
3610
3611
3612
3613
3614
3615
3616
  	/*
  	 * Try to sleep for a short interval. Note that kcompactd will only be
  	 * woken if it is possible to sleep for a short interval. This is
  	 * deliberate on the assumption that if reclaim cannot keep an
  	 * eligible zone balanced then it's also unlikely that compaction will
  	 * succeed.
  	 */
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3617
  	if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
fd901c953   Vlastimil Babka   mm: wake kcompact...
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
  		/*
  		 * Compaction records what page blocks it recently failed to
  		 * isolate pages from and skips them in the future scanning.
  		 * When kswapd is going to sleep, it is reasonable to assume
  		 * that pages and compaction may succeed so reset the cache.
  		 */
  		reset_isolation_suitable(pgdat);
  
  		/*
  		 * We have freed the memory, now we should compact it to make
  		 * allocation of the requested order possible.
  		 */
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3630
  		wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
fd901c953   Vlastimil Babka   mm: wake kcompact...
3631

f0bc0a60b   KOSAKI Motohiro   vmscan: factor ou...
3632
  		remaining = schedule_timeout(HZ/10);
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3633
3634
  
  		/*
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3635
  		 * If woken prematurely then reset kswapd_highest_zoneidx and
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3636
3637
3638
3639
  		 * order. The values will either be from a wakeup request or
  		 * the previous request that slept prematurely.
  		 */
  		if (remaining) {
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3640
3641
3642
  			WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
  					kswapd_highest_zoneidx(pgdat,
  							highest_zoneidx));
5644e1fbb   Qian Cai   mm/vmscan.c: fix ...
3643
3644
3645
  
  			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
  				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3646
  		}
f0bc0a60b   KOSAKI Motohiro   vmscan: factor ou...
3647
3648
3649
3650
3651
3652
3653
3654
  		finish_wait(&pgdat->kswapd_wait, &wait);
  		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
  	}
  
  	/*
  	 * After a short sleep, check if it was a premature sleep. If not, then
  	 * go fully to sleep until explicitly woken up.
  	 */
d9f21d426   Mel Gorman   mm, vmscan: avoid...
3655
  	if (!remaining &&
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3656
  	    prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
f0bc0a60b   KOSAKI Motohiro   vmscan: factor ou...
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
  		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
  
  		/*
  		 * vmstat counters are not perfectly accurate and the estimated
  		 * value for counters such as NR_FREE_PAGES can deviate from the
  		 * true value by nr_online_cpus * threshold. To avoid the zone
  		 * watermarks being breached while under pressure, we reduce the
  		 * per-cpu vmstat threshold while kswapd is awake and restore
  		 * them before going back to sleep.
  		 */
  		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
1c7e7f6c0   Aaditya Kumar   mm: fix lost kswa...
3668
3669
3670
  
  		if (!kthread_should_stop())
  			schedule();
f0bc0a60b   KOSAKI Motohiro   vmscan: factor ou...
3671
3672
3673
3674
3675
3676
3677
3678
3679
  		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
  	} else {
  		if (remaining)
  			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
  		else
  			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
  	}
  	finish_wait(&pgdat->kswapd_wait, &wait);
  }
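
  /*
   * Timeline sketch of the two-stage sleep above: if the node looks
   * balanced, kswapd first wakes kcompactd, resets the compaction skip
   * cache and naps for HZ/10. Only when that nap completes undisturbed
   * and the node still looks balanced does it restore the normal per-cpu
   * vmstat thresholds and sleep indefinitely until the next
   * wakeup_kswapd(). Premature or refused sleeps are counted as
   * KSWAPD_LOW_WMARK_HIT_QUICKLY / KSWAPD_HIGH_WMARK_HIT_QUICKLY so they
   * are visible in /proc/vmstat.
   */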
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3680
3681
  /*
   * The background pageout daemon, started as a kernel thread
4f98a2fee   Rik van Riel   vmscan: split LRU...
3682
   * from the init process.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
   *
   * This basically trickles out pages so that we have _some_
   * free memory available even if there is no other activity
   * that frees anything up. This is needed for things like routing
   * etc, where we otherwise might have all activity going on in
   * asynchronous contexts that cannot page things out.
   *
   * If there are applications that are active memory-allocators
   * (most normal use), this basically shouldn't matter.
   */
  static int kswapd(void *p)
  {
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3695
  	unsigned int alloc_order, reclaim_order;
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3696
  	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3697
3698
  	pg_data_t *pgdat = (pg_data_t*)p;
  	struct task_struct *tsk = current;
a70f73028   Rusty Russell   cpumask: replace ...
3699
  	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3700

174596a0b   Rusty Russell   cpumask: convert mm/
3701
  	if (!cpumask_empty(cpumask))
c5f59f083   Mike Travis   nodemask: use new...
3702
  		set_cpus_allowed_ptr(tsk, cpumask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
  
  	/*
  	 * Tell the memory management that we're a "memory allocator",
  	 * and that if we need more memory we should get access to it
  	 * regardless (see "__alloc_pages()"). "kswapd" should
  	 * never get caught in the normal page freeing logic.
  	 *
  	 * (Kswapd normally doesn't need memory anyway, but sometimes
  	 * you need a small amount of memory in order to be able to
  	 * page out something else, and this flag essentially protects
  	 * us from recursively trying to free more memory as we're
  	 * trying to free the first piece of memory in the first place).
  	 */
930d91525   Christoph Lameter   [PATCH] Swap Migr...
3716
  	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
831441862   Rafael J. Wysocki   Freezer: make ker...
3717
  	set_freezable();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3718

5644e1fbb   Qian Cai   mm/vmscan.c: fix ...
3719
  	WRITE_ONCE(pgdat->kswapd_order, 0);
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3720
  	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3721
  	for ( ; ; ) {
6f6313d48   Jeff Liu   mm/vmscan.c: try_...
3722
  		bool ret;
3e1d1d28d   Christoph Lameter   [PATCH] Cleanup p...
3723

5644e1fbb   Qian Cai   mm/vmscan.c: fix ...
3724
  		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3725
3726
  		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
  							highest_zoneidx);
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3727

38087d9b0   Mel Gorman   mm, vmscan: simpl...
3728
3729
  kswapd_try_sleep:
  		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3730
  					highest_zoneidx);
215ddd666   Mel Gorman   mm: vmscan: only ...
3731

97a225e69   Joonsoo Kim   mm/page_alloc: in...
3732
  		/* Read the new order and highest_zoneidx */
5644e1fbb   Qian Cai   mm/vmscan.c: fix ...
3733
  		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3734
3735
  		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
  							highest_zoneidx);
5644e1fbb   Qian Cai   mm/vmscan.c: fix ...
3736
  		WRITE_ONCE(pgdat->kswapd_order, 0);
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3737
  		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3738

8fe23e057   David Rientjes   mm: clear node in...
3739
3740
3741
3742
3743
3744
3745
3746
  		ret = try_to_freeze();
  		if (kthread_should_stop())
  			break;
  
  		/*
  		 * We can speed up thawing tasks if we don't call balance_pgdat
  		 * after returning from the refrigerator
  		 */
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
  		if (ret)
  			continue;
  
  		/*
  		 * Reclaim begins at the requested order but if a high-order
  		 * reclaim fails then kswapd falls back to reclaiming for
  		 * order-0. If that happens, kswapd will consider sleeping
  		 * for the order it finished reclaiming at (reclaim_order)
  		 * but kcompactd is woken to compact for the original
  		 * request (alloc_order).
  		 */
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3758
  		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
e5146b12e   Mel Gorman   mm, vmscan: add c...
3759
  						alloc_order);
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3760
3761
  		reclaim_order = balance_pgdat(pgdat, alloc_order,
  						highest_zoneidx);
38087d9b0   Mel Gorman   mm, vmscan: simpl...
3762
3763
  		if (reclaim_order < alloc_order)
  			goto kswapd_try_sleep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3764
  	}
b0a8cc58e   Takamori Yamaguchi   mm: bugfix: set c...
3765

71abdc15a   Johannes Weiner   mm: vmscan: clear...
3766
  	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
71abdc15a   Johannes Weiner   mm: vmscan: clear...
3767

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3768
3769
3770
3771
  	return 0;
  }
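
  /*
   * Handshake between wakers and the loop above, in short: callers of
   * wakeup_kswapd() only ever raise pgdat->kswapd_order and
   * pgdat->kswapd_highest_zoneidx (via READ_ONCE/WRITE_ONCE, no lock);
   * kswapd samples both, resets them to 0 / MAX_NR_ZONES, and runs
   * balance_pgdat(). If a high-order request had to fall back to order-0
   * reclaim, kswapd goes back to kswapd_try_to_sleep() with the original
   * alloc_order so kcompactd is still asked to compact for it.
   */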
  
  /*
5ecd9d403   David Rientjes   mm, page_alloc: w...
3772
3773
3774
3775
3776
   * A zone is low on free memory or too fragmented for high-order memory.  If
   * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
   * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
   * has failed or is not needed, still wake up kcompactd if only compaction is
   * needed.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3777
   */
5ecd9d403   David Rientjes   mm, page_alloc: w...
3778
  void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3779
  		   enum zone_type highest_zoneidx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3780
3781
  {
  	pg_data_t *pgdat;
5644e1fbb   Qian Cai   mm/vmscan.c: fix ...
3782
  	enum zone_type curr_idx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3783

6aa303def   Mel Gorman   mm, vmscan: only ...
3784
  	if (!managed_zone(zone))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3785
  		return;
5ecd9d403   David Rientjes   mm, page_alloc: w...
3786
  	if (!cpuset_zone_allowed(zone, gfp_flags))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3787
  		return;
5644e1fbb   Qian Cai   mm/vmscan.c: fix ...
3788

88f5acf88   Mel Gorman   mm: page allocato...
3789
  	pgdat = zone->zone_pgdat;
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3790
  	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
5644e1fbb   Qian Cai   mm/vmscan.c: fix ...
3791

97a225e69   Joonsoo Kim   mm/page_alloc: in...
3792
3793
  	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
  		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
5644e1fbb   Qian Cai   mm/vmscan.c: fix ...
3794
3795
3796
  
  	if (READ_ONCE(pgdat->kswapd_order) < order)
  		WRITE_ONCE(pgdat->kswapd_order, order);
dffcac2cb   Shakeel Butt   mm/vmscan.c: prev...
3797

8d0986e28   Con Kolivas   [PATCH] vm: kswap...
3798
  	if (!waitqueue_active(&pgdat->kswapd_wait))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3799
  		return;
e1a556374   Mel Gorman   mm, vmscan: only ...
3800

5ecd9d403   David Rientjes   mm, page_alloc: w...
3801
3802
  	/* Hopeless node, leave it to direct reclaim if possible */
  	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3803
3804
  	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
  	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
5ecd9d403   David Rientjes   mm, page_alloc: w...
3805
3806
3807
3808
3809
3810
3811
3812
  		/*
  		 * There may be plenty of free memory available, but it's too
  		 * fragmented for high-order allocations.  Wake up kcompactd
  		 * and rely on compaction_suitable() to determine if it's
  		 * needed.  If it fails, it will defer subsequent attempts to
  		 * ratelimit its work.
  		 */
  		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
97a225e69   Joonsoo Kim   mm/page_alloc: in...
3813
  			wakeup_kcompactd(pgdat, order, highest_zoneidx);
e716f2eb2   Mel Gorman   mm, vmscan: preve...
3814
  		return;
5ecd9d403   David Rientjes   mm, page_alloc: w...
3815
  	}
88f5acf88   Mel Gorman   mm: page allocato...
3816

97a225e69   Joonsoo Kim   mm/page_alloc: in...
3817
  	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
5ecd9d403   David Rientjes   mm, page_alloc: w...
3818
  				      gfp_flags);
8d0986e28   Con Kolivas   [PATCH] vm: kswap...
3819
  	wake_up_interruptible(&pgdat->kswapd_wait);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3820
  }
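
  /*
   * Typical call-site shape (a sketch; the best-known waker is the page
   * allocator's slow path, outside this file):
   *
   *   for each zone usable by the allocation
   *           wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
   *
   * The call is cheap when kswapd is already awake, or when the node is
   * already balanced and not boosted: it bails out before touching the
   * waitqueue in those cases.
   */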
c6f37f121   Rafael J. Wysocki   PM/Suspend: Do no...
3821
  #ifdef CONFIG_HIBERNATION
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3822
  /*
7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3823
   * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
3824
3825
3826
3827
3828
   * freed pages.
   *
   * Rather than trying to age LRUs the aim is to preserve the overall
   * LRU order by reclaiming preferentially
   * inactive > active > active referenced > active mapped
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3829
   */
7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3830
  unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3831
  {
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
3832
  	struct scan_control sc = {
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3833
  		.nr_to_reclaim = nr_to_reclaim,
7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3834
  		.gfp_mask = GFP_HIGHUSER_MOVABLE,
b2e18757f   Mel Gorman   mm, vmscan: begin...
3835
  		.reclaim_idx = MAX_NR_ZONES - 1,
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3836
  		.priority = DEF_PRIORITY,
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
3837
  		.may_writepage = 1,
ee814fe23   Johannes Weiner   mm: vmscan: clean...
3838
3839
  		.may_unmap = 1,
  		.may_swap = 1,
7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3840
  		.hibernation_mode = 1,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3841
  	};
a09ed5e00   Ying Han   vmscan: change sh...
3842
  	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3843
  	unsigned long nr_reclaimed;
499118e96   Vlastimil Babka   mm: introduce mem...
3844
  	unsigned int noreclaim_flag;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3845

d92a8cfcb   Peter Zijlstra   locking/lockdep: ...
3846
  	fs_reclaim_acquire(sc.gfp_mask);
93781325d   Omar Sandoval   lockdep: fix fs_r...
3847
  	noreclaim_flag = memalloc_noreclaim_save();
1732d2b01   Andrew Morton   mm/vmscan.c: add ...
3848
  	set_task_reclaim_state(current, &sc.reclaim_state);
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
3849

3115cd914   Vladimir Davydov   mm: vmscan: remov...
3850
  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
d979677c4   MinChan Kim   mm: shrink_all_me...
3851

1732d2b01   Andrew Morton   mm/vmscan.c: add ...
3852
  	set_task_reclaim_state(current, NULL);
499118e96   Vlastimil Babka   mm: introduce mem...
3853
  	memalloc_noreclaim_restore(noreclaim_flag);
93781325d   Omar Sandoval   lockdep: fix fs_r...
3854
  	fs_reclaim_release(sc.gfp_mask);
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
3855

7b51755c3   KOSAKI Motohiro   vmscan: kill hibe...
3856
  	return nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3857
  }
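
  /*
   * Hibernation is the only user of this path (note the
   * CONFIG_HIBERNATION guard): it asks for a fixed number of pages to be
   * freed system-wide before the snapshot image is created, which is why
   * the scan control above ignores laptop_mode, permits writeback and
   * swap, and sets hibernation_mode.
   */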
c6f37f121   Rafael J. Wysocki   PM/Suspend: Do no...
3858
  #endif /* CONFIG_HIBERNATION */

/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are
 * hot-added.
 */
int kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kswapd)
		return 0;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state < SYSTEM_RUNNING);
		pr_err("Failed to start kswapd on node %d\n", nid);
		ret = PTR_ERR(pgdat->kswapd);
		pgdat->kswapd = NULL;
	}
	return ret;
}

/*
 * Called by memory hotplug when all memory in a node is offlined.  Caller must
 * hold mem_hotplug_begin/end().
 */
void kswapd_stop(int nid)
{
	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;

	if (kswapd) {
		kthread_stop(kswapd);
		NODE_DATA(nid)->kswapd = NULL;
	}
}
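
/*
 * Illustrative sketch (not part of the original file): how a memory
 * hot(un)plug path might drive the two helpers above, per their comments.
 * example_node_online()/example_node_offline() are hypothetical names;
 * kswapd_run() and kswapd_stop() are the real interfaces from this file.
 */
#if 0	/* example only, not compiled */
static int example_node_online(int nid)
{
	/* Start the per-node kswapd once the node first gains memory. */
	return kswapd_run(nid);
}

static void example_node_offline(int nid)
{
	/* Once the last memory of the node is offlined, stop its kswapd. */
	kswapd_stop(nid);
}
#endif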

static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	for_each_node_state(nid, N_MEMORY)
		kswapd_run(nid);
	return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Node reclaim mode
 *
 * If non-zero call node_reclaim when the number of free pages falls below
 * the watermarks.
 */
int node_reclaim_mode __read_mostly;

#define RECLAIM_WRITE (1<<0)	/* Writeout pages during reclaim */
#define RECLAIM_UNMAP (1<<1)	/* Unmap pages during reclaim */

/*
 * Priority for NODE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.
 */
#define NODE_RECLAIM_PRIORITY 4
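
/*
 * Worked example (added for illustration, not part of the original file):
 * reclaim scans roughly lru_size >> sc->priority pages per LRU list per
 * pass, so starting at NODE_RECLAIM_PRIORITY == 4 means __node_reclaim()
 * below first considers about 1/16th of each list, then 1/8th, 1/4th and
 * so on as its do/while loop drops sc.priority towards 0 (the whole list).
 * node_reclaim_mode itself is normally configured from userspace through
 * the vm.zone_reclaim_mode sysctl rather than written directly.  The
 * helper name example_scan_target() is hypothetical.
 */
#if 0	/* example only, not compiled */
static unsigned long example_scan_target(unsigned long lru_size, int priority)
{
	/* priority 4: lru_size >> 4 == lru_size / 16 */
	return lru_size >> priority;
}
#endif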

/*
 * Percentage of pages in a zone that must be unmapped for node_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;
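
/*
 * Worked example (added for illustration, not part of the original file):
 * the per-node thresholds pgdat->min_unmapped_pages and
 * pgdat->min_slab_pages tested below are derived from these ratios as a
 * percentage of the node's managed pages (see the setup helpers in
 * mm/page_alloc.c).  For a node with 16 GiB of 4 KiB pages (4,194,304
 * pages), the defaults work out to roughly:
 *
 *   min_unmapped_pages ~= 4,194,304 * 1%  ~=  41,943 pages (~164 MiB)
 *   min_slab_pages     ~= 4,194,304 * 5%  ~= 209,715 pages (~819 MiB)
 */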

static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
{
	unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
	unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
		node_page_state(pgdat, NR_ACTIVE_FILE);

	/*
	 * It's possible for there to be more file mapped pages than
	 * accounted for by the pages on the file LRU lists because
	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
	 */
	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}
  
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
	unsigned long nr_pagecache_reclaimable;
	unsigned long delta = 0;

	/*
	 * If RECLAIM_UNMAP is set, then all file pages are considered
	 * potentially reclaimable. Otherwise, we have to worry about
	 * pages like swapcache and node_unmapped_file_pages() provides
	 * a better estimate
	 */
	if (node_reclaim_mode & RECLAIM_UNMAP)
		nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
	else
		nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);

	/* If we can't clean pages, remove dirty pages from consideration */
	if (!(node_reclaim_mode & RECLAIM_WRITE))
		delta += node_page_state(pgdat, NR_FILE_DIRTY);

	/* Watch for any possible underflows due to delta */
	if (unlikely(delta > nr_pagecache_reclaimable))
		delta = nr_pagecache_reclaimable;

	return nr_pagecache_reclaimable - delta;
}
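
/*
 * Worked example (added for illustration, not part of the original file):
 * with RECLAIM_UNMAP and RECLAIM_WRITE both clear, suppose a node has
 * 900,000 pages on the file LRUs, 300,000 NR_FILE_MAPPED pages and
 * 50,000 NR_FILE_DIRTY pages.  node_unmapped_file_pages() returns
 * 900,000 - 300,000 = 600,000, the dirty pages are then subtracted, and
 * node_pagecache_reclaimable() reports 550,000 potentially reclaimable
 * page cache pages.
 */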

/*
 * Try to free up some pages from this node through reclaim.
 */
static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	unsigned int noreclaim_flag;
	struct scan_control sc = {
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
		.gfp_mask = current_gfp_context(gfp_mask),
		.order = order,
		.priority = NODE_RECLAIM_PRIORITY,
		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
		.may_swap = 1,
		.reclaim_idx = gfp_zone(gfp_mask),
	};

	trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
					   sc.gfp_mask);
	cond_resched();
	fs_reclaim_acquire(sc.gfp_mask);
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_UNMAP
	 * and we also need to be able to write out pages for RECLAIM_WRITE
	 * and RECLAIM_UNMAP.
	 */
	noreclaim_flag = memalloc_noreclaim_save();
	p->flags |= PF_SWAPWRITE;
	set_task_reclaim_state(p, &sc.reclaim_state);

	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
		/*
		 * Free memory by calling shrink node with increasing
		 * priorities until we have enough memory freed.
		 */
		do {
			shrink_node(pgdat, &sc);
		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
	}

	set_task_reclaim_state(p, NULL);
	current->flags &= ~PF_SWAPWRITE;
	memalloc_noreclaim_restore(noreclaim_flag);
	fs_reclaim_release(sc.gfp_mask);

	trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
	return sc.nr_reclaimed >= nr_pages;
}

int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
	int ret;

	/*
	 * Node reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the node is overallocated. So we do not reclaim
	 * if less than a specified percentage of the node is used by
	 * unmapped file backed pages.
	 */
	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
	    node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
	    pgdat->min_slab_pages)
		return NODE_RECLAIM_FULL;

	/*
	 * Do not scan if the allocation should not be delayed.
	 */
	if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
		return NODE_RECLAIM_NOSCAN;

	/*
	 * Only run node reclaim on the local node or on nodes that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
		return NODE_RECLAIM_NOSCAN;

	if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
		return NODE_RECLAIM_NOSCAN;

	ret = __node_reclaim(pgdat, gfp_mask, order);
	clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);

	if (!ret)
		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

	return ret;
}
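
/*
 * Illustrative sketch (not part of the original file), loosely modelled on
 * how the page allocator's slow path can use the return codes above:
 * NODE_RECLAIM_NOSCAN and NODE_RECLAIM_FULL (defined in mm/internal.h)
 * mean the zone should simply be skipped, anything else means some
 * progress may have been made and the watermark is worth rechecking.
 * example_try_node_reclaim() and its watermark arguments are hypothetical.
 */
#if 0	/* example only, not compiled */
static bool example_try_node_reclaim(struct zone *zone, gfp_t gfp_mask,
				     unsigned int order, unsigned long mark)
{
	switch (node_reclaim(zone->zone_pgdat, gfp_mask, order)) {
	case NODE_RECLAIM_NOSCAN:	/* did not scan (atomic, remote, locked) */
	case NODE_RECLAIM_FULL:		/* nothing left that is reclaimable */
		return false;
	default:
		/* Some progress was made; recheck the watermark. */
		return zone_watermark_ok(zone, order, mark, zone_idx(zone), 0);
	}
}
#endif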
#endif

/**
 * check_move_unevictable_pages - check pages for evictability and move to
 * the appropriate zone lru list
 * @pvec: pagevec with lru pages to check
 *
 * Checks pages for evictability; if an evictable page sits on the
 * unevictable lru list, it is moved to the appropriate evictable lru list.
 * This function should only be used for lru pages.
 */
void check_move_unevictable_pages(struct pagevec *pvec)
{
	struct lruvec *lruvec;
	struct pglist_data *pgdat = NULL;
	int pgscanned = 0;
	int pgrescued = 0;
	int i;

	for (i = 0; i < pvec->nr; i++) {
		struct page *page = pvec->pages[i];
		struct pglist_data *pagepgdat = page_pgdat(page);
		int nr_pages;

		if (PageTransTail(page))
			continue;

		nr_pages = thp_nr_pages(page);
		pgscanned += nr_pages;

		if (pagepgdat != pgdat) {
			if (pgdat)
				spin_unlock_irq(&pgdat->lru_lock);
			pgdat = pagepgdat;
			spin_lock_irq(&pgdat->lru_lock);
		}
		lruvec = mem_cgroup_page_lruvec(page, pgdat);

		if (!PageLRU(page) || !PageUnevictable(page))
			continue;

		if (page_evictable(page)) {
			enum lru_list lru = page_lru_base_type(page);

			VM_BUG_ON_PAGE(PageActive(page), page);
			ClearPageUnevictable(page);
			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
			add_page_to_lru_list(page, lruvec, lru);
			pgrescued += nr_pages;
		}
	}

	if (pgdat) {
		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
		spin_unlock_irq(&pgdat->lru_lock);
	}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
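
/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * driver that had marked shmem pages unevictable while pinned and now
 * returns them to the normal LRUs.  example_unpin_pages(), the pages[]
 * array and npages are made up; pagevec_init(), pagevec_add(),
 * pagevec_reinit(), pagevec_count() and check_move_unevictable_pages()
 * are real interfaces.
 */
#if 0	/* example only, not compiled */
static void example_unpin_pages(struct page **pages, unsigned int npages)
{
	struct pagevec pvec;
	unsigned int i;

	pagevec_init(&pvec);
	for (i = 0; i < npages; i++) {
		/* Batch pages; drain the pagevec whenever it fills up. */
		if (!pagevec_add(&pvec, pages[i])) {
			check_move_unevictable_pages(&pvec);
			pagevec_reinit(&pvec);
		}
	}
	if (pagevec_count(&pvec))
		check_move_unevictable_pages(&pvec);
}
#endif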