Commit 2ce666c175bb502cae050c97364acf48449b9d6a

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent 2d37a72e40

mm: vmscan: use proportional scanning during direct reclaim and full scan at DEF_PRIORITY

commit 1a501907bbea8e6ebb0b16cf6db9e9cbf1d2c813 upstream.

Commit "mm: vmscan: obey proportional scanning requirements for kswapd"
ensured that file/anon lists were scanned proportionally for reclaim from
kswapd but ignored it for direct reclaim.  The intent was to minimse
direct reclaim latency but Yuanhan Liu pointer out that it substitutes one
long stall for many small stalls and distorts aging for normal workloads
like streaming readers/writers.  Hugh Dickins pointed out that a
side-effect of the same commit was that when one LRU list dropped to zero
that the entirety of the other list was shrunk leading to excessive
reclaim in memcgs.  This patch scans the file/anon lists proportionally
for direct reclaim to similarly age page whether reclaimed by kswapd or
direct reclaim but takes care to abort reclaim if one LRU drops to zero
after reclaiming the requested number of pages.

Based on ext4 and using the Intel VM scalability test

                                              3.15.0-rc5            3.15.0-rc5
                                                shrinker            proportion
Unit  lru-file-readonce    elapsed      5.3500 (  0.00%)      5.4200 ( -1.31%)
Unit  lru-file-readonce time_range      0.2700 (  0.00%)      0.1400 ( 48.15%)
Unit  lru-file-readonce time_stddv      0.1148 (  0.00%)      0.0536 ( 53.33%)
Unit lru-file-readtwice    elapsed      8.1700 (  0.00%)      8.1700 (  0.00%)
Unit lru-file-readtwice time_range      0.4300 (  0.00%)      0.2300 ( 46.51%)
Unit lru-file-readtwice time_stddv      0.1650 (  0.00%)      0.0971 ( 41.16%)

The test cases run multiple dd instances reading sparse files.  The results
are within the noise for the small test machine.  The impact of the patch
is more noticeable in the vmstats:

                            3.15.0-rc5  3.15.0-rc5
                              shrinker  proportion
Minor Faults                     35154       36784
Major Faults                       611        1305
Swap Ins                           394        1651
Swap Outs                         4394        5891
Allocation stalls               118616       44781
Direct pages scanned           4935171     4602313
Kswapd pages scanned          15921292    16258483
Kswapd pages reclaimed        15913301    16248305
Direct pages reclaimed         4933368     4601133
Kswapd efficiency                  99%         99%
Kswapd velocity             670088.047  682555.961
Direct efficiency                  99%         99%
Direct velocity             207709.217  193212.133
Percentage direct scans            23%         22%
Page writes by reclaim        4858.000    6232.000
Page writes file                   464         341
Page writes anon                  4394        5891

Note that there are fewer allocation stalls even though the amount
of direct reclaim scanning is roughly the same.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Tested-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 1 changed file with 25 additions and 11 deletions (mm/vmscan.c)

1 /* 1 /*
2 * linux/mm/vmscan.c 2 * linux/mm/vmscan.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * 5 *
6 * Swap reorganised 29.12.95, Stephen Tweedie. 6 * Swap reorganised 29.12.95, Stephen Tweedie.
7 * kswapd added: 7.1.96 sct 7 * kswapd added: 7.1.96 sct
8 * Removed kswapd_ctl limits, and swap out as many pages as needed 8 * Removed kswapd_ctl limits, and swap out as many pages as needed
9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
11 * Multiqueue VM started 5.8.00, Rik van Riel. 11 * Multiqueue VM started 5.8.00, Rik van Riel.
12 */ 12 */
13 13
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/gfp.h> 16 #include <linux/gfp.h>
17 #include <linux/kernel_stat.h> 17 #include <linux/kernel_stat.h>
18 #include <linux/swap.h> 18 #include <linux/swap.h>
19 #include <linux/pagemap.h> 19 #include <linux/pagemap.h>
20 #include <linux/init.h> 20 #include <linux/init.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/vmpressure.h> 22 #include <linux/vmpressure.h>
23 #include <linux/vmstat.h> 23 #include <linux/vmstat.h>
24 #include <linux/file.h> 24 #include <linux/file.h>
25 #include <linux/writeback.h> 25 #include <linux/writeback.h>
26 #include <linux/blkdev.h> 26 #include <linux/blkdev.h>
27 #include <linux/buffer_head.h> /* for try_to_release_page(), 27 #include <linux/buffer_head.h> /* for try_to_release_page(),
28 buffer_heads_over_limit */ 28 buffer_heads_over_limit */
29 #include <linux/mm_inline.h> 29 #include <linux/mm_inline.h>
30 #include <linux/backing-dev.h> 30 #include <linux/backing-dev.h>
31 #include <linux/rmap.h> 31 #include <linux/rmap.h>
32 #include <linux/topology.h> 32 #include <linux/topology.h>
33 #include <linux/cpu.h> 33 #include <linux/cpu.h>
34 #include <linux/cpuset.h> 34 #include <linux/cpuset.h>
35 #include <linux/compaction.h> 35 #include <linux/compaction.h>
36 #include <linux/notifier.h> 36 #include <linux/notifier.h>
37 #include <linux/rwsem.h> 37 #include <linux/rwsem.h>
38 #include <linux/delay.h> 38 #include <linux/delay.h>
39 #include <linux/kthread.h> 39 #include <linux/kthread.h>
40 #include <linux/freezer.h> 40 #include <linux/freezer.h>
41 #include <linux/memcontrol.h> 41 #include <linux/memcontrol.h>
42 #include <linux/delayacct.h> 42 #include <linux/delayacct.h>
43 #include <linux/sysctl.h> 43 #include <linux/sysctl.h>
44 #include <linux/oom.h> 44 #include <linux/oom.h>
45 #include <linux/prefetch.h> 45 #include <linux/prefetch.h>
46 46
47 #include <asm/tlbflush.h> 47 #include <asm/tlbflush.h>
48 #include <asm/div64.h> 48 #include <asm/div64.h>
49 49
50 #include <linux/swapops.h> 50 #include <linux/swapops.h>
51 #include <linux/balloon_compaction.h> 51 #include <linux/balloon_compaction.h>
52 52
53 #include "internal.h" 53 #include "internal.h"
54 54
55 #define CREATE_TRACE_POINTS 55 #define CREATE_TRACE_POINTS
56 #include <trace/events/vmscan.h> 56 #include <trace/events/vmscan.h>
57 57
58 struct scan_control { 58 struct scan_control {
59 /* Incremented by the number of inactive pages that were scanned */ 59 /* Incremented by the number of inactive pages that were scanned */
60 unsigned long nr_scanned; 60 unsigned long nr_scanned;
61 61
62 /* Number of pages freed so far during a call to shrink_zones() */ 62 /* Number of pages freed so far during a call to shrink_zones() */
63 unsigned long nr_reclaimed; 63 unsigned long nr_reclaimed;
64 64
65 /* How many pages shrink_list() should reclaim */ 65 /* How many pages shrink_list() should reclaim */
66 unsigned long nr_to_reclaim; 66 unsigned long nr_to_reclaim;
67 67
68 unsigned long hibernation_mode; 68 unsigned long hibernation_mode;
69 69
70 /* This context's GFP mask */ 70 /* This context's GFP mask */
71 gfp_t gfp_mask; 71 gfp_t gfp_mask;
72 72
73 int may_writepage; 73 int may_writepage;
74 74
75 /* Can mapped pages be reclaimed? */ 75 /* Can mapped pages be reclaimed? */
76 int may_unmap; 76 int may_unmap;
77 77
78 /* Can pages be swapped as part of reclaim? */ 78 /* Can pages be swapped as part of reclaim? */
79 int may_swap; 79 int may_swap;
80 80
81 int order; 81 int order;
82 82
83 /* Scan (total_size >> priority) pages at once */ 83 /* Scan (total_size >> priority) pages at once */
84 int priority; 84 int priority;
85 85
86 /* 86 /*
87 * The memory cgroup that hit its limit and as a result is the 87 * The memory cgroup that hit its limit and as a result is the
88 * primary target of this reclaim invocation. 88 * primary target of this reclaim invocation.
89 */ 89 */
90 struct mem_cgroup *target_mem_cgroup; 90 struct mem_cgroup *target_mem_cgroup;
91 91
92 /* 92 /*
93 * Nodemask of nodes allowed by the caller. If NULL, all nodes 93 * Nodemask of nodes allowed by the caller. If NULL, all nodes
94 * are scanned. 94 * are scanned.
95 */ 95 */
96 nodemask_t *nodemask; 96 nodemask_t *nodemask;
97 }; 97 };
98 98
99 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 99 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
100 100
101 #ifdef ARCH_HAS_PREFETCH 101 #ifdef ARCH_HAS_PREFETCH
102 #define prefetch_prev_lru_page(_page, _base, _field) \ 102 #define prefetch_prev_lru_page(_page, _base, _field) \
103 do { \ 103 do { \
104 if ((_page)->lru.prev != _base) { \ 104 if ((_page)->lru.prev != _base) { \
105 struct page *prev; \ 105 struct page *prev; \
106 \ 106 \
107 prev = lru_to_page(&(_page->lru)); \ 107 prev = lru_to_page(&(_page->lru)); \
108 prefetch(&prev->_field); \ 108 prefetch(&prev->_field); \
109 } \ 109 } \
110 } while (0) 110 } while (0)
111 #else 111 #else
112 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) 112 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
113 #endif 113 #endif
114 114
115 #ifdef ARCH_HAS_PREFETCHW 115 #ifdef ARCH_HAS_PREFETCHW
116 #define prefetchw_prev_lru_page(_page, _base, _field) \ 116 #define prefetchw_prev_lru_page(_page, _base, _field) \
117 do { \ 117 do { \
118 if ((_page)->lru.prev != _base) { \ 118 if ((_page)->lru.prev != _base) { \
119 struct page *prev; \ 119 struct page *prev; \
120 \ 120 \
121 prev = lru_to_page(&(_page->lru)); \ 121 prev = lru_to_page(&(_page->lru)); \
122 prefetchw(&prev->_field); \ 122 prefetchw(&prev->_field); \
123 } \ 123 } \
124 } while (0) 124 } while (0)
125 #else 125 #else
126 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) 126 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
127 #endif 127 #endif
128 128
129 /* 129 /*
130 * From 0 .. 100. Higher means more swappy. 130 * From 0 .. 100. Higher means more swappy.
131 */ 131 */
132 int vm_swappiness = 60; 132 int vm_swappiness = 60;
133 unsigned long vm_total_pages; /* The total number of pages which the VM controls */ 133 unsigned long vm_total_pages; /* The total number of pages which the VM controls */
134 134
135 static LIST_HEAD(shrinker_list); 135 static LIST_HEAD(shrinker_list);
136 static DECLARE_RWSEM(shrinker_rwsem); 136 static DECLARE_RWSEM(shrinker_rwsem);
137 137
138 #ifdef CONFIG_MEMCG 138 #ifdef CONFIG_MEMCG
139 static bool global_reclaim(struct scan_control *sc) 139 static bool global_reclaim(struct scan_control *sc)
140 { 140 {
141 return !sc->target_mem_cgroup; 141 return !sc->target_mem_cgroup;
142 } 142 }
143 #else 143 #else
144 static bool global_reclaim(struct scan_control *sc) 144 static bool global_reclaim(struct scan_control *sc)
145 { 145 {
146 return true; 146 return true;
147 } 147 }
148 #endif 148 #endif
149 149
150 static unsigned long zone_reclaimable_pages(struct zone *zone) 150 static unsigned long zone_reclaimable_pages(struct zone *zone)
151 { 151 {
152 int nr; 152 int nr;
153 153
154 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 154 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
155 zone_page_state(zone, NR_INACTIVE_FILE); 155 zone_page_state(zone, NR_INACTIVE_FILE);
156 156
157 if (get_nr_swap_pages() > 0) 157 if (get_nr_swap_pages() > 0)
158 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 158 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
159 zone_page_state(zone, NR_INACTIVE_ANON); 159 zone_page_state(zone, NR_INACTIVE_ANON);
160 160
161 return nr; 161 return nr;
162 } 162 }
163 163
164 bool zone_reclaimable(struct zone *zone) 164 bool zone_reclaimable(struct zone *zone)
165 { 165 {
166 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 166 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
167 } 167 }
168 168
169 static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 169 static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
170 { 170 {
171 if (!mem_cgroup_disabled()) 171 if (!mem_cgroup_disabled())
172 return mem_cgroup_get_lru_size(lruvec, lru); 172 return mem_cgroup_get_lru_size(lruvec, lru);
173 173
174 return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); 174 return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
175 } 175 }
176 176
177 /* 177 /*
178 * Add a shrinker callback to be called from the vm. 178 * Add a shrinker callback to be called from the vm.
179 */ 179 */
180 int register_shrinker(struct shrinker *shrinker) 180 int register_shrinker(struct shrinker *shrinker)
181 { 181 {
182 size_t size = sizeof(*shrinker->nr_deferred); 182 size_t size = sizeof(*shrinker->nr_deferred);
183 183
184 /* 184 /*
185 * If we only have one possible node in the system anyway, save 185 * If we only have one possible node in the system anyway, save
186 * ourselves the trouble and disable NUMA aware behavior. This way we 186 * ourselves the trouble and disable NUMA aware behavior. This way we
187 * will save memory and some small loop time later. 187 * will save memory and some small loop time later.
188 */ 188 */
189 if (nr_node_ids == 1) 189 if (nr_node_ids == 1)
190 shrinker->flags &= ~SHRINKER_NUMA_AWARE; 190 shrinker->flags &= ~SHRINKER_NUMA_AWARE;
191 191
192 if (shrinker->flags & SHRINKER_NUMA_AWARE) 192 if (shrinker->flags & SHRINKER_NUMA_AWARE)
193 size *= nr_node_ids; 193 size *= nr_node_ids;
194 194
195 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); 195 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
196 if (!shrinker->nr_deferred) 196 if (!shrinker->nr_deferred)
197 return -ENOMEM; 197 return -ENOMEM;
198 198
199 down_write(&shrinker_rwsem); 199 down_write(&shrinker_rwsem);
200 list_add_tail(&shrinker->list, &shrinker_list); 200 list_add_tail(&shrinker->list, &shrinker_list);
201 up_write(&shrinker_rwsem); 201 up_write(&shrinker_rwsem);
202 return 0; 202 return 0;
203 } 203 }
204 EXPORT_SYMBOL(register_shrinker); 204 EXPORT_SYMBOL(register_shrinker);
205 205
206 /* 206 /*
207 * Remove one 207 * Remove one
208 */ 208 */
209 void unregister_shrinker(struct shrinker *shrinker) 209 void unregister_shrinker(struct shrinker *shrinker)
210 { 210 {
211 down_write(&shrinker_rwsem); 211 down_write(&shrinker_rwsem);
212 list_del(&shrinker->list); 212 list_del(&shrinker->list);
213 up_write(&shrinker_rwsem); 213 up_write(&shrinker_rwsem);
214 kfree(shrinker->nr_deferred); 214 kfree(shrinker->nr_deferred);
215 } 215 }
216 EXPORT_SYMBOL(unregister_shrinker); 216 EXPORT_SYMBOL(unregister_shrinker);
217 217
218 #define SHRINK_BATCH 128 218 #define SHRINK_BATCH 128
219 219
220 static unsigned long 220 static unsigned long
221 shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, 221 shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
222 unsigned long nr_pages_scanned, unsigned long lru_pages) 222 unsigned long nr_pages_scanned, unsigned long lru_pages)
223 { 223 {
224 unsigned long freed = 0; 224 unsigned long freed = 0;
225 unsigned long long delta; 225 unsigned long long delta;
226 long total_scan; 226 long total_scan;
227 long freeable; 227 long freeable;
228 long nr; 228 long nr;
229 long new_nr; 229 long new_nr;
230 int nid = shrinkctl->nid; 230 int nid = shrinkctl->nid;
231 long batch_size = shrinker->batch ? shrinker->batch 231 long batch_size = shrinker->batch ? shrinker->batch
232 : SHRINK_BATCH; 232 : SHRINK_BATCH;
233 233
234 freeable = shrinker->count_objects(shrinker, shrinkctl); 234 freeable = shrinker->count_objects(shrinker, shrinkctl);
235 if (freeable == 0) 235 if (freeable == 0)
236 return 0; 236 return 0;
237 237
238 /* 238 /*
239 * copy the current shrinker scan count into a local variable 239 * copy the current shrinker scan count into a local variable
240 * and zero it so that other concurrent shrinker invocations 240 * and zero it so that other concurrent shrinker invocations
241 * don't also do this scanning work. 241 * don't also do this scanning work.
242 */ 242 */
243 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); 243 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
244 244
245 total_scan = nr; 245 total_scan = nr;
246 delta = (4 * nr_pages_scanned) / shrinker->seeks; 246 delta = (4 * nr_pages_scanned) / shrinker->seeks;
247 delta *= freeable; 247 delta *= freeable;
248 do_div(delta, lru_pages + 1); 248 do_div(delta, lru_pages + 1);
249 total_scan += delta; 249 total_scan += delta;
250 if (total_scan < 0) { 250 if (total_scan < 0) {
251 printk(KERN_ERR 251 printk(KERN_ERR
252 "shrink_slab: %pF negative objects to delete nr=%ld\n", 252 "shrink_slab: %pF negative objects to delete nr=%ld\n",
253 shrinker->scan_objects, total_scan); 253 shrinker->scan_objects, total_scan);
254 total_scan = freeable; 254 total_scan = freeable;
255 } 255 }
256 256
257 /* 257 /*
258 * We need to avoid excessive windup on filesystem shrinkers 258 * We need to avoid excessive windup on filesystem shrinkers
259 * due to large numbers of GFP_NOFS allocations causing the 259 * due to large numbers of GFP_NOFS allocations causing the
260 * shrinkers to return -1 all the time. This results in a large 260 * shrinkers to return -1 all the time. This results in a large
261 * nr being built up so when a shrink that can do some work 261 * nr being built up so when a shrink that can do some work
262 * comes along it empties the entire cache due to nr >>> 262 * comes along it empties the entire cache due to nr >>>
263 * freeable. This is bad for sustaining a working set in 263 * freeable. This is bad for sustaining a working set in
264 * memory. 264 * memory.
265 * 265 *
266 * Hence only allow the shrinker to scan the entire cache when 266 * Hence only allow the shrinker to scan the entire cache when
267 * a large delta change is calculated directly. 267 * a large delta change is calculated directly.
268 */ 268 */
269 if (delta < freeable / 4) 269 if (delta < freeable / 4)
270 total_scan = min(total_scan, freeable / 2); 270 total_scan = min(total_scan, freeable / 2);
271 271
272 /* 272 /*
273 * Avoid risking looping forever due to too large nr value: 273 * Avoid risking looping forever due to too large nr value:
274 * never try to free more than twice the estimate number of 274 * never try to free more than twice the estimate number of
275 * freeable entries. 275 * freeable entries.
276 */ 276 */
277 if (total_scan > freeable * 2) 277 if (total_scan > freeable * 2)
278 total_scan = freeable * 2; 278 total_scan = freeable * 2;
279 279
280 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, 280 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
281 nr_pages_scanned, lru_pages, 281 nr_pages_scanned, lru_pages,
282 freeable, delta, total_scan); 282 freeable, delta, total_scan);
283 283
284 /* 284 /*
285 * Normally, we should not scan less than batch_size objects in one 285 * Normally, we should not scan less than batch_size objects in one
286 * pass to avoid too frequent shrinker calls, but if the slab has less 286 * pass to avoid too frequent shrinker calls, but if the slab has less
287 * than batch_size objects in total and we are really tight on memory, 287 * than batch_size objects in total and we are really tight on memory,
288 * we will try to reclaim all available objects, otherwise we can end 288 * we will try to reclaim all available objects, otherwise we can end
289 * up failing allocations although there are plenty of reclaimable 289 * up failing allocations although there are plenty of reclaimable
290 * objects spread over several slabs with usage less than the 290 * objects spread over several slabs with usage less than the
291 * batch_size. 291 * batch_size.
292 * 292 *
293 * We detect the "tight on memory" situations by looking at the total 293 * We detect the "tight on memory" situations by looking at the total
294 * number of objects we want to scan (total_scan). If it is greater 294 * number of objects we want to scan (total_scan). If it is greater
295 * than the total number of objects on slab (freeable), we must be 295 * than the total number of objects on slab (freeable), we must be
296 * scanning at high prio and therefore should try to reclaim as much as 296 * scanning at high prio and therefore should try to reclaim as much as
297 * possible. 297 * possible.
298 */ 298 */
299 while (total_scan >= batch_size || 299 while (total_scan >= batch_size ||
300 total_scan >= freeable) { 300 total_scan >= freeable) {
301 unsigned long ret; 301 unsigned long ret;
302 unsigned long nr_to_scan = min(batch_size, total_scan); 302 unsigned long nr_to_scan = min(batch_size, total_scan);
303 303
304 shrinkctl->nr_to_scan = nr_to_scan; 304 shrinkctl->nr_to_scan = nr_to_scan;
305 ret = shrinker->scan_objects(shrinker, shrinkctl); 305 ret = shrinker->scan_objects(shrinker, shrinkctl);
306 if (ret == SHRINK_STOP) 306 if (ret == SHRINK_STOP)
307 break; 307 break;
308 freed += ret; 308 freed += ret;
309 309
310 count_vm_events(SLABS_SCANNED, nr_to_scan); 310 count_vm_events(SLABS_SCANNED, nr_to_scan);
311 total_scan -= nr_to_scan; 311 total_scan -= nr_to_scan;
312 312
313 cond_resched(); 313 cond_resched();
314 } 314 }
315 315
316 /* 316 /*
317 * move the unused scan count back into the shrinker in a 317 * move the unused scan count back into the shrinker in a
318 * manner that handles concurrent updates. If we exhausted the 318 * manner that handles concurrent updates. If we exhausted the
319 * scan, there is no need to do an update. 319 * scan, there is no need to do an update.
320 */ 320 */
321 if (total_scan > 0) 321 if (total_scan > 0)
322 new_nr = atomic_long_add_return(total_scan, 322 new_nr = atomic_long_add_return(total_scan,
323 &shrinker->nr_deferred[nid]); 323 &shrinker->nr_deferred[nid]);
324 else 324 else
325 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); 325 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
326 326
327 trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); 327 trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
328 return freed; 328 return freed;
329 } 329 }
330 330
331 /* 331 /*
332 * Call the shrink functions to age shrinkable caches 332 * Call the shrink functions to age shrinkable caches
333 * 333 *
334 * Here we assume it costs one seek to replace a lru page and that it also 334 * Here we assume it costs one seek to replace a lru page and that it also
335 * takes a seek to recreate a cache object. With this in mind we age equal 335 * takes a seek to recreate a cache object. With this in mind we age equal
336 * percentages of the lru and ageable caches. This should balance the seeks 336 * percentages of the lru and ageable caches. This should balance the seeks
337 * generated by these structures. 337 * generated by these structures.
338 * 338 *
339 * If the vm encountered mapped pages on the LRU it increase the pressure on 339 * If the vm encountered mapped pages on the LRU it increase the pressure on
340 * slab to avoid swapping. 340 * slab to avoid swapping.
341 * 341 *
342 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 342 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
343 * 343 *
344 * `lru_pages' represents the number of on-LRU pages in all the zones which 344 * `lru_pages' represents the number of on-LRU pages in all the zones which
345 * are eligible for the caller's allocation attempt. It is used for balancing 345 * are eligible for the caller's allocation attempt. It is used for balancing
346 * slab reclaim versus page reclaim. 346 * slab reclaim versus page reclaim.
347 * 347 *
348 * Returns the number of slab objects which we shrunk. 348 * Returns the number of slab objects which we shrunk.
349 */ 349 */
350 unsigned long shrink_slab(struct shrink_control *shrinkctl, 350 unsigned long shrink_slab(struct shrink_control *shrinkctl,
351 unsigned long nr_pages_scanned, 351 unsigned long nr_pages_scanned,
352 unsigned long lru_pages) 352 unsigned long lru_pages)
353 { 353 {
354 struct shrinker *shrinker; 354 struct shrinker *shrinker;
355 unsigned long freed = 0; 355 unsigned long freed = 0;
356 356
357 if (nr_pages_scanned == 0) 357 if (nr_pages_scanned == 0)
358 nr_pages_scanned = SWAP_CLUSTER_MAX; 358 nr_pages_scanned = SWAP_CLUSTER_MAX;
359 359
360 if (!down_read_trylock(&shrinker_rwsem)) { 360 if (!down_read_trylock(&shrinker_rwsem)) {
361 /* 361 /*
362 * If we would return 0, our callers would understand that we 362 * If we would return 0, our callers would understand that we
363 * have nothing else to shrink and give up trying. By returning 363 * have nothing else to shrink and give up trying. By returning
364 * 1 we keep it going and assume we'll be able to shrink next 364 * 1 we keep it going and assume we'll be able to shrink next
365 * time. 365 * time.
366 */ 366 */
367 freed = 1; 367 freed = 1;
368 goto out; 368 goto out;
369 } 369 }
370 370
371 list_for_each_entry(shrinker, &shrinker_list, list) { 371 list_for_each_entry(shrinker, &shrinker_list, list) {
372 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { 372 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
373 shrinkctl->nid = 0; 373 shrinkctl->nid = 0;
374 freed += shrink_slab_node(shrinkctl, shrinker, 374 freed += shrink_slab_node(shrinkctl, shrinker,
375 nr_pages_scanned, lru_pages); 375 nr_pages_scanned, lru_pages);
376 continue; 376 continue;
377 } 377 }
378 378
379 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { 379 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
380 if (node_online(shrinkctl->nid)) 380 if (node_online(shrinkctl->nid))
381 freed += shrink_slab_node(shrinkctl, shrinker, 381 freed += shrink_slab_node(shrinkctl, shrinker,
382 nr_pages_scanned, lru_pages); 382 nr_pages_scanned, lru_pages);
383 383
384 } 384 }
385 } 385 }
386 up_read(&shrinker_rwsem); 386 up_read(&shrinker_rwsem);
387 out: 387 out:
388 cond_resched(); 388 cond_resched();
389 return freed; 389 return freed;
390 } 390 }
391 391
392 static inline int is_page_cache_freeable(struct page *page) 392 static inline int is_page_cache_freeable(struct page *page)
393 { 393 {
394 /* 394 /*
395 * A freeable page cache page is referenced only by the caller 395 * A freeable page cache page is referenced only by the caller
396 * that isolated the page, the page cache radix tree and 396 * that isolated the page, the page cache radix tree and
397 * optional buffer heads at page->private. 397 * optional buffer heads at page->private.
398 */ 398 */
399 return page_count(page) - page_has_private(page) == 2; 399 return page_count(page) - page_has_private(page) == 2;
400 } 400 }
401 401
402 static int may_write_to_queue(struct backing_dev_info *bdi, 402 static int may_write_to_queue(struct backing_dev_info *bdi,
403 struct scan_control *sc) 403 struct scan_control *sc)
404 { 404 {
405 if (current->flags & PF_SWAPWRITE) 405 if (current->flags & PF_SWAPWRITE)
406 return 1; 406 return 1;
407 if (!bdi_write_congested(bdi)) 407 if (!bdi_write_congested(bdi))
408 return 1; 408 return 1;
409 if (bdi == current->backing_dev_info) 409 if (bdi == current->backing_dev_info)
410 return 1; 410 return 1;
411 return 0; 411 return 0;
412 } 412 }
413 413
414 /* 414 /*
415 * We detected a synchronous write error writing a page out. Probably 415 * We detected a synchronous write error writing a page out. Probably
416 * -ENOSPC. We need to propagate that into the address_space for a subsequent 416 * -ENOSPC. We need to propagate that into the address_space for a subsequent
417 * fsync(), msync() or close(). 417 * fsync(), msync() or close().
418 * 418 *
419 * The tricky part is that after writepage we cannot touch the mapping: nothing 419 * The tricky part is that after writepage we cannot touch the mapping: nothing
420 * prevents it from being freed up. But we have a ref on the page and once 420 * prevents it from being freed up. But we have a ref on the page and once
421 * that page is locked, the mapping is pinned. 421 * that page is locked, the mapping is pinned.
422 * 422 *
423 * We're allowed to run sleeping lock_page() here because we know the caller has 423 * We're allowed to run sleeping lock_page() here because we know the caller has
424 * __GFP_FS. 424 * __GFP_FS.
425 */ 425 */
426 static void handle_write_error(struct address_space *mapping, 426 static void handle_write_error(struct address_space *mapping,
427 struct page *page, int error) 427 struct page *page, int error)
428 { 428 {
429 lock_page(page); 429 lock_page(page);
430 if (page_mapping(page) == mapping) 430 if (page_mapping(page) == mapping)
431 mapping_set_error(mapping, error); 431 mapping_set_error(mapping, error);
432 unlock_page(page); 432 unlock_page(page);
433 } 433 }
434 434
435 /* possible outcome of pageout() */ 435 /* possible outcome of pageout() */
436 typedef enum { 436 typedef enum {
437 /* failed to write page out, page is locked */ 437 /* failed to write page out, page is locked */
438 PAGE_KEEP, 438 PAGE_KEEP,
439 /* move page to the active list, page is locked */ 439 /* move page to the active list, page is locked */
440 PAGE_ACTIVATE, 440 PAGE_ACTIVATE,
441 /* page has been sent to the disk successfully, page is unlocked */ 441 /* page has been sent to the disk successfully, page is unlocked */
442 PAGE_SUCCESS, 442 PAGE_SUCCESS,
443 /* page is clean and locked */ 443 /* page is clean and locked */
444 PAGE_CLEAN, 444 PAGE_CLEAN,
445 } pageout_t; 445 } pageout_t;
446 446
447 /* 447 /*
448 * pageout is called by shrink_page_list() for each dirty page. 448 * pageout is called by shrink_page_list() for each dirty page.
449 * Calls ->writepage(). 449 * Calls ->writepage().
450 */ 450 */
451 static pageout_t pageout(struct page *page, struct address_space *mapping, 451 static pageout_t pageout(struct page *page, struct address_space *mapping,
452 struct scan_control *sc) 452 struct scan_control *sc)
453 { 453 {
454 /* 454 /*
455 * If the page is dirty, only perform writeback if that write 455 * If the page is dirty, only perform writeback if that write
456 * will be non-blocking. To prevent this allocation from being 456 * will be non-blocking. To prevent this allocation from being
457 * stalled by pagecache activity. But note that there may be 457 * stalled by pagecache activity. But note that there may be
458 * stalls if we need to run get_block(). We could test 458 * stalls if we need to run get_block(). We could test
459 * PagePrivate for that. 459 * PagePrivate for that.
460 * 460 *
461 * If this process is currently in __generic_file_aio_write() against 461 * If this process is currently in __generic_file_aio_write() against
462 * this page's queue, we can perform writeback even if that 462 * this page's queue, we can perform writeback even if that
463 * will block. 463 * will block.
464 * 464 *
465 * If the page is swapcache, write it back even if that would 465 * If the page is swapcache, write it back even if that would
466 * block, for some throttling. This happens by accident, because 466 * block, for some throttling. This happens by accident, because
467 * swap_backing_dev_info is bust: it doesn't reflect the 467 * swap_backing_dev_info is bust: it doesn't reflect the
468 * congestion state of the swapdevs. Easy to fix, if needed. 468 * congestion state of the swapdevs. Easy to fix, if needed.
469 */ 469 */
470 if (!is_page_cache_freeable(page)) 470 if (!is_page_cache_freeable(page))
471 return PAGE_KEEP; 471 return PAGE_KEEP;
472 if (!mapping) { 472 if (!mapping) {
473 /* 473 /*
474 * Some data journaling orphaned pages can have 474 * Some data journaling orphaned pages can have
475 * page->mapping == NULL while being dirty with clean buffers. 475 * page->mapping == NULL while being dirty with clean buffers.
476 */ 476 */
477 if (page_has_private(page)) { 477 if (page_has_private(page)) {
478 if (try_to_free_buffers(page)) { 478 if (try_to_free_buffers(page)) {
479 ClearPageDirty(page); 479 ClearPageDirty(page);
480 printk("%s: orphaned page\n", __func__); 480 printk("%s: orphaned page\n", __func__);
481 return PAGE_CLEAN; 481 return PAGE_CLEAN;
482 } 482 }
483 } 483 }
484 return PAGE_KEEP; 484 return PAGE_KEEP;
485 } 485 }
486 if (mapping->a_ops->writepage == NULL) 486 if (mapping->a_ops->writepage == NULL)
487 return PAGE_ACTIVATE; 487 return PAGE_ACTIVATE;
488 if (!may_write_to_queue(mapping->backing_dev_info, sc)) 488 if (!may_write_to_queue(mapping->backing_dev_info, sc))
489 return PAGE_KEEP; 489 return PAGE_KEEP;
490 490
491 if (clear_page_dirty_for_io(page)) { 491 if (clear_page_dirty_for_io(page)) {
492 int res; 492 int res;
493 struct writeback_control wbc = { 493 struct writeback_control wbc = {
494 .sync_mode = WB_SYNC_NONE, 494 .sync_mode = WB_SYNC_NONE,
495 .nr_to_write = SWAP_CLUSTER_MAX, 495 .nr_to_write = SWAP_CLUSTER_MAX,
496 .range_start = 0, 496 .range_start = 0,
497 .range_end = LLONG_MAX, 497 .range_end = LLONG_MAX,
498 .for_reclaim = 1, 498 .for_reclaim = 1,
499 }; 499 };
500 500
501 SetPageReclaim(page); 501 SetPageReclaim(page);
502 res = mapping->a_ops->writepage(page, &wbc); 502 res = mapping->a_ops->writepage(page, &wbc);
503 if (res < 0) 503 if (res < 0)
504 handle_write_error(mapping, page, res); 504 handle_write_error(mapping, page, res);
505 if (res == AOP_WRITEPAGE_ACTIVATE) { 505 if (res == AOP_WRITEPAGE_ACTIVATE) {
506 ClearPageReclaim(page); 506 ClearPageReclaim(page);
507 return PAGE_ACTIVATE; 507 return PAGE_ACTIVATE;
508 } 508 }
509 509
510 if (!PageWriteback(page)) { 510 if (!PageWriteback(page)) {
511 /* synchronous write or broken a_ops? */ 511 /* synchronous write or broken a_ops? */
512 ClearPageReclaim(page); 512 ClearPageReclaim(page);
513 } 513 }
514 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); 514 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
515 inc_zone_page_state(page, NR_VMSCAN_WRITE); 515 inc_zone_page_state(page, NR_VMSCAN_WRITE);
516 return PAGE_SUCCESS; 516 return PAGE_SUCCESS;
517 } 517 }
518 518
519 return PAGE_CLEAN; 519 return PAGE_CLEAN;
520 } 520 }
521 521
522 /* 522 /*
523 * Same as remove_mapping, but if the page is removed from the mapping, it 523 * Same as remove_mapping, but if the page is removed from the mapping, it
524 * gets returned with a refcount of 0. 524 * gets returned with a refcount of 0.
525 */ 525 */
526 static int __remove_mapping(struct address_space *mapping, struct page *page) 526 static int __remove_mapping(struct address_space *mapping, struct page *page)
527 { 527 {
528 BUG_ON(!PageLocked(page)); 528 BUG_ON(!PageLocked(page));
529 BUG_ON(mapping != page_mapping(page)); 529 BUG_ON(mapping != page_mapping(page));
530 530
531 spin_lock_irq(&mapping->tree_lock); 531 spin_lock_irq(&mapping->tree_lock);
532 /* 532 /*
533 * The non racy check for a busy page. 533 * The non racy check for a busy page.
534 * 534 *
535 * Must be careful with the order of the tests. When someone has 535 * Must be careful with the order of the tests. When someone has
536 * a ref to the page, it may be possible that they dirty it then 536 * a ref to the page, it may be possible that they dirty it then
537 * drop the reference. So if PageDirty is tested before page_count 537 * drop the reference. So if PageDirty is tested before page_count
538 * here, then the following race may occur: 538 * here, then the following race may occur:
539 * 539 *
540 * get_user_pages(&page); 540 * get_user_pages(&page);
541 * [user mapping goes away] 541 * [user mapping goes away]
542 * write_to(page); 542 * write_to(page);
543 * !PageDirty(page) [good] 543 * !PageDirty(page) [good]
544 * SetPageDirty(page); 544 * SetPageDirty(page);
545 * put_page(page); 545 * put_page(page);
546 * !page_count(page) [good, discard it] 546 * !page_count(page) [good, discard it]
547 * 547 *
548 * [oops, our write_to data is lost] 548 * [oops, our write_to data is lost]
549 * 549 *
550 * Reversing the order of the tests ensures such a situation cannot 550 * Reversing the order of the tests ensures such a situation cannot
551 * escape unnoticed. The smp_rmb is needed to ensure the page->flags 551 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
552 * load is not satisfied before that of page->_count. 552 * load is not satisfied before that of page->_count.
553 * 553 *
554 * Note that if SetPageDirty is always performed via set_page_dirty, 554 * Note that if SetPageDirty is always performed via set_page_dirty,
555 * and thus under tree_lock, then this ordering is not required. 555 * and thus under tree_lock, then this ordering is not required.
556 */ 556 */
557 if (!page_freeze_refs(page, 2)) 557 if (!page_freeze_refs(page, 2))
558 goto cannot_free; 558 goto cannot_free;
559 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ 559 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
560 if (unlikely(PageDirty(page))) { 560 if (unlikely(PageDirty(page))) {
561 page_unfreeze_refs(page, 2); 561 page_unfreeze_refs(page, 2);
562 goto cannot_free; 562 goto cannot_free;
563 } 563 }
564 564
565 if (PageSwapCache(page)) { 565 if (PageSwapCache(page)) {
566 swp_entry_t swap = { .val = page_private(page) }; 566 swp_entry_t swap = { .val = page_private(page) };
567 __delete_from_swap_cache(page); 567 __delete_from_swap_cache(page);
568 spin_unlock_irq(&mapping->tree_lock); 568 spin_unlock_irq(&mapping->tree_lock);
569 swapcache_free(swap, page); 569 swapcache_free(swap, page);
570 } else { 570 } else {
571 void (*freepage)(struct page *); 571 void (*freepage)(struct page *);
572 572
573 freepage = mapping->a_ops->freepage; 573 freepage = mapping->a_ops->freepage;
574 574
575 __delete_from_page_cache(page); 575 __delete_from_page_cache(page);
576 spin_unlock_irq(&mapping->tree_lock); 576 spin_unlock_irq(&mapping->tree_lock);
577 mem_cgroup_uncharge_cache_page(page); 577 mem_cgroup_uncharge_cache_page(page);
578 578
579 if (freepage != NULL) 579 if (freepage != NULL)
580 freepage(page); 580 freepage(page);
581 } 581 }
582 582
583 return 1; 583 return 1;
584 584
585 cannot_free: 585 cannot_free:
586 spin_unlock_irq(&mapping->tree_lock); 586 spin_unlock_irq(&mapping->tree_lock);
587 return 0; 587 return 0;
588 } 588 }
589 589
590 /* 590 /*
591 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 591 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
592 * someone else has a ref on the page, abort and return 0. If it was 592 * someone else has a ref on the page, abort and return 0. If it was
593 * successfully detached, return 1. Assumes the caller has a single ref on 593 * successfully detached, return 1. Assumes the caller has a single ref on
594 * this page. 594 * this page.
595 */ 595 */
596 int remove_mapping(struct address_space *mapping, struct page *page) 596 int remove_mapping(struct address_space *mapping, struct page *page)
597 { 597 {
598 if (__remove_mapping(mapping, page)) { 598 if (__remove_mapping(mapping, page)) {
599 /* 599 /*
600 * Unfreezing the refcount with 1 rather than 2 effectively 600 * Unfreezing the refcount with 1 rather than 2 effectively
601 * drops the pagecache ref for us without requiring another 601 * drops the pagecache ref for us without requiring another
602 * atomic operation. 602 * atomic operation.
603 */ 603 */
604 page_unfreeze_refs(page, 1); 604 page_unfreeze_refs(page, 1);
605 return 1; 605 return 1;
606 } 606 }
607 return 0; 607 return 0;
608 } 608 }
609 609
610 /** 610 /**
611 * putback_lru_page - put previously isolated page onto appropriate LRU list 611 * putback_lru_page - put previously isolated page onto appropriate LRU list
612 * @page: page to be put back to appropriate lru list 612 * @page: page to be put back to appropriate lru list
613 * 613 *
614 * Add previously isolated @page to appropriate LRU list. 614 * Add previously isolated @page to appropriate LRU list.
615 * Page may still be unevictable for other reasons. 615 * Page may still be unevictable for other reasons.
616 * 616 *
617 * lru_lock must not be held, interrupts must be enabled. 617 * lru_lock must not be held, interrupts must be enabled.
618 */ 618 */
619 void putback_lru_page(struct page *page) 619 void putback_lru_page(struct page *page)
620 { 620 {
621 bool is_unevictable; 621 bool is_unevictable;
622 int was_unevictable = PageUnevictable(page); 622 int was_unevictable = PageUnevictable(page);
623 623
624 VM_BUG_ON(PageLRU(page)); 624 VM_BUG_ON(PageLRU(page));
625 625
626 redo: 626 redo:
627 ClearPageUnevictable(page); 627 ClearPageUnevictable(page);
628 628
629 if (page_evictable(page)) { 629 if (page_evictable(page)) {
630 /* 630 /*
631 * For evictable pages, we can use the cache. 631 * For evictable pages, we can use the cache.
632 * In event of a race, worst case is we end up with an 632 * In event of a race, worst case is we end up with an
633 * unevictable page on [in]active list. 633 * unevictable page on [in]active list.
634 * We know how to handle that. 634 * We know how to handle that.
635 */ 635 */
636 is_unevictable = false; 636 is_unevictable = false;
637 lru_cache_add(page); 637 lru_cache_add(page);
638 } else { 638 } else {
639 /* 639 /*
640 * Put unevictable pages directly on zone's unevictable 640 * Put unevictable pages directly on zone's unevictable
641 * list. 641 * list.
642 */ 642 */
643 is_unevictable = true; 643 is_unevictable = true;
644 add_page_to_unevictable_list(page); 644 add_page_to_unevictable_list(page);
645 /* 645 /*
646 * When racing with an mlock or AS_UNEVICTABLE clearing 646 * When racing with an mlock or AS_UNEVICTABLE clearing
647 * (page is unlocked) make sure that if the other thread 647 * (page is unlocked) make sure that if the other thread
648 * does not observe our setting of PG_lru and fails 648 * does not observe our setting of PG_lru and fails
649 * isolation/check_move_unevictable_pages, 649 * isolation/check_move_unevictable_pages,
650 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move 650 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
651 * the page back to the evictable list. 651 * the page back to the evictable list.
652 * 652 *
653 * The other side is TestClearPageMlocked() or shmem_lock(). 653 * The other side is TestClearPageMlocked() or shmem_lock().
654 */ 654 */
655 smp_mb(); 655 smp_mb();
656 } 656 }
657 657
658 /* 658 /*
659 * page's status can change while we move it among lru. If an evictable 659 * page's status can change while we move it among lru. If an evictable
660 * page is on unevictable list, it never be freed. To avoid that, 660 * page is on unevictable list, it never be freed. To avoid that,
661 * check after we added it to the list, again. 661 * check after we added it to the list, again.
662 */ 662 */
663 if (is_unevictable && page_evictable(page)) { 663 if (is_unevictable && page_evictable(page)) {
664 if (!isolate_lru_page(page)) { 664 if (!isolate_lru_page(page)) {
665 put_page(page); 665 put_page(page);
666 goto redo; 666 goto redo;
667 } 667 }
668 /* This means someone else dropped this page from LRU 668 /* This means someone else dropped this page from LRU
669 * So, it will be freed or putback to LRU again. There is 669 * So, it will be freed or putback to LRU again. There is
670 * nothing to do here. 670 * nothing to do here.
671 */ 671 */
672 } 672 }
673 673
674 if (was_unevictable && !is_unevictable) 674 if (was_unevictable && !is_unevictable)
675 count_vm_event(UNEVICTABLE_PGRESCUED); 675 count_vm_event(UNEVICTABLE_PGRESCUED);
676 else if (!was_unevictable && is_unevictable) 676 else if (!was_unevictable && is_unevictable)
677 count_vm_event(UNEVICTABLE_PGCULLED); 677 count_vm_event(UNEVICTABLE_PGCULLED);
678 678
679 put_page(page); /* drop ref from isolate */ 679 put_page(page); /* drop ref from isolate */
680 } 680 }
681 681
682 enum page_references { 682 enum page_references {
683 PAGEREF_RECLAIM, 683 PAGEREF_RECLAIM,
684 PAGEREF_RECLAIM_CLEAN, 684 PAGEREF_RECLAIM_CLEAN,
685 PAGEREF_KEEP, 685 PAGEREF_KEEP,
686 PAGEREF_ACTIVATE, 686 PAGEREF_ACTIVATE,
687 }; 687 };
688 688
689 static enum page_references page_check_references(struct page *page, 689 static enum page_references page_check_references(struct page *page,
690 struct scan_control *sc) 690 struct scan_control *sc)
691 { 691 {
692 int referenced_ptes, referenced_page; 692 int referenced_ptes, referenced_page;
693 unsigned long vm_flags; 693 unsigned long vm_flags;
694 694
695 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, 695 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
696 &vm_flags); 696 &vm_flags);
697 referenced_page = TestClearPageReferenced(page); 697 referenced_page = TestClearPageReferenced(page);
698 698
699 /* 699 /*
700 * Mlock lost the isolation race with us. Let try_to_unmap() 700 * Mlock lost the isolation race with us. Let try_to_unmap()
701 * move the page to the unevictable list. 701 * move the page to the unevictable list.
702 */ 702 */
703 if (vm_flags & VM_LOCKED) 703 if (vm_flags & VM_LOCKED)
704 return PAGEREF_RECLAIM; 704 return PAGEREF_RECLAIM;
705 705
706 if (referenced_ptes) { 706 if (referenced_ptes) {
707 if (PageSwapBacked(page)) 707 if (PageSwapBacked(page))
708 return PAGEREF_ACTIVATE; 708 return PAGEREF_ACTIVATE;
709 /* 709 /*
710 * All mapped pages start out with page table 710 * All mapped pages start out with page table
711 * references from the instantiating fault, so we need 711 * references from the instantiating fault, so we need
712 * to look twice if a mapped file page is used more 712 * to look twice if a mapped file page is used more
713 * than once. 713 * than once.
714 * 714 *
715 * Mark it and spare it for another trip around the 715 * Mark it and spare it for another trip around the
716 * inactive list. Another page table reference will 716 * inactive list. Another page table reference will
717 * lead to its activation. 717 * lead to its activation.
718 * 718 *
719 * Note: the mark is set for activated pages as well 719 * Note: the mark is set for activated pages as well
720 * so that recently deactivated but used pages are 720 * so that recently deactivated but used pages are
721 * quickly recovered. 721 * quickly recovered.
722 */ 722 */
723 SetPageReferenced(page); 723 SetPageReferenced(page);
724 724
725 if (referenced_page || referenced_ptes > 1) 725 if (referenced_page || referenced_ptes > 1)
726 return PAGEREF_ACTIVATE; 726 return PAGEREF_ACTIVATE;
727 727
728 /* 728 /*
729 * Activate file-backed executable pages after first usage. 729 * Activate file-backed executable pages after first usage.
730 */ 730 */
731 if (vm_flags & VM_EXEC) 731 if (vm_flags & VM_EXEC)
732 return PAGEREF_ACTIVATE; 732 return PAGEREF_ACTIVATE;
733 733
734 return PAGEREF_KEEP; 734 return PAGEREF_KEEP;
735 } 735 }
736 736
737 /* Reclaim if clean, defer dirty pages to writeback */ 737 /* Reclaim if clean, defer dirty pages to writeback */
738 if (referenced_page && !PageSwapBacked(page)) 738 if (referenced_page && !PageSwapBacked(page))
739 return PAGEREF_RECLAIM_CLEAN; 739 return PAGEREF_RECLAIM_CLEAN;
740 740
741 return PAGEREF_RECLAIM; 741 return PAGEREF_RECLAIM;
742 } 742 }
743 743
744 /* Check if a page is dirty or under writeback */ 744 /* Check if a page is dirty or under writeback */
745 static void page_check_dirty_writeback(struct page *page, 745 static void page_check_dirty_writeback(struct page *page,
746 bool *dirty, bool *writeback) 746 bool *dirty, bool *writeback)
747 { 747 {
748 struct address_space *mapping; 748 struct address_space *mapping;
749 749
750 /* 750 /*
751 * Anonymous pages are not handled by flushers and must be written 751 * Anonymous pages are not handled by flushers and must be written
752 * from reclaim context. Do not stall reclaim based on them 752 * from reclaim context. Do not stall reclaim based on them
753 */ 753 */
754 if (!page_is_file_cache(page)) { 754 if (!page_is_file_cache(page)) {
755 *dirty = false; 755 *dirty = false;
756 *writeback = false; 756 *writeback = false;
757 return; 757 return;
758 } 758 }
759 759
760 /* By default assume that the page flags are accurate */ 760 /* By default assume that the page flags are accurate */
761 *dirty = PageDirty(page); 761 *dirty = PageDirty(page);
762 *writeback = PageWriteback(page); 762 *writeback = PageWriteback(page);
763 763
764 /* Verify dirty/writeback state if the filesystem supports it */ 764 /* Verify dirty/writeback state if the filesystem supports it */
765 if (!page_has_private(page)) 765 if (!page_has_private(page))
766 return; 766 return;
767 767
768 mapping = page_mapping(page); 768 mapping = page_mapping(page);
769 if (mapping && mapping->a_ops->is_dirty_writeback) 769 if (mapping && mapping->a_ops->is_dirty_writeback)
770 mapping->a_ops->is_dirty_writeback(page, dirty, writeback); 770 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
771 } 771 }
772 772
773 /* 773 /*
774 * shrink_page_list() returns the number of reclaimed pages 774 * shrink_page_list() returns the number of reclaimed pages
775 */ 775 */
776 static unsigned long shrink_page_list(struct list_head *page_list, 776 static unsigned long shrink_page_list(struct list_head *page_list,
777 struct zone *zone, 777 struct zone *zone,
778 struct scan_control *sc, 778 struct scan_control *sc,
779 enum ttu_flags ttu_flags, 779 enum ttu_flags ttu_flags,
780 unsigned long *ret_nr_dirty, 780 unsigned long *ret_nr_dirty,
781 unsigned long *ret_nr_unqueued_dirty, 781 unsigned long *ret_nr_unqueued_dirty,
782 unsigned long *ret_nr_congested, 782 unsigned long *ret_nr_congested,
783 unsigned long *ret_nr_writeback, 783 unsigned long *ret_nr_writeback,
784 unsigned long *ret_nr_immediate, 784 unsigned long *ret_nr_immediate,
785 bool force_reclaim) 785 bool force_reclaim)
786 { 786 {
787 LIST_HEAD(ret_pages); 787 LIST_HEAD(ret_pages);
788 LIST_HEAD(free_pages); 788 LIST_HEAD(free_pages);
789 int pgactivate = 0; 789 int pgactivate = 0;
790 unsigned long nr_unqueued_dirty = 0; 790 unsigned long nr_unqueued_dirty = 0;
791 unsigned long nr_dirty = 0; 791 unsigned long nr_dirty = 0;
792 unsigned long nr_congested = 0; 792 unsigned long nr_congested = 0;
793 unsigned long nr_reclaimed = 0; 793 unsigned long nr_reclaimed = 0;
794 unsigned long nr_writeback = 0; 794 unsigned long nr_writeback = 0;
795 unsigned long nr_immediate = 0; 795 unsigned long nr_immediate = 0;
796 796
797 cond_resched(); 797 cond_resched();
798 798
799 mem_cgroup_uncharge_start(); 799 mem_cgroup_uncharge_start();
800 while (!list_empty(page_list)) { 800 while (!list_empty(page_list)) {
801 struct address_space *mapping; 801 struct address_space *mapping;
802 struct page *page; 802 struct page *page;
803 int may_enter_fs; 803 int may_enter_fs;
804 enum page_references references = PAGEREF_RECLAIM_CLEAN; 804 enum page_references references = PAGEREF_RECLAIM_CLEAN;
805 bool dirty, writeback; 805 bool dirty, writeback;
806 806
807 cond_resched(); 807 cond_resched();
808 808
809 page = lru_to_page(page_list); 809 page = lru_to_page(page_list);
810 list_del(&page->lru); 810 list_del(&page->lru);
811 811
812 if (!trylock_page(page)) 812 if (!trylock_page(page))
813 goto keep; 813 goto keep;
814 814
815 VM_BUG_ON(PageActive(page)); 815 VM_BUG_ON(PageActive(page));
816 VM_BUG_ON(page_zone(page) != zone); 816 VM_BUG_ON(page_zone(page) != zone);
817 817
818 sc->nr_scanned++; 818 sc->nr_scanned++;
819 819
820 if (unlikely(!page_evictable(page))) 820 if (unlikely(!page_evictable(page)))
821 goto cull_mlocked; 821 goto cull_mlocked;
822 822
823 if (!sc->may_unmap && page_mapped(page)) 823 if (!sc->may_unmap && page_mapped(page))
824 goto keep_locked; 824 goto keep_locked;
825 825
826 /* Double the slab pressure for mapped and swapcache pages */ 826 /* Double the slab pressure for mapped and swapcache pages */
827 if (page_mapped(page) || PageSwapCache(page)) 827 if (page_mapped(page) || PageSwapCache(page))
828 sc->nr_scanned++; 828 sc->nr_scanned++;
829 829
830 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 830 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
831 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 831 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
832 832
833 /* 833 /*
834 * The number of dirty pages determines if a zone is marked 834 * The number of dirty pages determines if a zone is marked
835 * reclaim_congested which affects wait_iff_congested. kswapd 835 * reclaim_congested which affects wait_iff_congested. kswapd
836 * will stall and start writing pages if the tail of the LRU 836 * will stall and start writing pages if the tail of the LRU
837 * is all dirty unqueued pages. 837 * is all dirty unqueued pages.
838 */ 838 */
839 page_check_dirty_writeback(page, &dirty, &writeback); 839 page_check_dirty_writeback(page, &dirty, &writeback);
840 if (dirty || writeback) 840 if (dirty || writeback)
841 nr_dirty++; 841 nr_dirty++;
842 842
		if (dirty && !writeback)
			nr_unqueued_dirty++;

		/*
		 * Treat this page as congested if the underlying BDI is or if
		 * pages are cycling through the LRU so quickly that the
		 * pages marked for immediate reclaim are making it to the
		 * end of the LRU a second time.
		 */
		mapping = page_mapping(page);
		if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
		    (writeback && PageReclaim(page)))
			nr_congested++;

		/*
		 * If a page at the tail of the LRU is under writeback, there
		 * are three cases to consider.
		 *
		 * 1) If reclaim is encountering an excessive number of pages
		 *    under writeback and this page is both under writeback and
		 *    PageReclaim then it indicates that pages are being queued
		 *    for IO but are being recycled through the LRU before the
		 *    IO can complete. Waiting on the page itself risks an
		 *    indefinite stall if it is impossible to writeback the
		 *    page due to IO error or disconnected storage so instead
		 *    note that the LRU is being scanned too quickly and the
		 *    caller can stall after page list has been processed.
		 *
		 * 2) Global reclaim encounters a page, memcg encounters a
		 *    page that is not marked for immediate reclaim or
		 *    the caller does not have __GFP_IO. In this case mark
		 *    the page for immediate reclaim and continue scanning.
		 *
		 *    __GFP_IO is checked because a loop driver thread might
		 *    enter reclaim, and deadlock if it waits on a page for
		 *    which it is needed to do the write (loop masks off
		 *    __GFP_IO|__GFP_FS for this reason); but more thought
		 *    would probably show more reasons.
		 *
		 *    Don't require __GFP_FS, since we're not going into the
		 *    FS, just waiting on its writeback completion. Worryingly,
		 *    ext4 gfs2 and xfs allocate pages with
		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
		 *    may_enter_fs here is liable to OOM on them.
		 *
		 * 3) memcg encounters a page that is not already marked
		 *    PageReclaim. memcg does not have any dirty pages
		 *    throttling so we could easily OOM just because too many
		 *    pages are in writeback and there is nothing else to
		 *    reclaim. Wait for the writeback to complete.
		 */
		if (PageWriteback(page)) {
			/* Case 1 above */
			if (current_is_kswapd() &&
			    PageReclaim(page) &&
			    zone_is_reclaim_writeback(zone)) {
				nr_immediate++;
				goto keep_locked;

			/* Case 2 above */
			} else if (global_reclaim(sc) ||
			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
				/*
				 * This is slightly racy - end_page_writeback()
				 * might have just cleared PageReclaim, then
				 * setting PageReclaim here ends up interpreted
				 * as PageReadahead - but that does not matter
				 * enough to care. What we do want is for this
				 * page to have PageReclaim set next time memcg
				 * reclaim reaches the tests above, so it will
				 * then wait_on_page_writeback() to avoid OOM;
				 * and it's also appropriate in global reclaim.
				 */
				SetPageReclaim(page);
				nr_writeback++;

				goto keep_locked;

			/* Case 3 above */
			} else {
				wait_on_page_writeback(page);
			}
		}

		if (!force_reclaim)
			references = page_check_references(page, sc);

		switch (references) {
		case PAGEREF_ACTIVATE:
			goto activate_locked;
		case PAGEREF_KEEP:
			goto keep_locked;
		case PAGEREF_RECLAIM:
		case PAGEREF_RECLAIM_CLEAN:
			; /* try to reclaim the page below */
		}

		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
		if (PageAnon(page) && !PageSwapCache(page)) {
			if (!(sc->gfp_mask & __GFP_IO))
				goto keep_locked;
			if (!add_to_swap(page, page_list))
				goto activate_locked;
			may_enter_fs = 1;

			/* Adding to swap updated mapping */
			mapping = page_mapping(page);
		}

		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, ttu_flags)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_MLOCK:
				goto cull_mlocked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			/*
			 * Only kswapd can writeback filesystem pages to
			 * avoid risk of stack overflow but only writeback
			 * if many dirty pages have been encountered.
			 */
			if (page_is_file_cache(page) &&
					(!current_is_kswapd() ||
					 !zone_is_reclaim_dirty(zone))) {
				/*
				 * Immediately reclaim when written back.
				 * Similar in principle to deactivate_page()
				 * except we already have the page isolated
				 * and know it's dirty
				 */
				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
				SetPageReclaim(page);

				goto keep_locked;
			}

			if (references == PAGEREF_RECLAIM_CLEAN)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch (pageout(page, mapping, sc)) {
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page))
					goto keep;
				if (PageDirty(page))
					goto keep;

				/*
				 * A synchronous write - probably a ramdisk. Go
				 * ahead and try to reclaim the page.
				 */
				if (!trylock_page(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is actually
		 * clean (all its buffers are clean). This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping.
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping. These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page(). We try to drop those buffers here
		 * and if that worked, and the page is no longer mapped into
		 * process address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
		if (page_has_private(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1) {
				unlock_page(page);
				if (put_page_testzero(page))
					goto free_it;
				else {
					/*
					 * rare race with speculative reference.
					 * the speculative reference will free
					 * this page shortly, so we may
					 * increment nr_reclaimed here (and
					 * leave it off the LRU).
					 */
					nr_reclaimed++;
					continue;
				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

		/*
		 * At this point, we have no other references and there is
		 * no way to pick any more up (removed from LRU, removed
		 * from pagecache). Can use non-atomic bitops now (and
		 * we obviously don't have to worry about waking up a process
		 * waiting on the page lock, because there are no references.
		 */
		__clear_page_locked(page);
free_it:
		nr_reclaimed++;

		/*
		 * Is there need to periodically free_page_list? It would
		 * appear not as the counts should be low
		 */
		list_add(&page->lru, &free_pages);
		continue;

cull_mlocked:
		if (PageSwapCache(page))
			try_to_free_swap(page);
		unlock_page(page);
		putback_lru_page(page);
		continue;

activate_locked:
		/* Not a candidate for swapping, so reclaim swap space. */
		if (PageSwapCache(page) && vm_swap_full())
			try_to_free_swap(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
	}

	free_hot_cold_page_list(&free_pages, 1);

	list_splice(&ret_pages, page_list);
	count_vm_events(PGACTIVATE, pgactivate);
	mem_cgroup_uncharge_end();
	*ret_nr_dirty += nr_dirty;
	*ret_nr_congested += nr_congested;
	*ret_nr_unqueued_dirty += nr_unqueued_dirty;
	*ret_nr_writeback += nr_writeback;
	*ret_nr_immediate += nr_immediate;
	return nr_reclaimed;
}

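/*
 * A rough map of how shrink_page_list()'s out-parameters are consumed by
 * shrink_inactive_list() further down (summary only, no new logic):
 *
 *   nr_dirty/nr_congested  - if every dirty page scanned was also backed by
 *                            a congested BDI, the zone is tagged
 *                            ZONE_CONGESTED
 *   nr_unqueued_dirty      - if it matches nr_taken, the zone is tagged
 *                            ZONE_TAIL_LRU_DIRTY so kswapd starts writing
 *                            pages from reclaim context
 *   nr_writeback           - if it matches nr_taken, the zone is tagged
 *                            ZONE_WRITEBACK
 *   nr_immediate           - pages that were both under writeback and
 *                            PageReclaim; any such page makes the global
 *                            reclaimer stall briefly in congestion_wait()
 */
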
unsigned long reclaim_clean_pages_from_list(struct zone *zone,
					    struct list_head *page_list)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.priority = DEF_PRIORITY,
		.may_unmap = 1,
	};
	unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
	struct page *page, *next;
	LIST_HEAD(clean_pages);

	list_for_each_entry_safe(page, next, page_list, lru) {
		if (page_is_file_cache(page) && !PageDirty(page) &&
		    !isolated_balloon_page(page)) {
			ClearPageActive(page);
			list_move(&page->lru, &clean_pages);
		}
	}

	ret = shrink_page_list(&clean_pages, zone, &sc,
			TTU_UNMAP|TTU_IGNORE_ACCESS,
			&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
	list_splice(&clean_pages, page_list);
	mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
	return ret;
}

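/*
 * Note on reclaim_clean_pages_from_list(): only clean file pages (and no
 * isolated balloon pages) are handed to shrink_page_list(), with
 * TTU_IGNORE_ACCESS and force_reclaim == true so recent references are
 * ignored. Callers wanting a specific page range freed without blocking on
 * I/O - the contiguous allocation path, for example - are the intended
 * users; dirty or writeback pages are simply left on page_list.
 */
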
/*
 * Attempt to remove the specified page from its LRU. Only take this page
 * if it is of the appropriate PageActive status. Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
	int ret = -EINVAL;

	/* Only take pages on the LRU. */
	if (!PageLRU(page))
		return ret;

	/* Compaction should not handle unevictable pages but CMA can do so */
	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
		return ret;

	ret = -EBUSY;

	/*
	 * To minimise LRU disruption, the caller can indicate that it only
	 * wants to isolate pages it will be able to operate on without
	 * blocking - clean pages for the most part.
	 *
	 * ISOLATE_CLEAN means that only clean pages should be isolated. This
	 * is used by reclaim when it cannot write to backing storage.
	 *
	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
	 * that are possible to migrate without blocking.
	 */
	if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
		/* All the caller can do on PageWriteback is block */
		if (PageWriteback(page))
			return ret;

		if (PageDirty(page)) {
			struct address_space *mapping;

			/* ISOLATE_CLEAN means only clean pages */
			if (mode & ISOLATE_CLEAN)
				return ret;

			/*
			 * Only pages without mappings or that have a
			 * ->migratepage callback are possible to migrate
			 * without blocking
			 */
			mapping = page_mapping(page);
			if (mapping && !mapping->a_ops->migratepage)
				return ret;
		}
	}

	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
		return ret;

	if (likely(get_page_unless_zero(page))) {
		/*
		 * Be careful not to clear PageLRU until after we're
		 * sure the page is not being freed elsewhere -- the
		 * page release code relies on it.
		 */
		ClearPageLRU(page);
		ret = 0;
	}

	return ret;
}

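/*
 * Example of how @mode is built by the callers below (shrink_inactive_list()
 * and shrink_active_list()): starting from 0, ISOLATE_UNMAPPED is added when
 * sc->may_unmap is clear and ISOLATE_CLEAN when sc->may_writepage is clear,
 * so a scan that may not write back pages never isolates dirty ones in the
 * first place.
 */
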
/*
 * zone->lru_lock is heavily contended. Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @mode:	One of the LRU isolation modes
 * @lru:	LRU list id for isolating
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct lruvec *lruvec, struct list_head *dst,
		unsigned long *nr_scanned, struct scan_control *sc,
		isolate_mode_t mode, enum lru_list lru)
{
	struct list_head *src = &lruvec->lists[lru];
	unsigned long nr_taken = 0;
	unsigned long scan;

	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		struct page *page;
		int nr_pages;

		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON(!PageLRU(page));

		switch (__isolate_lru_page(page, mode)) {
		case 0:
			nr_pages = hpage_nr_pages(page);
			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
			list_move(&page->lru, dst);
			nr_taken += nr_pages;
			break;

		case -EBUSY:
			/* else it is being freed elsewhere */
			list_move(&page->lru, src);
			continue;

		default:
			BUG();
		}
	}

	*nr_scanned = scan;
	trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
				    nr_taken, mode, is_file_lru(lru));
	return nr_taken;
}

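/*
 * Accounting detail in isolate_lru_pages() above: @scan counts list entries
 * while nr_taken counts base pages via hpage_nr_pages(), so isolating one
 * transparent huge page advances scan by 1 but nr_taken by HPAGE_PMD_NR
 * (512 with 2MB huge pages over 4KB base pages).
 */
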
/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared. If it was found on
 * the active list, it will have PageActive set. If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
int isolate_lru_page(struct page *page)
{
	int ret = -EBUSY;

	VM_BUG_ON(!page_count(page));

	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;

		spin_lock_irq(&zone->lru_lock);
		lruvec = mem_cgroup_page_lruvec(page, zone);
		if (PageLRU(page)) {
			int lru = page_lru(page);
			get_page(page);
			ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, lru);
			ret = 0;
		}
		spin_unlock_irq(&zone->lru_lock);
	}
	return ret;
}

/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there are massive numbers of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
static int too_many_isolated(struct zone *zone, int file,
			     struct scan_control *sc)
{
	unsigned long inactive, isolated;

	if (current_is_kswapd())
		return 0;

	if (!global_reclaim(sc))
		return 0;

	if (file) {
		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
	} else {
		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
	}

	/*
	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
	 * won't get blocked by normal direct-reclaimers, forming a circular
	 * deadlock.
	 */
	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
		inactive >>= 3;

	return isolated > inactive;
}

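/*
 * Illustrative numbers for the check above: with 80000 inactive file pages,
 * an ordinary GFP_KERNEL direct reclaimer carries both __GFP_IO and __GFP_FS,
 * so inactive is shifted down to 10000 and it is throttled once more than
 * 10000 pages sit isolated; a GFP_NOFS caller is only throttled past 80000,
 * which is the "allowed to isolate more pages" exception.
 */
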
static noinline_for_stack void
putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	struct zone *zone = lruvec_zone(lruvec);
	LIST_HEAD(pages_to_free);

	/*
	 * Put back any unfreeable pages.
	 */
	while (!list_empty(page_list)) {
		struct page *page = lru_to_page(page_list);
		int lru;

		VM_BUG_ON(PageLRU(page));
		list_del(&page->lru);
		if (unlikely(!page_evictable(page))) {
			spin_unlock_irq(&zone->lru_lock);
			putback_lru_page(page);
			spin_lock_irq(&zone->lru_lock);
			continue;
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);

		SetPageLRU(page);
		lru = page_lru(page);
		add_page_to_lru_list(page, lruvec, lru);

		if (is_active_lru(lru)) {
			int file = is_file_lru(lru);
			int numpages = hpage_nr_pages(page);
			reclaim_stat->recent_rotated[file] += numpages;
		}
		if (put_page_testzero(page)) {
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(page, lruvec, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&zone->lru_lock);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&zone->lru_lock);
			} else
				list_add(&page->lru, &pages_to_free);
		}
	}

	/*
	 * To save our caller's stack, now use input list for pages to free.
	 */
	list_splice(&pages_to_free, page_list);
}

/*
 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
 * of reclaimed pages
 */
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
		     struct scan_control *sc, enum lru_list lru)
{
	LIST_HEAD(page_list);
	unsigned long nr_scanned;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_taken;
	unsigned long nr_dirty = 0;
	unsigned long nr_congested = 0;
	unsigned long nr_unqueued_dirty = 0;
	unsigned long nr_writeback = 0;
	unsigned long nr_immediate = 0;
	isolate_mode_t isolate_mode = 0;
	int file = is_file_lru(lru);
	struct zone *zone = lruvec_zone(lruvec);
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	while (unlikely(too_many_isolated(zone, file, sc))) {
		congestion_wait(BLK_RW_ASYNC, HZ/10);

		/* We are about to die and free our memory. Return now. */
		if (fatal_signal_pending(current))
			return SWAP_CLUSTER_MAX;
	}

	lru_add_drain();

	if (!sc->may_unmap)
		isolate_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		isolate_mode |= ISOLATE_CLEAN;

	spin_lock_irq(&zone->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
				     &nr_scanned, sc, isolate_mode, lru);

	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);

	if (global_reclaim(sc)) {
		zone->pages_scanned += nr_scanned;
		if (current_is_kswapd())
			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
		else
			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
	}
	spin_unlock_irq(&zone->lru_lock);

	if (nr_taken == 0)
		return 0;

	nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
				&nr_dirty, &nr_unqueued_dirty, &nr_congested,
				&nr_writeback, &nr_immediate,
				false);

	spin_lock_irq(&zone->lru_lock);

	reclaim_stat->recent_scanned[file] += nr_taken;

	if (global_reclaim(sc)) {
		if (current_is_kswapd())
			__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
					       nr_reclaimed);
		else
			__count_zone_vm_events(PGSTEAL_DIRECT, zone,
					       nr_reclaimed);
	}

	putback_inactive_pages(lruvec, &page_list);

	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);

	spin_unlock_irq(&zone->lru_lock);

	free_hot_cold_page_list(&page_list, 1);

	/*
	 * If reclaim is isolating dirty pages under writeback, it implies
	 * that the long-lived page allocation rate is exceeding the page
	 * laundering rate. Either the global limits are not being effective
	 * at throttling processes due to the page distribution throughout
	 * zones or there is heavy usage of a slow backing device. The
	 * only option is to throttle from reclaim context which is not ideal
	 * as there is no guarantee the dirtying process is throttled in the
	 * same way balance_dirty_pages() manages.
	 *
	 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
	 * of pages under writeback that are flagged for immediate reclaim
	 * and stall if any are encountered in the nr_immediate check below.
	 */
	if (nr_writeback && nr_writeback == nr_taken)
		zone_set_flag(zone, ZONE_WRITEBACK);

	/*
	 * memcg will stall in page writeback so only consider forcibly
	 * stalling for global reclaim
	 */
	if (global_reclaim(sc)) {
		/*
		 * Tag a zone as congested if all the dirty pages scanned were
		 * backed by a congested BDI and wait_iff_congested will stall.
		 */
		if (nr_dirty && nr_dirty == nr_congested)
			zone_set_flag(zone, ZONE_CONGESTED);

		/*
		 * If dirty pages are scanned that are not queued for IO, it
		 * implies that flushers are not keeping up. In this case, flag
		 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
		 * pages from reclaim context.
		 */
		if (nr_unqueued_dirty == nr_taken)
			zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);

		/*
		 * If kswapd scans pages marked for immediate
		 * reclaim and under writeback (nr_immediate), it implies
		 * that pages are cycling through the LRU faster than
		 * they are written so also forcibly stall.
		 */
		if (nr_immediate)
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

	/*
	 * Stall direct reclaim for IO completions if underlying BDIs or zone
	 * is congested. Allow kswapd to continue until it starts encountering
	 * unqueued dirty pages or cycling through the LRU too quickly.
	 */
	if (!sc->hibernation_mode && !current_is_kswapd())
		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);

	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
		zone_idx(zone),
		nr_scanned, nr_reclaimed,
		sc->priority,
		trace_shrink_flags(file));
	return nr_reclaimed;
}

/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation. But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page. It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */

static void move_active_pages_to_lru(struct lruvec *lruvec,
				     struct list_head *list,
				     struct list_head *pages_to_free,
				     enum lru_list lru)
{
	struct zone *zone = lruvec_zone(lruvec);
	unsigned long pgmoved = 0;
	struct page *page;
	int nr_pages;

	while (!list_empty(list)) {
		page = lru_to_page(list);
		lruvec = mem_cgroup_page_lruvec(page, zone);

		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);

		nr_pages = hpage_nr_pages(page);
		mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
		list_move(&page->lru, &lruvec->lists[lru]);
		pgmoved += nr_pages;

		if (put_page_testzero(page)) {
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(page, lruvec, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&zone->lru_lock);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&zone->lru_lock);
			} else
				list_add(&page->lru, pages_to_free);
		}
	}
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
	if (!is_active_lru(lru))
		__count_vm_events(PGDEACTIVATE, pgmoved);
}

static void shrink_active_list(unsigned long nr_to_scan,
			       struct lruvec *lruvec,
			       struct scan_control *sc,
			       enum lru_list lru)
{
	unsigned long nr_taken;
	unsigned long nr_scanned;
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_active);
	LIST_HEAD(l_inactive);
	struct page *page;
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	unsigned long nr_rotated = 0;
	isolate_mode_t isolate_mode = 0;
	int file = is_file_lru(lru);
	struct zone *zone = lruvec_zone(lruvec);

	lru_add_drain();

	if (!sc->may_unmap)
		isolate_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		isolate_mode |= ISOLATE_CLEAN;

	spin_lock_irq(&zone->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
				     &nr_scanned, sc, isolate_mode, lru);
	if (global_reclaim(sc))
		zone->pages_scanned += nr_scanned;

	reclaim_stat->recent_scanned[file] += nr_taken;

	__count_zone_vm_events(PGREFILL, zone, nr_scanned);
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
	spin_unlock_irq(&zone->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page))) {
			putback_lru_page(page);
			continue;
		}

		if (unlikely(buffer_heads_over_limit)) {
			if (page_has_private(page) && trylock_page(page)) {
				if (page_has_private(page))
					try_to_release_page(page, 0);
				unlock_page(page);
			}
		}

		if (page_referenced(page, 0, sc->target_mem_cgroup,
				    &vm_flags)) {
			nr_rotated += hpage_nr_pages(page);
			/*
			 * Identify referenced, file-backed active pages and
			 * give them one more trip around the active list. So
			 * that executable code gets a better chance to stay in
			 * memory under moderate memory pressure. Anon pages
			 * are not likely to be evicted by use-once streaming
			 * IO, plus JVM can create lots of anon VM_EXEC pages,
			 * so we ignore them here.
			 */
			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
				list_add(&page->lru, &l_active);
				continue;
			}
		}

		ClearPageActive(page);	/* we are de-activating */
		list_add(&page->lru, &l_inactive);
	}

	/*
	 * Move pages back to the lru list.
	 */
	spin_lock_irq(&zone->lru_lock);
	/*
	 * Count referenced pages from currently used mappings as rotated,
	 * even though only some of them are actually re-activated. This
	 * helps balance scan pressure between file and anonymous pages in
	 * get_scan_ratio.
	 */
	reclaim_stat->recent_rotated[file] += nr_rotated;

	move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
	move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
	spin_unlock_irq(&zone->lru_lock);

	free_hot_cold_page_list(&l_hold, 1);
}

#ifdef CONFIG_SWAP
static int inactive_anon_is_low_global(struct zone *zone)
{
	unsigned long active, inactive;

	active = zone_page_state(zone, NR_ACTIVE_ANON);
	inactive = zone_page_state(zone, NR_INACTIVE_ANON);

	if (inactive * zone->inactive_ratio < active)
		return 1;

	return 0;
}

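/*
 * Worked example for inactive_anon_is_low_global(), using an illustrative
 * ratio: zone->inactive_ratio scales with zone size, so take 3 as a plausible
 * value. With 600000 active and 150000 inactive anon pages the test reads
 * 150000 * 3 = 450000 < 600000, so the function returns 1 and
 * shrink_active_list() will be asked to deactivate some anon pages.
 */
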
/**
 * inactive_anon_is_low - check if anonymous pages need to be deactivated
 * @lruvec: LRU vector to check
 *
 * Returns true if the zone does not have enough inactive anon pages,
 * meaning some active anon pages need to be deactivated.
 */
static int inactive_anon_is_low(struct lruvec *lruvec)
{
	/*
	 * If we don't have swap space, anonymous page deactivation
	 * is pointless.
	 */
	if (!total_swap_pages)
		return 0;

	if (!mem_cgroup_disabled())
		return mem_cgroup_inactive_anon_is_low(lruvec);

	return inactive_anon_is_low_global(lruvec_zone(lruvec));
}
#else
static inline int inactive_anon_is_low(struct lruvec *lruvec)
{
	return 0;
}
#endif

/**
 * inactive_file_is_low - check if file pages need to be deactivated
 * @lruvec: LRU vector to check
 *
 * When the system is doing streaming IO, memory pressure here
 * ensures that active file pages get deactivated, until more
 * than half of the file pages are on the inactive list.
 *
 * Once we get to that situation, protect the system's working
 * set from being evicted by disabling active file page aging.
 *
 * This uses a different ratio than the anonymous pages, because
 * the page cache uses a use-once replacement algorithm.
 */
static int inactive_file_is_low(struct lruvec *lruvec)
{
	unsigned long inactive;
	unsigned long active;

	inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
	active = get_lru_size(lruvec, LRU_ACTIVE_FILE);

	return active > inactive;
}

static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
{
	if (is_file_lru(lru))
		return inactive_file_is_low(lruvec);
	else
		return inactive_anon_is_low(lruvec);
}

static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
				 struct lruvec *lruvec, struct scan_control *sc)
{
	if (is_active_lru(lru)) {
		if (inactive_list_is_low(lruvec, lru))
			shrink_active_list(nr_to_scan, lruvec, sc, lru);
		return 0;
	}

	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}

static int vmscan_swappiness(struct scan_control *sc)
{
	if (global_reclaim(sc))
		return vm_swappiness;
	return mem_cgroup_swappiness(sc->target_mem_cgroup);
}

enum scan_balance {
	SCAN_EQUAL,
	SCAN_FRACT,
	SCAN_ANON,
	SCAN_FILE,
};

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned we did rotate back
 * onto the active list instead of evict.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
			   unsigned long *nr)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	u64 fraction[2];
	u64 denominator = 0;	/* gcc */
	struct zone *zone = lruvec_zone(lruvec);
	unsigned long anon_prio, file_prio;
	enum scan_balance scan_balance;
	unsigned long anon, file, free;
	bool force_scan = false;
	unsigned long ap, fp;
	enum lru_list lru;

	/*
	 * If the zone or memcg is small, nr[l] can be 0.  This
	 * results in no scanning on this priority and a potential
	 * priority drop.  Global direct reclaim can go to the next
	 * zone and tends to have no problems. Global kswapd is for
	 * zone balancing and it needs to scan a minimum amount. When
	 * reclaiming for a memcg, a priority drop can cause high
	 * latencies, so it's better to scan a minimum amount there as
	 * well.
	 */
	if (current_is_kswapd() && !zone_reclaimable(zone))
		force_scan = true;
	if (!global_reclaim(sc))
		force_scan = true;

	/* If we have no swap space, do not bother scanning anon pages. */
	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Global reclaim will swap to prevent OOM even with no
	 * swappiness, but memcg users want to use this knob to
	 * disable swapping for individual groups completely when
	 * using the memory controller's swap limit feature would be
	 * too expensive.
	 */
	if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Do not apply any pressure balancing cleverness when the
	 * system is close to OOM, scan both anon and file equally
	 * (unless the swappiness setting disagrees with swapping).
	 */
	if (!sc->priority && vmscan_swappiness(sc)) {
		scan_balance = SCAN_EQUAL;
		goto out;
	}

	anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
		get_lru_size(lruvec, LRU_INACTIVE_ANON);
	file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
		get_lru_size(lruvec, LRU_INACTIVE_FILE);

	/*
	 * If it's foreseeable that reclaiming the file cache won't be
	 * enough to get the zone back into a desirable shape, we have
	 * to swap.  Better start now and leave the - probably heavily
	 * thrashing - remaining file pages alone.
	 */
	if (global_reclaim(sc)) {
		free = zone_page_state(zone, NR_FREE_PAGES);
		if (unlikely(file + free <= high_wmark_pages(zone))) {
			scan_balance = SCAN_ANON;
			goto out;
		}
	}

	/*
	 * There is enough inactive page cache, do not reclaim
	 * anything from the anonymous working set right now.
	 */
	if (!inactive_file_is_low(lruvec)) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	scan_balance = SCAN_FRACT;

	/*
	 * With swappiness at 100, anonymous and file have the same priority.
	 * This scanning priority is essentially the inverse of IO cost.
	 */
	anon_prio = vmscan_swappiness(sc);
	file_prio = 200 - anon_prio;

	/*
	 * OK, so we have swap space and a fair amount of page cache
	 * pages.  We use the recently rotated / recently scanned
	 * ratios to determine how valuable each cache is.
	 *
	 * Because workloads change over time (and to avoid overflow)
	 * we keep these statistics as a floating average, which ends
	 * up weighing recent references more than old ones.
	 *
	 * anon in [0], file in [1]
	 */
	spin_lock_irq(&zone->lru_lock);
	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
		reclaim_stat->recent_scanned[0] /= 2;
		reclaim_stat->recent_rotated[0] /= 2;
	}

	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
		reclaim_stat->recent_scanned[1] /= 2;
		reclaim_stat->recent_rotated[1] /= 2;
	}

	/*
	 * The amount of pressure on anon vs file pages is inversely
	 * proportional to the fraction of recently scanned pages on
	 * each list that were recently referenced and in active use.
	 */
	ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
	ap /= reclaim_stat->recent_rotated[0] + 1;

	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
	fp /= reclaim_stat->recent_rotated[1] + 1;
	spin_unlock_irq(&zone->lru_lock);

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp + 1;
out:
	for_each_evictable_lru(lru) {
		int file = is_file_lru(lru);
		unsigned long size;
		unsigned long scan;

		size = get_lru_size(lruvec, lru);
		scan = size >> sc->priority;

		if (!scan && force_scan)
			scan = min(size, SWAP_CLUSTER_MAX);

		switch (scan_balance) {
		case SCAN_EQUAL:
			/* Scan lists relative to size */
			break;
		case SCAN_FRACT:
			/*
			 * Scan types proportional to swappiness and
			 * their relative recent reclaim efficiency.
			 */
			scan = div64_u64(scan * fraction[file], denominator);
			break;
		case SCAN_FILE:
		case SCAN_ANON:
			/* Scan one type exclusively */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
			break;
		default:
			/* Look ma, no brain */
			BUG();
		}
		nr[lru] = scan;
	}
}
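
To make the SCAN_FRACT arithmetic above concrete, here is a standalone userspace sketch of the split; the variable names mirror the kernel code, but every number is invented and the program is purely illustrative, not part of this patch or of vmscan.c.

/* Illustrative sketch of the SCAN_FRACT split in get_scan_count().
 * All figures are hypothetical; only the arithmetic mirrors the code above.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned long swappiness = 60;		/* e.g. default vm_swappiness */
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - anon_prio;

	/* Hypothetical recent_scanned/recent_rotated samples: [0]=anon, [1]=file */
	unsigned long recent_scanned[2] = { 10000, 40000 };
	unsigned long recent_rotated[2] = {  8000,  2000 };

	uint64_t ap = anon_prio * (recent_scanned[0] + 1) / (recent_rotated[0] + 1);
	uint64_t fp = file_prio * (recent_scanned[1] + 1) / (recent_rotated[1] + 1);
	uint64_t denominator = ap + fp + 1;

	/* A hypothetical LRU of 1M pages scanned at priority 12 */
	unsigned long size = 1UL << 20;
	unsigned long scan = size >> 12;

	printf("anon share: %llu of %lu pages\n",
	       (unsigned long long)(scan * ap / denominator), scan);
	printf("file share: %llu of %lu pages\n",
	       (unsigned long long)(scan * fp / denominator), scan);
	return 0;
}

With these made-up inputs the heavily-rotated anon list gets only a handful of the 256 scan slots while the mostly use-once file list gets nearly all of them, which is the intended effect of weighting by recent rotation.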

/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	unsigned long nr[NR_LRU_LISTS];
	unsigned long targets[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
	struct blk_plug plug;
-	bool scan_adjusted = false;
+	bool scan_adjusted;

	get_scan_count(lruvec, sc, nr);

	/* Record the original scan target for proportional adjustments later */
	memcpy(targets, nr, sizeof(nr));

+	/*
+	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+	 * event that can occur when there is little memory pressure e.g.
+	 * multiple streaming readers/writers. Hence, we do not abort scanning
+	 * when the requested number of pages are reclaimed when scanning at
+	 * DEF_PRIORITY on the assumption that the fact we are direct
+	 * reclaiming implies that kswapd is not keeping up and it is best to
+	 * do a batch of work at once. For memcg reclaim one check is made to
+	 * abort proportional reclaim if either the file or anon lru has already
+	 * dropped to zero at the first pass.
+	 */
+	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+			 sc->priority == DEF_PRIORITY);
+
	blk_start_plug(&plug);
	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
					nr[LRU_INACTIVE_FILE]) {
		unsigned long nr_anon, nr_file, percentage;
		unsigned long nr_scanned;

		for_each_evictable_lru(lru) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;

				nr_reclaimed += shrink_list(lru, nr_to_scan,
							    lruvec, sc);
			}
		}

		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
			continue;

		/*
-		 * For global direct reclaim, reclaim only the number of pages
-		 * requested. Less care is taken to scan proportionally as it
-		 * is more important to minimise direct reclaim stall latency
-		 * than it is to properly age the LRU lists.
-		 */
-		if (global_reclaim(sc) && !current_is_kswapd())
-			break;
-
-		/*
		 * For kswapd and memcg, reclaim at least the number of pages
-		 * requested. Ensure that the anon and file LRUs shrink
+		 * requested. Ensure that the anon and file LRUs are scanned
		 * proportionally what was requested by get_scan_count(). We
		 * stop reclaiming one LRU and reduce the amount scanning
		 * proportional to the original scan target.
		 */
		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+
+		/*
+		 * It's just vindictive to attack the larger once the smaller
+		 * has gone to zero.  And given the way we stop scanning the
+		 * smaller below, this makes sure that we only make one nudge
+		 * towards proportionality once we've got nr_to_reclaim.
+		 */
+		if (!nr_file || !nr_anon)
+			break;

		if (nr_file > nr_anon) {
			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
						targets[LRU_ACTIVE_ANON] + 1;
			lru = LRU_BASE;
			percentage = nr_anon * 100 / scan_target;
		} else {
			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
						targets[LRU_ACTIVE_FILE] + 1;
			lru = LRU_FILE;
			percentage = nr_file * 100 / scan_target;
		}

		/* Stop scanning the smaller of the LRU */
		nr[lru] = 0;
		nr[lru + LRU_ACTIVE] = 0;

		/*
		 * Recalculate the other LRU scan count based on its original
		 * scan target and the percentage scanning already complete
		 */
		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);

		lru += LRU_ACTIVE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);

		scan_adjusted = true;
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;

	/*
	 * Even if we did not try to evict anon pages at all, we want to
	 * rebalance the anon lru active/inactive ratio.
	 */
	if (inactive_anon_is_low(lruvec))
		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
				   sc, LRU_ACTIVE_ANON);

	throttle_vm_writeout(sc->gfp_mask);
}
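
The proportional trim at the heart of this patch boils down to a small piece of percentage arithmetic: once nr_to_reclaim has been met, the smaller LRU stops and the other LRU may only scan the same fraction of its original target. The following sketch walks through that arithmetic with invented numbers; it is an editor's illustration, not kernel code.

/* Illustrative sketch of the proportional trim performed by shrink_lruvec()
 * after nr_to_reclaim pages have been reclaimed.  All figures are invented.
 */
#include <stdio.h>

int main(void)
{
	/* Hypothetical original targets from get_scan_count(): [inactive, active] */
	unsigned long anon_target[2] = {  400,  100 };
	unsigned long file_target[2] = { 8000, 2000 };

	/* Hypothetical work still outstanding when nr_to_reclaim was hit */
	unsigned long anon_left[2] = {  300,   75 };
	unsigned long file_left[2] = { 6000, 1500 };

	unsigned long nr_anon = anon_left[0] + anon_left[1];
	unsigned long nr_file = file_left[0] + file_left[1];

	/* The smaller list (anon here) stops; its unfinished percentage caps
	 * how much of the file targets may still be scanned. */
	unsigned long scan_target = anon_target[0] + anon_target[1] + 1;
	unsigned long percentage = nr_anon * 100 / scan_target;

	for (int i = 0; i < 2; i++) {
		unsigned long nr_scanned = file_target[i] - file_left[i];
		unsigned long remaining = file_target[i] * (100 - percentage) / 100;
		remaining -= (remaining < nr_scanned) ? remaining : nr_scanned;
		printf("file[%d]: %lu of %lu still to scan\n",
		       i, remaining, file_target[i]);
	}
	printf("nr_file=%lu nr_anon=%lu percentage=%lu%%\n",
	       nr_file, nr_anon, percentage);
	return 0;
}

In this example roughly a quarter of the anon target was consumed before the reclaim goal was met, so the remaining file scan is cut back to the same proportion rather than being allowed to run to completion, which is what keeps the two lists aging at comparable rates.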

/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
			 sc->priority < DEF_PRIORITY - 2))
		return true;

	return false;
}

/*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when the page allocator
 * calls try_to_compact_zone() that it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
static inline bool should_continue_reclaim(struct zone *zone,
					unsigned long nr_reclaimed,
					unsigned long nr_scanned,
					struct scan_control *sc)
{
	unsigned long pages_for_compaction;
	unsigned long inactive_lru_pages;

	/* If not in reclaim/compaction mode, stop */
	if (!in_reclaim_compaction(sc))
		return false;

	/* Consider stopping depending on scan and reclaim activity */
	if (sc->gfp_mask & __GFP_REPEAT) {
		/*
		 * For __GFP_REPEAT allocations, stop reclaiming if the
		 * full LRU list has been scanned and we are still failing
		 * to reclaim pages. This full LRU scan is potentially
		 * expensive but a __GFP_REPEAT caller really wants to succeed
		 */
		if (!nr_reclaimed && !nr_scanned)
			return false;
	} else {
		/*
		 * For non-__GFP_REPEAT allocations which can presumably
		 * fail without consequence, stop if we failed to reclaim
		 * any pages from the last SWAP_CLUSTER_MAX number of
		 * pages that were scanned. This will return to the
		 * caller faster at the risk reclaim/compaction and
		 * the resulting allocation attempt fails
		 */
		if (!nr_reclaimed)
			return false;
	}

	/*
	 * If we have not reclaimed enough pages for compaction and the
	 * inactive lists are large enough, continue reclaiming
	 */
	pages_for_compaction = (2UL << sc->order);
	inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
	if (get_nr_swap_pages() > 0)
		inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
	if (sc->nr_reclaimed < pages_for_compaction &&
			inactive_lru_pages > pages_for_compaction)
		return true;

	/* If compaction would go ahead or the allocation would succeed, stop */
	switch (compaction_suitable(zone, sc->order)) {
	case COMPACT_PARTIAL:
	case COMPACT_CONTINUE:
		return false;
	default:
		return true;
	}
}
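
For a feel of the "enough pages for compaction" target used above, the sketch below prints the 2UL << order threshold for a few allocation orders; the example orders are assumptions chosen for illustration only.

/* Editor's sketch of the pages_for_compaction target.  The listed orders
 * (e.g. order-9 for a 2MB THP with 4K pages) are illustrative assumptions.
 */
#include <stdio.h>

int main(void)
{
	int orders[] = { 3, 4, 9 };

	for (unsigned i = 0; i < sizeof(orders) / sizeof(orders[0]); i++) {
		unsigned long pages_for_compaction = 2UL << orders[i];
		printf("order-%d: keep reclaiming until ~%lu pages (%lu KB at 4K/page)\n",
		       orders[i], pages_for_compaction, pages_for_compaction * 4);
	}
	return 0;
}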

static void shrink_zone(struct zone *zone, struct scan_control *sc)
{
	unsigned long nr_reclaimed, nr_scanned;

	do {
		struct mem_cgroup *root = sc->target_mem_cgroup;
		struct mem_cgroup_reclaim_cookie reclaim = {
			.zone = zone,
			.priority = sc->priority,
		};
		struct mem_cgroup *memcg;

		nr_reclaimed = sc->nr_reclaimed;
		nr_scanned = sc->nr_scanned;

		memcg = mem_cgroup_iter(root, NULL, &reclaim);
		do {
			struct lruvec *lruvec;

			lruvec = mem_cgroup_zone_lruvec(zone, memcg);

			shrink_lruvec(lruvec, sc);

			/*
			 * Direct reclaim and kswapd have to scan all memory
			 * cgroups to fulfill the overall scan target for the
			 * zone.
			 *
			 * Limit reclaim, on the other hand, only cares about
			 * nr_to_reclaim pages to be reclaimed and it will
			 * retry with decreasing priority if one round over the
			 * whole hierarchy is not sufficient.
			 */
			if (!global_reclaim(sc) &&
					sc->nr_reclaimed >= sc->nr_to_reclaim) {
				mem_cgroup_iter_break(root, memcg);
				break;
			}
			memcg = mem_cgroup_iter(root, memcg, &reclaim);
		} while (memcg);

		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
			   sc->nr_scanned - nr_scanned,
			   sc->nr_reclaimed - nr_reclaimed);

	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
					 sc->nr_scanned - nr_scanned, sc));
}

/* Returns true if compaction should go ahead for a high-order request */
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
	unsigned long balance_gap, watermark;
	bool watermark_ok;

	/* Do not consider compaction for orders reclaim is meant to satisfy */
	if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
		return false;

	/*
	 * Compaction takes time to run and there are potentially other
	 * callers using the pages just freed. Continue reclaiming until
	 * there is a buffer of free pages available to give compaction
	 * a reasonable chance of completing and allocating the page
	 */
	balance_gap = min(low_wmark_pages(zone),
		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
			KSWAPD_ZONE_BALANCE_GAP_RATIO);
	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);

	/*
	 * If compaction is deferred, reclaim up to a point where
	 * compaction will have a chance of success when re-enabled
	 */
	if (compaction_deferred(zone, sc->order))
		return watermark_ok;

	/* If compaction is not ready to start, keep reclaiming */
	if (!compaction_suitable(zone, sc->order))
		return false;

	return watermark_ok;
}
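
The watermark that compaction_ready() tests can be followed with plain integer arithmetic. The sketch below reproduces it in userspace with invented zone figures; KSWAPD_ZONE_BALANCE_GAP_RATIO is hard-coded here as an assumption and every number is hypothetical.

/* Editor's sketch of the compaction_ready() watermark arithmetic.
 * The ratio constant and all zone figures are assumptions for illustration.
 */
#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO	100	/* assumed value */

int main(void)
{
	unsigned long managed_pages = 1UL << 18;	/* ~1GB of 4K pages, hypothetical */
	unsigned long low_wmark = 2048;			/* hypothetical */
	unsigned long high_wmark = 2560;		/* hypothetical */
	int order = 9;					/* THP-sized request */

	unsigned long ratio_gap = (managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
				  KSWAPD_ZONE_BALANCE_GAP_RATIO;
	unsigned long balance_gap = (ratio_gap < low_wmark) ? ratio_gap : low_wmark;
	unsigned long watermark = high_wmark + balance_gap + (2UL << order);

	printf("reclaim until roughly %lu free pages before handing over to compaction\n",
	       watermark);
	return 0;
}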

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 *
 * This function returns true if a zone is being reclaimed for a costly
 * high-order allocation and compaction is ready to begin. This indicates to
 * the caller that it should consider retrying the allocation instead of
 * further reclaim.
 */
static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	bool aborted_reclaim = false;

	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads
	 */
	if (buffer_heads_over_limit)
		sc->gfp_mask |= __GFP_HIGHMEM;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(sc->gfp_mask), sc->nodemask) {
		if (!populated_zone(zone))
			continue;
		/*
		 * Take care memory controller reclaiming has small influence
		 * to global LRU.
		 */
		if (global_reclaim(sc)) {
			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
				continue;
			if (sc->priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;	/* Let kswapd poll it */
			if (IS_ENABLED(CONFIG_COMPACTION)) {
				/*
				 * If we already have plenty of memory free for
				 * compaction in this zone, don't free any more.
				 * Even though compaction is invoked for any
				 * non-zero order, only frequent costly order
				 * reclamation is disruptive enough to become a
				 * noticeable problem, like transparent huge
				 * page allocations.
				 */
				if (compaction_ready(zone, sc)) {
					aborted_reclaim = true;
					continue;
				}
			}
			/*
			 * This steals pages from memory cgroups over softlimit
			 * and returns the number of reclaimed pages and
			 * scanned pages. This works for global memory pressure
			 * and balancing, not for a memcg's limit.
			 */
			nr_soft_scanned = 0;
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
						sc->order, sc->gfp_mask,
						&nr_soft_scanned);
			sc->nr_reclaimed += nr_soft_reclaimed;
			sc->nr_scanned += nr_soft_scanned;
			/* need some check for avoid more shrink_zone() */
		}

		shrink_zone(zone, sc);
	}

	return aborted_reclaim;
}

/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
		struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
			gfp_zone(sc->gfp_mask), sc->nodemask) {
		if (!populated_zone(zone))
			continue;
		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
			continue;
		if (zone_reclaimable(zone))
			return false;
	}

	return true;
}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					struct scan_control *sc,
					struct shrink_control *shrink)
{
	unsigned long total_scanned = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct zoneref *z;
	struct zone *zone;
	unsigned long writeback_threshold;
	bool aborted_reclaim;

	delayacct_freepages_start();

	if (global_reclaim(sc))
		count_vm_event(ALLOCSTALL);

	do {
		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
				sc->priority);
		sc->nr_scanned = 0;
		aborted_reclaim = shrink_zones(zonelist, sc);

		/*
		 * Don't shrink slabs when reclaiming memory from over limit
		 * cgroups but do shrink slab at least once when aborting
		 * reclaim for compaction to avoid unevenly scanning file/anon
		 * LRU pages over slab pages.
		 */
		if (global_reclaim(sc)) {
			unsigned long lru_pages = 0;

			nodes_clear(shrink->nodes_to_scan);
			for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(sc->gfp_mask), sc->nodemask) {
				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
					continue;

				lru_pages += zone_reclaimable_pages(zone);
				node_set(zone_to_nid(zone),
					 shrink->nodes_to_scan);
			}

			shrink_slab(shrink, sc->nr_scanned, lru_pages);
			if (reclaim_state) {
				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
				reclaim_state->reclaimed_slab = 0;
			}
		}
		total_scanned += sc->nr_scanned;
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			goto out;

		/*
		 * If we're getting trouble reclaiming, start doing
		 * writepage even in laptop mode.
		 */
		if (sc->priority < DEF_PRIORITY - 2)
			sc->may_writepage = 1;

		/*
		 * Try to write back as many pages as we just scanned. This
		 * tends to cause slow streaming writers to write data to the
		 * disk smoothly, at the dirtying rate, which is nice. But
		 * that's undesirable in laptop mode, where we *want* lumpy
		 * writeout.  So in laptop mode, write out the whole world.
		 */
		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
		if (total_scanned > writeback_threshold) {
			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
						WB_REASON_TRY_TO_FREE_PAGES);
			sc->may_writepage = 1;
		}
	} while (--sc->priority >= 0 && !aborted_reclaim);

out:
	delayacct_freepages_end();

	if (sc->nr_reclaimed)
		return sc->nr_reclaimed;

	/*
	 * As hibernation is going on, kswapd is freezed so that it can't mark
	 * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
	 * check.
	 */
	if (oom_killer_disabled)
		return 0;

	/* Aborted reclaim to try compaction? don't OOM, then */
	if (aborted_reclaim)
		return 1;

	/* top priority shrink_zones still had more to do? don't OOM, then */
	if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
		return 1;

	return 0;
}

static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
{
	struct zone *zone;
	unsigned long pfmemalloc_reserve = 0;
	unsigned long free_pages = 0;
	int i;
	bool wmark_ok;

	for (i = 0; i <= ZONE_NORMAL; i++) {
		zone = &pgdat->node_zones[i];
		if (!populated_zone(zone))
			continue;

		pfmemalloc_reserve += min_wmark_pages(zone);
		free_pages += zone_page_state(zone, NR_FREE_PAGES);
	}

	/* If there are no reserves (unexpected config) then do not throttle */
	if (!pfmemalloc_reserve)
		return true;

	wmark_ok = free_pages > pfmemalloc_reserve / 2;

	/* kswapd must be awake if processes are being throttled */
	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
		pgdat->classzone_idx = min(pgdat->classzone_idx,
						(enum zone_type)ZONE_NORMAL);
		wake_up_interruptible(&pgdat->kswapd_wait);
	}

	return wmark_ok;
}
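
The throttling decision above is simply "free pages across ZONE_NORMAL and below versus half of their summed min watermarks". A small userspace sketch of the same check, with invented per-zone figures, is shown below for illustration only.

/* Editor's sketch of the pfmemalloc_watermark_ok() check; all numbers invented. */
#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	/* Hypothetical figures for ZONE_DMA..ZONE_NORMAL */
	unsigned long min_wmark[] = { 128, 1024, 4096 };
	unsigned long nr_free[]   = { 200,  900, 1500 };
	unsigned long pfmemalloc_reserve = 0, free_pages = 0;

	for (unsigned i = 0; i < sizeof(min_wmark) / sizeof(min_wmark[0]); i++) {
		pfmemalloc_reserve += min_wmark[i];
		free_pages += nr_free[i];
	}

	bool wmark_ok = free_pages > pfmemalloc_reserve / 2;
	printf("free=%lu reserve=%lu -> %s\n", free_pages, pfmemalloc_reserve,
	       wmark_ok ? "do not throttle" : "throttle and wake kswapd");
	return 0;
}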

/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
					nodemask_t *nodemask)
{
	struct zoneref *z;
	struct zone *zone;
	pg_data_t *pgdat = NULL;

	/*
	 * Kernel threads should not be throttled as they may be indirectly
	 * responsible for cleaning pages necessary for reclaim to make forward
	 * progress. kjournald for example may enter direct reclaim while
	 * committing a transaction where throttling it could forcing other
	 * processes to block on log_wait_commit().
	 */
	if (current->flags & PF_KTHREAD)
		goto out;

	/*
	 * If a fatal signal is pending, this process should not throttle.
	 * It should return quickly so it can exit and free its memory
	 */
	if (fatal_signal_pending(current))
		goto out;

	/*
	 * Check if the pfmemalloc reserves are ok by finding the first node
	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
	 * GFP_KERNEL will be required for allocating network buffers when
	 * swapping over the network so ZONE_HIGHMEM is unusable.
	 *
	 * Throttling is based on the first usable node and throttled processes
	 * wait on a queue until kswapd makes progress and wakes them. There
	 * is an affinity then between processes waking up and where reclaim
	 * progress has been made assuming the process wakes on the same node.
	 * More importantly, processes running on remote nodes will not compete
	 * for remote pfmemalloc reserves and processes on different nodes
	 * should make reasonable progress.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_mask, nodemask) {
		if (zone_idx(zone) > ZONE_NORMAL)
			continue;

		/* Throttle based on the first usable node */
		pgdat = zone->zone_pgdat;
		if (pfmemalloc_watermark_ok(pgdat))
			goto out;
		break;
	}

	/* If no zone was usable by the allocation flags then do not throttle */
	if (!pgdat)
		goto out;

	/* Account for the throttling */
	count_vm_event(PGSCAN_DIRECT_THROTTLE);

	/*
	 * If the caller cannot enter the filesystem, it's possible that it
	 * is due to the caller holding an FS lock or performing a journal
	 * transaction in the case of a filesystem like ext[3|4]. In this case,
	 * it is not safe to block on pfmemalloc_wait as kswapd could be
	 * blocked waiting on the same lock. Instead, throttle for up to a
	 * second before continuing.
	 */
	if (!(gfp_mask & __GFP_FS)) {
		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
			pfmemalloc_watermark_ok(pgdat), HZ);

		goto check_pending;
	}

	/* Throttle until kswapd wakes the process */
	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
		pfmemalloc_watermark_ok(pgdat));

check_pending:
	if (fatal_signal_pending(current))
		return true;

out:
	return false;
}

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long nr_reclaimed;
	struct scan_control sc = {
		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.may_writepage = !laptop_mode,
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.may_unmap = 1,
		.may_swap = 1,
		.order = order,
		.priority = DEF_PRIORITY,
		.target_mem_cgroup = NULL,
		.nodemask = nodemask,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};

	/*
	 * Do not enter reclaim if fatal signal was delivered while throttled.
	 * 1 is returned so that the page allocator does not OOM kill at this
	 * point.
	 */
	if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
		return 1;

	trace_mm_vmscan_direct_reclaim_begin(order,
				sc.may_writepage,
				gfp_mask);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);

	return nr_reclaimed;
}
2657 2671
2658 #ifdef CONFIG_MEMCG 2672 #ifdef CONFIG_MEMCG
2659 2673
2660 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2674 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2661 gfp_t gfp_mask, bool noswap, 2675 gfp_t gfp_mask, bool noswap,
2662 struct zone *zone, 2676 struct zone *zone,
2663 unsigned long *nr_scanned) 2677 unsigned long *nr_scanned)
2664 { 2678 {
2665 struct scan_control sc = { 2679 struct scan_control sc = {
2666 .nr_scanned = 0, 2680 .nr_scanned = 0,
2667 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2681 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2668 .may_writepage = !laptop_mode, 2682 .may_writepage = !laptop_mode,
2669 .may_unmap = 1, 2683 .may_unmap = 1,
2670 .may_swap = !noswap, 2684 .may_swap = !noswap,
2671 .order = 0, 2685 .order = 0,
2672 .priority = 0, 2686 .priority = 0,
2673 .target_mem_cgroup = memcg, 2687 .target_mem_cgroup = memcg,
2674 }; 2688 };
2675 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2689 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2676 2690
2677 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2691 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2678 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2692 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2679 2693
2680 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, 2694 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2681 sc.may_writepage, 2695 sc.may_writepage,
2682 sc.gfp_mask); 2696 sc.gfp_mask);
2683 2697
        /*
         * NOTE: Although we can get the priority field, using it
         * here is not a good idea, since it limits the pages we can scan.
         * If we don't reclaim here, the shrink_zone from balance_pgdat
         * will pick up pages from other mem cgroups as well. We hack
         * the priority and make it zero.
         */
        shrink_lruvec(lruvec, &sc);

        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

        *nr_scanned = sc.nr_scanned;
        return sc.nr_reclaimed;
}

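For illustration only: the sc.gfp_mask assignment above keeps the reclaim-relevant bits of the caller's mask and takes everything else from GFP_HIGHUSER_MOVABLE. The standalone userspace sketch below mirrors that bit-combining pattern with made-up flag values; the FAKE_* constants are stand-ins, not the real GFP definitions.

#include <stdio.h>

/* Hypothetical stand-ins for the real GFP constants. */
#define FAKE_GFP_IO             (1u << 0)
#define FAKE_GFP_FS             (1u << 1)
#define FAKE_GFP_HIGHMEM        (1u << 2)
#define FAKE_GFP_MOVABLE        (1u << 3)
#define FAKE_RECLAIM_MASK       (FAKE_GFP_IO | FAKE_GFP_FS)
#define FAKE_HIGHUSER_MOVABLE   (FAKE_GFP_IO | FAKE_GFP_FS | \
                                 FAKE_GFP_HIGHMEM | FAKE_GFP_MOVABLE)

int main(void)
{
        unsigned int caller_gfp = FAKE_GFP_IO; /* caller allows IO but not FS */

        /* Reclaim bits come from the caller, the rest from the default mask. */
        unsigned int sc_gfp = (caller_gfp & FAKE_RECLAIM_MASK) |
                              (FAKE_HIGHUSER_MOVABLE & ~FAKE_RECLAIM_MASK);

        printf("sc.gfp_mask = %#x\n", sc_gfp); /* IO|HIGHMEM|MOVABLE, no FS */
        return 0;
}
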
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                           gfp_t gfp_mask,
                                           bool noswap)
{
        struct zonelist *zonelist;
        unsigned long nr_reclaimed;
        int nid;
        struct scan_control sc = {
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = !noswap,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .order = 0,
                .priority = DEF_PRIORITY,
                .target_mem_cgroup = memcg,
                .nodemask = NULL, /* we don't care about the placement */
                .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                                (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,
        };

        /*
         * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
         * care from which node we get pages, so the node where we start the
         * scan does not need to be the current node.
         */
        nid = mem_cgroup_select_victim_node(memcg);

        zonelist = NODE_DATA(nid)->node_zonelists;

        trace_mm_vmscan_memcg_reclaim_begin(0,
                                            sc.may_writepage,
                                            sc.gfp_mask);

        nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

        trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);

        return nr_reclaimed;
}
#endif

static void age_active_anon(struct zone *zone, struct scan_control *sc)
{
        struct mem_cgroup *memcg;

        if (!total_swap_pages)
                return;

        memcg = mem_cgroup_iter(NULL, NULL, NULL);
        do {
                struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

                if (inactive_anon_is_low(lruvec))
                        shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                           sc, LRU_ACTIVE_ANON);

                memcg = mem_cgroup_iter(NULL, memcg, NULL);
        } while (memcg);
}

static bool zone_balanced(struct zone *zone, int order,
                          unsigned long balance_gap, int classzone_idx)
{
        if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
                                    balance_gap, classzone_idx, 0))
                return false;

        if (IS_ENABLED(CONFIG_COMPACTION) && order &&
            !compaction_suitable(zone, order))
                return false;

        return true;
}

/*
 * pgdat_balanced() is used when checking if a node is balanced.
 *
 * For order-0, all zones must be balanced!
 *
 * For high-order allocations only zones that meet watermarks and are in a
 * zone allowed by the caller's classzone_idx are added to balanced_pages. The
 * total of balanced pages must be at least 25% of the zones allowed by
 * classzone_idx for the node to be considered balanced. Forcing all zones to
 * be balanced for high orders can cause excessive reclaim when there are
 * imbalanced zones.
 * The choice of 25% is due to
 *   o a 16M DMA zone that is balanced will not balance a zone on any
 *     reasonably sized machine
 *   o On all other machines, the top zone must be at least a reasonable
 *     percentage of the middle zones. For example, on 32-bit x86, highmem
 *     would need to be at least 256M for it to balance a whole node.
 *     Similarly, on x86-64 the Normal zone would need to be at least 1G
 *     to balance a node on its own. These seemed like reasonable ratios.
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
        unsigned long managed_pages = 0;
        unsigned long balanced_pages = 0;
        int i;

        /* Check the watermark levels */
        for (i = 0; i <= classzone_idx; i++) {
                struct zone *zone = pgdat->node_zones + i;

                if (!populated_zone(zone))
                        continue;

                managed_pages += zone->managed_pages;

                /*
                 * A special case here:
                 *
                 * balance_pgdat() skips over all_unreclaimable after
                 * DEF_PRIORITY. Effectively, it considers them balanced so
                 * they must be considered balanced here as well!
                 */
                if (!zone_reclaimable(zone)) {
                        balanced_pages += zone->managed_pages;
                        continue;
                }

                if (zone_balanced(zone, order, 0, i))
                        balanced_pages += zone->managed_pages;
                else if (!order)
                        return false;
        }

        if (order)
                return balanced_pages >= (managed_pages >> 2);
        else
                return true;
}

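To make the 25% test above concrete, here is a small standalone sketch (invented zone sizes, not data from any real machine) that applies the same balanced_pages >= managed_pages >> 2 check a node would see for a high-order request:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-zone data for one node: managed pages plus whether the
 * zone currently meets its high watermark (as zone_balanced() would report).
 */
struct fake_zone {
        const char *name;
        unsigned long managed_pages;
        bool balanced;
};

int main(void)
{
        struct fake_zone zones[] = {
                { "DMA",    4000,    true  },
                { "DMA32",  900000,  false },
                { "Normal", 3000000, true  },
        };
        unsigned long managed = 0, balanced = 0;

        for (unsigned int i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
                managed += zones[i].managed_pages;
                if (zones[i].balanced)
                        balanced += zones[i].managed_pages;
        }

        /* For order > 0: balanced if at least a quarter of the managed pages
         * sit in zones that meet their watermarks (managed >> 2 == 25%).
         */
        printf("node balanced for high order: %s\n",
               balanced >= (managed >> 2) ? "yes" : "no");
        return 0;
}
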
/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
                                        int classzone_idx)
{
        /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
        if (remaining)
                return false;

        /*
         * There is a potential race between when kswapd checks its watermarks
         * and a process gets throttled. There is also a potential race if
         * processes get throttled, kswapd wakes, a large process exits thereby
         * balancing the zones, which causes kswapd to miss a wakeup. If kswapd
         * is going to sleep, no process should be sleeping on pfmemalloc_wait
         * so wake them now if necessary. If necessary, processes will wake
         * kswapd and get throttled again
         */
        if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
                wake_up(&pgdat->pfmemalloc_wait);
                return false;
        }

        return pgdat_balanced(pgdat, order, classzone_idx);
}

/*
 * kswapd shrinks the zone by the number of pages required to reach
 * the high watermark.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
static bool kswapd_shrink_zone(struct zone *zone,
                               int classzone_idx,
                               struct scan_control *sc,
                               unsigned long lru_pages,
                               unsigned long *nr_attempted)
{
        int testorder = sc->order;
        unsigned long balance_gap;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        struct shrink_control shrink = {
                .gfp_mask = sc->gfp_mask,
        };
        bool lowmem_pressure;

        /* Reclaim above the high watermark. */
        sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));

        /*
         * Kswapd reclaims only single pages with compaction enabled. Trying
         * too hard to reclaim until contiguous free pages have become
         * available can hurt performance by evicting too much useful data
         * from memory. Do not reclaim more than needed for compaction.
         */
        if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
                        compaction_suitable(zone, sc->order) !=
                                COMPACT_SKIPPED)
                testorder = 0;

        /*
         * We put equal pressure on every zone, unless one zone has way too
         * many pages free already. The "too many pages" is defined as the
         * high wmark plus a "gap" where the gap is either the low
         * watermark or 1% of the zone, whichever is smaller.
         */
        balance_gap = min(low_wmark_pages(zone),
                (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
                KSWAPD_ZONE_BALANCE_GAP_RATIO);

        /*
         * If there is no low memory pressure or the zone is balanced then no
         * reclaim is necessary
         */
        lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
        if (!lowmem_pressure && zone_balanced(zone, testorder,
                                                balance_gap, classzone_idx))
                return true;

        shrink_zone(zone, sc);
        nodes_clear(shrink.nodes_to_scan);
        node_set(zone_to_nid(zone), shrink.nodes_to_scan);

        reclaim_state->reclaimed_slab = 0;
        shrink_slab(&shrink, sc->nr_scanned, lru_pages);
        sc->nr_reclaimed += reclaim_state->reclaimed_slab;

        /* Account for the number of pages we attempted to reclaim */
        *nr_attempted += sc->nr_to_reclaim;

        zone_clear_flag(zone, ZONE_WRITEBACK);

        /*
         * If a zone reaches its high watermark, consider it to be no longer
         * congested. It's possible there are dirty pages backed by congested
         * BDIs but as pressure is relieved, speculatively avoid congestion
         * waits.
         */
        if (zone_reclaimable(zone) &&
            zone_balanced(zone, testorder, 0, classzone_idx)) {
                zone_clear_flag(zone, ZONE_CONGESTED);
                zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
        }

        return sc->nr_scanned >= sc->nr_to_reclaim;
}

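As a rough illustration of the balance_gap calculation above, the sketch below uses invented numbers and assumes the ratio constant corresponds to the 1% described in the comment; the real KSWAPD_ZONE_BALANCE_GAP_RATIO value may differ.

#include <stdio.h>

#define FAKE_BALANCE_GAP_RATIO  100     /* assumed stand-in for the 1% above */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long managed_pages = 2000000; /* hypothetical zone size */
        unsigned long low_wmark = 12000;       /* hypothetical low watermark */

        /* Round the 1% up, then cap the gap at the low watermark. */
        unsigned long gap = min_ul(low_wmark,
                        (managed_pages + FAKE_BALANCE_GAP_RATIO - 1) /
                        FAKE_BALANCE_GAP_RATIO);

        /* Prints 12000 here: the low watermark is below 1% of the zone. */
        printf("balance_gap = %lu pages\n", gap);
        return 0;
}
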
/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * Returns the final order kswapd was reclaiming at
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim. Mark the zone as
 * dead and from now on, only perform a short scan. Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction. It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
 * lower zones regardless of the number of free pages in the lower zones. This
 * interoperates with the page allocator fallback scheme to ensure that aging
 * of pages is balanced across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                                                        int *classzone_idx)
{
        int i;
        int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .priority = DEF_PRIORITY,
                .may_unmap = 1,
                .may_swap = 1,
                .may_writepage = !laptop_mode,
                .order = order,
                .target_mem_cgroup = NULL,
        };
        count_vm_event(PAGEOUTRUN);

        do {
                unsigned long lru_pages = 0;
                unsigned long nr_attempted = 0;
                bool raise_priority = true;
                bool pgdat_needs_compaction = (order > 0);

                sc.nr_reclaimed = 0;

                /*
                 * Scan in the highmem->dma direction for the highest
                 * zone which needs scanning
                 */
                for (i = pgdat->nr_zones - 1; i >= 0; i--) {
                        struct zone *zone = pgdat->node_zones + i;

                        if (!populated_zone(zone))
                                continue;

                        if (sc.priority != DEF_PRIORITY &&
                            !zone_reclaimable(zone))
                                continue;

                        /*
                         * Do some background aging of the anon list, to give
                         * pages a chance to be referenced before reclaiming.
                         */
                        age_active_anon(zone, &sc);

                        /*
                         * If the number of buffer_heads in the machine
                         * exceeds the maximum allowed level and this node
                         * has a highmem zone, force kswapd to reclaim from
                         * it to relieve lowmem pressure.
                         */
                        if (buffer_heads_over_limit && is_highmem_idx(i)) {
                                end_zone = i;
                                break;
                        }

                        if (!zone_balanced(zone, order, 0, 0)) {
                                end_zone = i;
                                break;
                        } else {
                                /*
                                 * If balanced, clear the dirty and congested
                                 * flags
                                 */
                                zone_clear_flag(zone, ZONE_CONGESTED);
                                zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
                        }
                }

                if (i < 0)
                        goto out;

                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;

                        if (!populated_zone(zone))
                                continue;

                        lru_pages += zone_reclaimable_pages(zone);

                        /*
                         * If any zone is currently balanced then kswapd will
                         * not call compaction as it is expected that the
                         * necessary pages are already available.
                         */
                        if (pgdat_needs_compaction &&
                                        zone_watermark_ok(zone, order,
                                                low_wmark_pages(zone),
                                                *classzone_idx, 0))
                                pgdat_needs_compaction = false;
                }

                /*
                 * If we're having trouble reclaiming, start doing writepage
                 * even in laptop mode.
                 */
                if (sc.priority < DEF_PRIORITY - 2)
                        sc.may_writepage = 1;

                /*
                 * Now scan the zone in the dma->highmem direction, stopping
                 * at the last zone which needs scanning.
                 *
                 * We do this because the page allocator works in the opposite
                 * direction. This prevents the page allocator from allocating
                 * pages behind kswapd's direction of progress, which would
                 * cause too much scanning of the lower zones.
                 */
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;

                        if (!populated_zone(zone))
                                continue;

                        if (sc.priority != DEF_PRIORITY &&
                            !zone_reclaimable(zone))
                                continue;

                        sc.nr_scanned = 0;

                        nr_soft_scanned = 0;
                        /*
                         * Call soft limit reclaim before calling shrink_zone.
                         */
                        nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
                                                        order, sc.gfp_mask,
                                                        &nr_soft_scanned);
                        sc.nr_reclaimed += nr_soft_reclaimed;

                        /*
                         * There should be no need to raise the scanning
                         * priority if enough pages are already being scanned
                         * that the high watermark would be met at 100%
                         * efficiency.
                         */
                        if (kswapd_shrink_zone(zone, end_zone, &sc,
                                        lru_pages, &nr_attempted))
                                raise_priority = false;
                }

                /*
                 * If the low watermark is met there is no need for processes
                 * to be throttled on pfmemalloc_wait as they should now be
                 * able to safely make forward progress. Wake them
                 */
                if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
                                pfmemalloc_watermark_ok(pgdat))
                        wake_up(&pgdat->pfmemalloc_wait);

                /*
                 * Fragmentation may mean that the system cannot be rebalanced
                 * for high-order allocations in all zones. If twice the
                 * allocation size has been reclaimed and the zones are still
                 * not balanced then recheck the watermarks at order-0 to
                 * prevent kswapd reclaiming excessively. Assume that a
                 * process that requested a high-order allocation can itself
                 * direct reclaim/compact.
                 */
                if (order && sc.nr_reclaimed >= 2UL << order)
                        order = sc.order = 0;

                /* Check if kswapd should be suspending */
                if (try_to_freeze() || kthread_should_stop())
                        break;

                /*
                 * Compact if necessary and kswapd is reclaiming at least the
                 * high watermark number of pages as requested
                 */
                if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
                        compact_pgdat(pgdat, order);

                /*
                 * Raise priority if scanning rate is too low or there was no
                 * progress in reclaiming pages
                 */
                if (raise_priority || !sc.nr_reclaimed)
                        sc.priority--;
        } while (sc.priority >= 1 &&
                 !pgdat_balanced(pgdat, order, *classzone_idx));

out:
        /*
         * Return the order we were reclaiming at so prepare_kswapd_sleep()
         * makes a decision on the order we were last reclaiming at. However,
         * if another caller entered the allocator slow path while kswapd
         * was awake, order will remain at the higher level
         */
        *classzone_idx = end_zone;
        return order;
}

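The order-0 fallback in the loop above triggers once at least twice the requested allocation size has been reclaimed, i.e. 2UL << order pages. A minimal standalone sketch of that threshold, with made-up reclaim counts:

#include <stdio.h>

int main(void)
{
        int order = 3;                          /* hypothetical 8-page request */
        unsigned long nr_reclaimed = 20;        /* pages reclaimed so far */

        /* Twice the allocation size: 2 * (1 << order) == 2UL << order. */
        unsigned long threshold = 2UL << order;

        if (order && nr_reclaimed >= threshold)
                order = 0;      /* fall back to order-0 watermark checks */

        printf("threshold = %lu pages, order now %d\n", threshold, order);
        return 0;
}
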
static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
        long remaining = 0;
        DEFINE_WAIT(wait);

        if (freezing(current) || kthread_should_stop())
                return;

        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

        /* Try to sleep for a short interval */
        if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
                remaining = schedule_timeout(HZ/10);
                finish_wait(&pgdat->kswapd_wait, &wait);
                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
        }

        /*
         * After a short sleep, check if it was a premature sleep. If not, then
         * go fully to sleep until explicitly woken up.
         */
        if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

                /*
                 * vmstat counters are not perfectly accurate and the estimated
                 * value for counters such as NR_FREE_PAGES can deviate from the
                 * true value by nr_online_cpus * threshold. To avoid the zone
                 * watermarks being breached while under pressure, we reduce the
                 * per-cpu vmstat threshold while kswapd is awake and restore
                 * them before going back to sleep.
                 */
                set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);

                /*
                 * Compaction records what page blocks it recently failed to
                 * isolate pages from and skips them in the future scanning.
                 * When kswapd is going to sleep, it is reasonable to assume
                 * that pages and compaction may succeed so reset the cache.
                 */
                reset_isolation_suitable(pgdat);

                if (!kthread_should_stop())
                        schedule();

                set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
        } else {
                if (remaining)
                        count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
                else
                        count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
        }
        finish_wait(&pgdat->kswapd_wait, &wait);
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
        unsigned long order, new_order;
        unsigned balanced_order;
        int classzone_idx, new_classzone_idx;
        int balanced_classzone_idx;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;

        struct reclaim_state reclaim_state = {
                .reclaimed_slab = 0,
        };
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

        lockdep_set_current_reclaim_state(GFP_KERNEL);

        if (!cpumask_empty(cpumask))
                set_cpus_allowed_ptr(tsk, cpumask);
        current->reclaim_state = &reclaim_state;

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
        set_freezable();

        order = new_order = 0;
        balanced_order = 0;
        classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
        balanced_classzone_idx = classzone_idx;
        for ( ; ; ) {
                bool ret;

                /*
                 * If the last balance_pgdat was unsuccessful it's unlikely a
                 * new request of a similar or harder type will succeed soon
                 * so consider going to sleep on the basis we reclaimed at
                 */
                if (balanced_classzone_idx >= new_classzone_idx &&
                                        balanced_order == new_order) {
                        new_order = pgdat->kswapd_max_order;
                        new_classzone_idx = pgdat->classzone_idx;
                        pgdat->kswapd_max_order = 0;
                        pgdat->classzone_idx = pgdat->nr_zones - 1;
                }

                if (order < new_order || classzone_idx > new_classzone_idx) {
                        /*
                         * Don't sleep if someone wants a larger 'order'
                         * allocation or has tighter zone constraints
                         */
                        order = new_order;
                        classzone_idx = new_classzone_idx;
                } else {
                        kswapd_try_to_sleep(pgdat, balanced_order,
                                                balanced_classzone_idx);
                        order = pgdat->kswapd_max_order;
                        classzone_idx = pgdat->classzone_idx;
                        new_order = order;
                        new_classzone_idx = classzone_idx;
                        pgdat->kswapd_max_order = 0;
                        pgdat->classzone_idx = pgdat->nr_zones - 1;
                }

                ret = try_to_freeze();
                if (kthread_should_stop())
                        break;

                /*
                 * We can speed up thawing tasks if we don't call balance_pgdat
                 * after returning from the refrigerator
                 */
                if (!ret) {
                        trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
                        balanced_classzone_idx = classzone_idx;
                        balanced_order = balance_pgdat(pgdat, order,
                                                &balanced_classzone_idx);
                }
        }

        tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
        current->reclaim_state = NULL;
        lockdep_clear_current_reclaim_state();

        return 0;
}

/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
        pg_data_t *pgdat;

        if (!populated_zone(zone))
                return;

        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                return;
        pgdat = zone->zone_pgdat;
        if (pgdat->kswapd_max_order < order) {
                pgdat->kswapd_max_order = order;
                pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
        }
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
        if (zone_balanced(zone, order, 0, 0))
                return;

        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
        wake_up_interruptible(&pgdat->kswapd_wait);
}

#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
        struct reclaim_state reclaim_state;
        struct scan_control sc = {
                .gfp_mask = GFP_HIGHUSER_MOVABLE,
                .may_swap = 1,
                .may_unmap = 1,
                .may_writepage = 1,
                .nr_to_reclaim = nr_to_reclaim,
                .hibernation_mode = 1,
                .order = 0,
                .priority = DEF_PRIORITY,
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,
        };
        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
        struct task_struct *p = current;
        unsigned long nr_reclaimed;

        p->flags |= PF_MEMALLOC;
        lockdep_set_current_reclaim_state(sc.gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;

        nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

        p->reclaim_state = NULL;
        lockdep_clear_current_reclaim_state();
        p->flags &= ~PF_MEMALLOC;

        return nr_reclaimed;
}
#endif /* CONFIG_HIBERNATION */

/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness.  So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
static int cpu_callback(struct notifier_block *nfb, unsigned long action,
                        void *hcpu)
{
        int nid;

        if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
                for_each_node_state(nid, N_MEMORY) {
                        pg_data_t *pgdat = NODE_DATA(nid);
                        const struct cpumask *mask;

                        mask = cpumask_of_node(pgdat->node_id);

                        if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
                                /* One of our CPUs online: restore mask */
                                set_cpus_allowed_ptr(pgdat->kswapd, mask);
                }
        }
        return NOTIFY_OK;
}

/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
 */
int kswapd_run(int nid)
{
        pg_data_t *pgdat = NODE_DATA(nid);
        int ret = 0;

        if (pgdat->kswapd)
                return 0;

        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                /* failure at boot is fatal */
                BUG_ON(system_state == SYSTEM_BOOTING);
                pr_err("Failed to start kswapd on node %d\n", nid);
                ret = PTR_ERR(pgdat->kswapd);
                pgdat->kswapd = NULL;
        }
        return ret;
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * hold lock_memory_hotplug().
 */
void kswapd_stop(int nid)
{
        struct task_struct *kswapd = NODE_DATA(nid)->kswapd;

        if (kswapd) {
                kthread_stop(kswapd);
                NODE_DATA(nid)->kswapd = NULL;
        }
}

static int __init kswapd_init(void)
{
        int nid;

        swap_setup();
        for_each_node_state(nid, N_MEMORY)
                kswapd_run(nid);
        hotcpu_notifier(cpu_callback, 0);
        return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero call zone_reclaim when the number of free pages falls below
 * the watermarks.
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)     /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1)    /* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2)     /* Swap pages out during reclaim */

/*
 * Priority for ZONE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.
 */
#define ZONE_RECLAIM_PRIORITY 4

/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;

static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
{
        unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
        unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
                zone_page_state(zone, NR_ACTIVE_FILE);

        /*
         * It's possible for there to be more file mapped pages than
         * accounted for by the pages on the file LRU lists because
         * tmpfs pages accounted for as ANON can also be FILE_MAPPED
         */
        return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}

3516 /* Work out how many page cache pages we can reclaim in this reclaim_mode */ 3530 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
3517 static long zone_pagecache_reclaimable(struct zone *zone) 3531 static long zone_pagecache_reclaimable(struct zone *zone)
3518 { 3532 {
3519 long nr_pagecache_reclaimable; 3533 long nr_pagecache_reclaimable;
3520 long delta = 0; 3534 long delta = 0;
3521 3535
3522 /* 3536 /*
3523 * If RECLAIM_SWAP is set, then all file pages are considered 3537 * If RECLAIM_SWAP is set, then all file pages are considered
3524 * potentially reclaimable. Otherwise, we have to worry about 3538 * potentially reclaimable. Otherwise, we have to worry about
3525 * pages like swapcache and zone_unmapped_file_pages() provides 3539 * pages like swapcache and zone_unmapped_file_pages() provides
3526 * a better estimate 3540 * a better estimate
3527 */ 3541 */
3528 if (zone_reclaim_mode & RECLAIM_SWAP) 3542 if (zone_reclaim_mode & RECLAIM_SWAP)
3529 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); 3543 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3530 else 3544 else
3531 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); 3545 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3532 3546
3533 /* If we can't clean pages, remove dirty pages from consideration */ 3547 /* If we can't clean pages, remove dirty pages from consideration */
3534 if (!(zone_reclaim_mode & RECLAIM_WRITE)) 3548 if (!(zone_reclaim_mode & RECLAIM_WRITE))
3535 delta += zone_page_state(zone, NR_FILE_DIRTY); 3549 delta += zone_page_state(zone, NR_FILE_DIRTY);
3536 3550
3537 /* Watch for any possible underflows due to delta */ 3551 /* Watch for any possible underflows due to delta */
3538 if (unlikely(delta > nr_pagecache_reclaimable)) 3552 if (unlikely(delta > nr_pagecache_reclaimable))
3539 delta = nr_pagecache_reclaimable; 3553 delta = nr_pagecache_reclaimable;
3540 3554
3541 return nr_pagecache_reclaimable - delta; 3555 return nr_pagecache_reclaimable - delta;
3542 } 3556 }
3543 3557
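/*
 * Worked example for the estimate above: with RECLAIM_SWAP and
 * RECLAIM_WRITE both clear, a zone whose file LRU exceeds its mapped
 * file pages by 1000 pages, 300 of them dirty, is treated as having
 * 1000 - 300 = 700 reclaimable page cache pages.  Setting RECLAIM_WRITE
 * would keep the full 1000 in the estimate, because dirty pages could
 * then be written out during reclaim.
 */
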
/*
 * Try to free up some pages from this zone through reclaim.
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
        /* Minimum pages needed in order to stay on node */
        const unsigned long nr_pages = 1 << order;
        struct task_struct *p = current;
        struct reclaim_state reclaim_state;
        struct scan_control sc = {
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
                .may_swap = 1,
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
                .order = order,
                .priority = ZONE_RECLAIM_PRIORITY,
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,
        };
        unsigned long nr_slab_pages0, nr_slab_pages1;

        cond_resched();
        /*
         * We need to be able to allocate from the reserves for RECLAIM_SWAP
         * and we also need to be able to write out pages for RECLAIM_WRITE
         * and RECLAIM_SWAP.
         */
        p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
        lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;

        if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
                /*
                 * Free memory by calling shrink zone with increasing
                 * priorities until we have enough memory freed.
                 */
                do {
                        shrink_zone(zone, &sc);
                } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
        }

        nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
        if (nr_slab_pages0 > zone->min_slab_pages) {
                /*
                 * shrink_slab() does not currently allow us to determine how
                 * many pages were freed in this zone. So we take the current
                 * number of slab pages and shake the slab until it is reduced
                 * by the same nr_pages that we used for reclaiming unmapped
                 * pages.
                 */
                nodes_clear(shrink.nodes_to_scan);
                node_set(zone_to_nid(zone), shrink.nodes_to_scan);
                for (;;) {
                        unsigned long lru_pages = zone_reclaimable_pages(zone);

                        /* No reclaimable slab or very low memory pressure */
                        if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
                                break;

                        /* Freed enough memory */
                        nr_slab_pages1 = zone_page_state(zone,
                                                        NR_SLAB_RECLAIMABLE);
                        if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
                                break;
                }

                /*
                 * Update nr_reclaimed by the number of slab pages we
                 * reclaimed from this zone.
                 */
                nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
                if (nr_slab_pages1 < nr_slab_pages0)
                        sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
        }

        p->reclaim_state = NULL;
        current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
        lockdep_clear_current_reclaim_state();
        return sc.nr_reclaimed >= nr_pages;
}

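/*
 * On the priority loop above: sc.priority starts at ZONE_RECLAIM_PRIORITY
 * (4) and is decremented towards 0, with each step roughly doubling the
 * fraction of the zone's LRU pages that shrink_zone() will scan
 * (approximately size >> priority).  The function returns non-zero only
 * if at least nr_pages (1 << order) pages were reclaimed.
 */
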
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
        int node_id;
        int ret;

        /*
         * Zone reclaim reclaims unmapped file backed pages and
         * slab pages if we are over the defined limits.
         *
         * A small portion of unmapped file backed pages is needed for
         * file I/O otherwise pages read by file I/O will be immediately
         * thrown out if the zone is overallocated. So we do not reclaim
         * if less than a specified percentage of the zone is used by
         * unmapped file backed pages.
         */
        if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
            zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
                return ZONE_RECLAIM_FULL;

        if (!zone_reclaimable(zone))
                return ZONE_RECLAIM_FULL;

        /*
         * Do not scan if the allocation should not be delayed.
         */
        if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
                return ZONE_RECLAIM_NOSCAN;

        /*
         * Only run zone reclaim on the local zone or on zones that do not
         * have associated processors. This will favor the local processor
         * over remote processors and spread off node memory allocations
         * as wide as possible.
         */
        node_id = zone_to_nid(zone);
        if (node_state(node_id, N_CPU) && node_id != numa_node_id())
                return ZONE_RECLAIM_NOSCAN;

        if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
                return ZONE_RECLAIM_NOSCAN;

        ret = __zone_reclaim(zone, gfp_mask, order);
        zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);

        if (!ret)
                count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

        return ret;
}
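
/*
 * The guard checks above mean zone_reclaim() bails out early with
 * ZONE_RECLAIM_FULL when there is nothing worth reclaiming, and with
 * ZONE_RECLAIM_NOSCAN when the caller cannot sleep, the zone belongs to
 * a remote node that has its own CPUs, or another task already holds
 * ZONE_RECLAIM_LOCKED for this zone.  Only one task at a time therefore
 * runs __zone_reclaim() against a given zone.
 */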
#endif

/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 *
 */
int page_evictable(struct page *page)
{
        return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
}

#ifdef CONFIG_SHMEM
/**
 * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
 * @pages:    array of pages to check
 * @nr_pages: number of pages to check
 *
 * Checks pages for evictability and moves them to the appropriate lru list.
 *
 * This function is only used for SysV IPC SHM_UNLOCK.
 */
void check_move_unevictable_pages(struct page **pages, int nr_pages)
{
        struct lruvec *lruvec;
        struct zone *zone = NULL;
        int pgscanned = 0;
        int pgrescued = 0;
        int i;

        for (i = 0; i < nr_pages; i++) {
                struct page *page = pages[i];
                struct zone *pagezone;

                pgscanned++;
                pagezone = page_zone(page);
                if (pagezone != zone) {
                        if (zone)
                                spin_unlock_irq(&zone->lru_lock);
                        zone = pagezone;
                        spin_lock_irq(&zone->lru_lock);
                }
                lruvec = mem_cgroup_page_lruvec(page, zone);

                if (!PageLRU(page) || !PageUnevictable(page))
                        continue;

                if (page_evictable(page)) {
                        enum lru_list lru = page_lru_base_type(page);

                        VM_BUG_ON(PageActive(page));
                        ClearPageUnevictable(page);
                        del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
                        add_page_to_lru_list(page, lruvec, lru);
                        pgrescued++;
                }
        }

        if (zone) {
                __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
                __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
                spin_unlock_irq(&zone->lru_lock);
        }
}
#endif /* CONFIG_SHMEM */
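
/*
 * Note on locking in check_move_unevictable_pages(): the loop only drops
 * and re-takes zone->lru_lock when consecutive pages belong to different
 * zones, so a run of pages from the same zone is processed under a single
 * lock acquisition, and the UNEVICTABLE_* vmstat counters are updated
 * once at the end.
 */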

static void warn_scan_unevictable_pages(void)
{
        printk_once(KERN_WARNING
                    "%s: The scan_unevictable_pages sysctl/node-interface has been "
                    "disabled for lack of a legitimate use case. If you have "
                    "one, please send an email to linux-mm@kvack.org.\n",
                    current->comm);
}

/*
 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
 * all nodes' unevictable lists for evictable pages
 */
unsigned long scan_unevictable_pages;

int scan_unevictable_handler(struct ctl_table *table, int write,
                             void __user *buffer,
                             size_t *length, loff_t *ppos)
{
        warn_scan_unevictable_pages();
        proc_doulongvec_minmax(table, write, buffer, length, ppos);
        scan_unevictable_pages = 0;
        return 0;
}

#ifdef CONFIG_NUMA
/*
 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
 * a specified node's per zone unevictable lists for evictable pages.
 */

static ssize_t read_scan_unevictable_node(struct device *dev,
                                          struct device_attribute *attr,
                                          char *buf)
{
        warn_scan_unevictable_pages();
        return sprintf(buf, "0\n");     /* always zero; should fit... */
}

static ssize_t write_scan_unevictable_node(struct device *dev,
                                           struct device_attribute *attr,
                                           const char *buf, size_t count)
{
        warn_scan_unevictable_pages();
        return 1;
}


static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
                   read_scan_unevictable_node,
                   write_scan_unevictable_node);

int scan_unevictable_register_node(struct node *node)
{