Commit 04bab05a95fece32015d897d4058880bbb5c65eb

Authored by Jerome Marchand
Committed by Jiri Slaby
1 parent 788a2f69f9

memcg, vmscan: Fix forced scan of anonymous pages

commit 2ab051e11bfa3cbb7b24177f3d6aaed10a0d743e upstream.

When memory cgroups are enabled, the code in get_scan_count() that
decides to force a scan of anonymous pages compares global values
(free, high_watermark) to a value that is restricted to a memory
cgroup (file). This makes the code over-eager to force an anon scan.

For instance, it will force an anon scan when scanning a memcg that is
mainly populated by anonymous pages, even when there are plenty of file
pages to get rid of in other memcgs, and even when swappiness == 0.
This breaks the user's expectation about swappiness and hurts performance.

This patch makes sure that a forced anon scan only happens when there
are not enough file pages for the whole zone, not just in one random
memcg.

[hannes@cmpxchg.org: cleanups]
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 1 changed file with 15 additions and 8 deletions (inline diff)
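The changed hunk sits in get_scan_count(), further down in the file than the
excerpt shown below. As a rough sketch of what the commit description amounts
to (illustrative only, not the verbatim hunk; the local variable names
zonefile and zonefree are placeholders), the zone-wide file pages plus free
pages are compared against the zone's high watermark before forcing an anon
scan:

        /* Sketch of the described change, under global reclaim only */
        if (global_reclaim(sc)) {
                unsigned long zonefile;         /* placeholder name */
                unsigned long zonefree;         /* placeholder name */

                /* use zone-wide counts, not the memcg-local "file" total */
                zonefree = zone_page_state(zone, NR_FREE_PAGES);
                zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
                           zone_page_state(zone, NR_INACTIVE_FILE);

                if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
                        scan_balance = SCAN_ANON;
                        goto out;
                }
        }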

/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>  /* for try_to_release_page(),
                                        buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
        /* Incremented by the number of inactive pages that were scanned */
        unsigned long nr_scanned;

        /* Number of pages freed so far during a call to shrink_zones() */
        unsigned long nr_reclaimed;

        /* How many pages shrink_list() should reclaim */
        unsigned long nr_to_reclaim;

        unsigned long hibernation_mode;

        /* This context's GFP mask */
        gfp_t gfp_mask;

        int may_writepage;

        /* Can mapped pages be reclaimed? */
        int may_unmap;

        /* Can pages be swapped as part of reclaim? */
        int may_swap;

        int order;

        /* Scan (total_size >> priority) pages at once */
        int priority;

        /*
         * The memory cgroup that hit its limit and as a result is the
         * primary target of this reclaim invocation.
         */
        struct mem_cgroup *target_mem_cgroup;

        /*
         * Nodemask of nodes allowed by the caller. If NULL, all nodes
         * are scanned.
         */
        nodemask_t *nodemask;
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
        do { \
                if ((_page)->lru.prev != _base) { \
                        struct page *prev; \
                        \
                        prev = lru_to_page(&(_page->lru)); \
                        prefetch(&prev->_field); \
                } \
        } while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field) \
        do { \
                if ((_page)->lru.prev != _base) { \
                        struct page *prev; \
                        \
                        prev = lru_to_page(&(_page->lru)); \
                        prefetchw(&prev->_field); \
                } \
        } while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
unsigned long vm_total_pages;   /* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
        return !sc->target_mem_cgroup;
}
#else
static bool global_reclaim(struct scan_control *sc)
{
        return true;
}
#endif

static unsigned long zone_reclaimable_pages(struct zone *zone)
{
        int nr;

        nr = zone_page_state(zone, NR_ACTIVE_FILE) +
             zone_page_state(zone, NR_INACTIVE_FILE);

        if (get_nr_swap_pages() > 0)
                nr += zone_page_state(zone, NR_ACTIVE_ANON) +
                      zone_page_state(zone, NR_INACTIVE_ANON);

        return nr;
}

bool zone_reclaimable(struct zone *zone)
{
        return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}

static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
        if (!mem_cgroup_disabled())
                return mem_cgroup_get_lru_size(lruvec, lru);

        return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
}

/*
 * Add a shrinker callback to be called from the vm.
 */
int register_shrinker(struct shrinker *shrinker)
{
        size_t size = sizeof(*shrinker->nr_deferred);

        /*
         * If we only have one possible node in the system anyway, save
         * ourselves the trouble and disable NUMA aware behavior. This way we
         * will save memory and some small loop time later.
         */
        if (nr_node_ids == 1)
                shrinker->flags &= ~SHRINKER_NUMA_AWARE;

        if (shrinker->flags & SHRINKER_NUMA_AWARE)
                size *= nr_node_ids;

        shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
        if (!shrinker->nr_deferred)
                return -ENOMEM;

        down_write(&shrinker_rwsem);
        list_add_tail(&shrinker->list, &shrinker_list);
        up_write(&shrinker_rwsem);
        return 0;
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
        down_write(&shrinker_rwsem);
        list_del(&shrinker->list);
        up_write(&shrinker_rwsem);
        kfree(shrinker->nr_deferred);
}
EXPORT_SYMBOL(unregister_shrinker);

#define SHRINK_BATCH 128

static unsigned long
shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
                 unsigned long nr_pages_scanned, unsigned long lru_pages)
{
        unsigned long freed = 0;
        unsigned long long delta;
        long total_scan;
        long freeable;
        long nr;
        long new_nr;
        int nid = shrinkctl->nid;
        long batch_size = shrinker->batch ? shrinker->batch
                                          : SHRINK_BATCH;

        freeable = shrinker->count_objects(shrinker, shrinkctl);
        if (freeable == 0)
                return 0;

        /*
         * copy the current shrinker scan count into a local variable
         * and zero it so that other concurrent shrinker invocations
         * don't also do this scanning work.
         */
        nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);

        total_scan = nr;
        delta = (4 * nr_pages_scanned) / shrinker->seeks;
        delta *= freeable;
        do_div(delta, lru_pages + 1);
        total_scan += delta;
        if (total_scan < 0) {
                printk(KERN_ERR
                "shrink_slab: %pF negative objects to delete nr=%ld\n",
                       shrinker->scan_objects, total_scan);
                total_scan = freeable;
        }

        /*
         * We need to avoid excessive windup on filesystem shrinkers
         * due to large numbers of GFP_NOFS allocations causing the
         * shrinkers to return -1 all the time. This results in a large
         * nr being built up so when a shrink that can do some work
         * comes along it empties the entire cache due to nr >>>
         * freeable. This is bad for sustaining a working set in
         * memory.
         *
         * Hence only allow the shrinker to scan the entire cache when
         * a large delta change is calculated directly.
         */
        if (delta < freeable / 4)
                total_scan = min(total_scan, freeable / 2);

        /*
         * Avoid risking looping forever due to too large nr value:
         * never try to free more than twice the estimate number of
         * freeable entries.
         */
        if (total_scan > freeable * 2)
                total_scan = freeable * 2;

        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
                                nr_pages_scanned, lru_pages,
                                freeable, delta, total_scan);

        /*
         * Normally, we should not scan less than batch_size objects in one
         * pass to avoid too frequent shrinker calls, but if the slab has less
         * than batch_size objects in total and we are really tight on memory,
         * we will try to reclaim all available objects, otherwise we can end
         * up failing allocations although there are plenty of reclaimable
         * objects spread over several slabs with usage less than the
         * batch_size.
         *
         * We detect the "tight on memory" situations by looking at the total
         * number of objects we want to scan (total_scan). If it is greater
         * than the total number of objects on slab (freeable), we must be
         * scanning at high prio and therefore should try to reclaim as much as
         * possible.
         */
        while (total_scan >= batch_size ||
               total_scan >= freeable) {
                unsigned long ret;
                unsigned long nr_to_scan = min(batch_size, total_scan);

                shrinkctl->nr_to_scan = nr_to_scan;
                ret = shrinker->scan_objects(shrinker, shrinkctl);
                if (ret == SHRINK_STOP)
                        break;
                freed += ret;

                count_vm_events(SLABS_SCANNED, nr_to_scan);
                total_scan -= nr_to_scan;

                cond_resched();
        }

        /*
         * move the unused scan count back into the shrinker in a
         * manner that handles concurrent updates. If we exhausted the
         * scan, there is no need to do an update.
         */
        if (total_scan > 0)
                new_nr = atomic_long_add_return(total_scan,
                                                &shrinker->nr_deferred[nid]);
        else
                new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);

        trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
        return freed;
}

/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increase the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(struct shrink_control *shrinkctl,
                          unsigned long nr_pages_scanned,
                          unsigned long lru_pages)
{
        struct shrinker *shrinker;
        unsigned long freed = 0;

        if (nr_pages_scanned == 0)
                nr_pages_scanned = SWAP_CLUSTER_MAX;

        if (!down_read_trylock(&shrinker_rwsem)) {
                /*
                 * If we would return 0, our callers would understand that we
                 * have nothing else to shrink and give up trying. By returning
                 * 1 we keep it going and assume we'll be able to shrink next
                 * time.
                 */
                freed = 1;
                goto out;
        }

        list_for_each_entry(shrinker, &shrinker_list, list) {
                if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
                        shrinkctl->nid = 0;
                        freed += shrink_slab_node(shrinkctl, shrinker,
                                        nr_pages_scanned, lru_pages);
                        continue;
                }

                for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
                        if (node_online(shrinkctl->nid))
                                freed += shrink_slab_node(shrinkctl, shrinker,
                                                nr_pages_scanned, lru_pages);

                }
        }
        up_read(&shrinker_rwsem);
out:
        cond_resched();
        return freed;
}

static inline int is_page_cache_freeable(struct page *page)
{
        /*
         * A freeable page cache page is referenced only by the caller
         * that isolated the page, the page cache radix tree and
         * optional buffer heads at page->private.
         */
        return page_count(page) - page_has_private(page) == 2;
}

static int may_write_to_queue(struct backing_dev_info *bdi,
                              struct scan_control *sc)
{
        if (current->flags & PF_SWAPWRITE)
                return 1;
        if (!bdi_write_congested(bdi))
                return 1;
        if (bdi == current->backing_dev_info)
                return 1;
        return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
                                struct page *page, int error)
{
        lock_page(page);
        if (page_mapping(page) == mapping)
                mapping_set_error(mapping, error);
        unlock_page(page);
}

/* possible outcome of pageout() */
typedef enum {
        /* failed to write page out, page is locked */
        PAGE_KEEP,
        /* move page to the active list, page is locked */
        PAGE_ACTIVATE,
        /* page has been sent to the disk successfully, page is unlocked */
        PAGE_SUCCESS,
        /* page is clean and locked */
        PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
                         struct scan_control *sc)
{
        /*
         * If the page is dirty, only perform writeback if that write
         * will be non-blocking.  To prevent this allocation from being
         * stalled by pagecache activity.  But note that there may be
         * stalls if we need to run get_block().  We could test
         * PagePrivate for that.
         *
         * If this process is currently in __generic_file_aio_write() against
         * this page's queue, we can perform writeback even if that
         * will block.
         *
         * If the page is swapcache, write it back even if that would
         * block, for some throttling. This happens by accident, because
         * swap_backing_dev_info is bust: it doesn't reflect the
         * congestion state of the swapdevs.  Easy to fix, if needed.
         */
        if (!is_page_cache_freeable(page))
                return PAGE_KEEP;
        if (!mapping) {
                /*
                 * Some data journaling orphaned pages can have
                 * page->mapping == NULL while being dirty with clean buffers.
                 */
                if (page_has_private(page)) {
                        if (try_to_free_buffers(page)) {
                                ClearPageDirty(page);
                                printk("%s: orphaned page\n", __func__);
                                return PAGE_CLEAN;
                        }
                }
                return PAGE_KEEP;
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
        if (!may_write_to_queue(mapping->backing_dev_info, sc))
                return PAGE_KEEP;

        if (clear_page_dirty_for_io(page)) {
                int res;
                struct writeback_control wbc = {
                        .sync_mode = WB_SYNC_NONE,
                        .nr_to_write = SWAP_CLUSTER_MAX,
                        .range_start = 0,
                        .range_end = LLONG_MAX,
                        .for_reclaim = 1,
                };

                SetPageReclaim(page);
                res = mapping->a_ops->writepage(page, &wbc);
                if (res < 0)
                        handle_write_error(mapping, page, res);
                if (res == AOP_WRITEPAGE_ACTIVATE) {
                        ClearPageReclaim(page);
                        return PAGE_ACTIVATE;
                }

                if (!PageWriteback(page)) {
                        /* synchronous write or broken a_ops? */
                        ClearPageReclaim(page);
                }
                trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
                inc_zone_page_state(page, NR_VMSCAN_WRITE);
                return PAGE_SUCCESS;
        }

        return PAGE_CLEAN;
}

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));

        spin_lock_irq(&mapping->tree_lock);
        /*
         * The non racy check for a busy page.
         *
         * Must be careful with the order of the tests. When someone has
         * a ref to the page, it may be possible that they dirty it then
         * drop the reference. So if PageDirty is tested before page_count
         * here, then the following race may occur:
         *
         * get_user_pages(&page);
         * [user mapping goes away]
         * write_to(page);
         *                              !PageDirty(page)    [good]
         * SetPageDirty(page);
         * put_page(page);
         *                              !page_count(page)   [good, discard it]
         *
         * [oops, our write_to data is lost]
         *
         * Reversing the order of the tests ensures such a situation cannot
         * escape unnoticed. The smp_rmb is needed to ensure the page->flags
         * load is not satisfied before that of page->_count.
         *
         * Note that if SetPageDirty is always performed via set_page_dirty,
         * and thus under tree_lock, then this ordering is not required.
         */
        if (!page_freeze_refs(page, 2))
                goto cannot_free;
        /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
        if (unlikely(PageDirty(page))) {
                page_unfreeze_refs(page, 2);
                goto cannot_free;
        }

        if (PageSwapCache(page)) {
                swp_entry_t swap = { .val = page_private(page) };
                __delete_from_swap_cache(page);
                spin_unlock_irq(&mapping->tree_lock);
                swapcache_free(swap, page);
        } else {
                void (*freepage)(struct page *);

                freepage = mapping->a_ops->freepage;

                __delete_from_page_cache(page);
                spin_unlock_irq(&mapping->tree_lock);
                mem_cgroup_uncharge_cache_page(page);

                if (freepage != NULL)
                        freepage(page);
        }

        return 1;

cannot_free:
        spin_unlock_irq(&mapping->tree_lock);
        return 0;
}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
        if (__remove_mapping(mapping, page)) {
                /*
                 * Unfreezing the refcount with 1 rather than 2 effectively
                 * drops the pagecache ref for us without requiring another
                 * atomic operation.
                 */
                page_unfreeze_refs(page, 1);
                return 1;
        }
        return 0;
}

/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
void putback_lru_page(struct page *page)
{
        bool is_unevictable;
        int was_unevictable = PageUnevictable(page);

        VM_BUG_ON(PageLRU(page));

redo:
        ClearPageUnevictable(page);

        if (page_evictable(page)) {
                /*
                 * For evictable pages, we can use the cache.
                 * In event of a race, worst case is we end up with an
                 * unevictable page on [in]active list.
                 * We know how to handle that.
                 */
                is_unevictable = false;
                lru_cache_add(page);
        } else {
                /*
                 * Put unevictable pages directly on zone's unevictable
                 * list.
                 */
                is_unevictable = true;
                add_page_to_unevictable_list(page);
                /*
                 * When racing with an mlock or AS_UNEVICTABLE clearing
                 * (page is unlocked) make sure that if the other thread
                 * does not observe our setting of PG_lru and fails
                 * isolation/check_move_unevictable_pages,
                 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
                 * the page back to the evictable list.
                 *
                 * The other side is TestClearPageMlocked() or shmem_lock().
                 */
                smp_mb();
        }

        /*
         * page's status can change while we move it among lru. If an evictable
         * page is on unevictable list, it never be freed. To avoid that,
         * check after we added it to the list, again.
         */
        if (is_unevictable && page_evictable(page)) {
                if (!isolate_lru_page(page)) {
                        put_page(page);
                        goto redo;
                }
                /* This means someone else dropped this page from LRU
                 * So, it will be freed or putback to LRU again. There is
                 * nothing to do here.
                 */
        }

        if (was_unevictable && !is_unevictable)
                count_vm_event(UNEVICTABLE_PGRESCUED);
        else if (!was_unevictable && is_unevictable)
                count_vm_event(UNEVICTABLE_PGCULLED);

        put_page(page);         /* drop ref from isolate */
}

enum page_references {
        PAGEREF_RECLAIM,
        PAGEREF_RECLAIM_CLEAN,
        PAGEREF_KEEP,
        PAGEREF_ACTIVATE,
};

static enum page_references page_check_references(struct page *page,
                                                  struct scan_control *sc)
{
        int referenced_ptes, referenced_page;
        unsigned long vm_flags;

        referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
                                          &vm_flags);
        referenced_page = TestClearPageReferenced(page);

        /*
         * Mlock lost the isolation race with us.  Let try_to_unmap()
         * move the page to the unevictable list.
         */
        if (vm_flags & VM_LOCKED)
                return PAGEREF_RECLAIM;

        if (referenced_ptes) {
                if (PageSwapBacked(page))
                        return PAGEREF_ACTIVATE;
                /*
                 * All mapped pages start out with page table
                 * references from the instantiating fault, so we need
                 * to look twice if a mapped file page is used more
                 * than once.
                 *
                 * Mark it and spare it for another trip around the
                 * inactive list.  Another page table reference will
                 * lead to its activation.
                 *
                 * Note: the mark is set for activated pages as well
                 * so that recently deactivated but used pages are
                 * quickly recovered.
                 */
                SetPageReferenced(page);

                if (referenced_page || referenced_ptes > 1)
                        return PAGEREF_ACTIVATE;

                /*
                 * Activate file-backed executable pages after first usage.
                 */
                if (vm_flags & VM_EXEC)
                        return PAGEREF_ACTIVATE;

                return PAGEREF_KEEP;
        }

        /* Reclaim if clean, defer dirty pages to writeback */
        if (referenced_page && !PageSwapBacked(page))
                return PAGEREF_RECLAIM_CLEAN;

        return PAGEREF_RECLAIM;
}

/* Check if a page is dirty or under writeback */
static void page_check_dirty_writeback(struct page *page,
                                       bool *dirty, bool *writeback)
{
        struct address_space *mapping;

        /*
         * Anonymous pages are not handled by flushers and must be written
         * from reclaim context. Do not stall reclaim based on them
         */
        if (!page_is_file_cache(page)) {
                *dirty = false;
                *writeback = false;
                return;
        }

        /* By default assume that the page flags are accurate */
        *dirty = PageDirty(page);
        *writeback = PageWriteback(page);

        /* Verify dirty/writeback state if the filesystem supports it */
        if (!page_has_private(page))
                return;

        mapping = page_mapping(page);
        if (mapping && mapping->a_ops->is_dirty_writeback)
                mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}

773 /* 773 /*
774 * shrink_page_list() returns the number of reclaimed pages 774 * shrink_page_list() returns the number of reclaimed pages
775 */ 775 */
776 static unsigned long shrink_page_list(struct list_head *page_list, 776 static unsigned long shrink_page_list(struct list_head *page_list,
777 struct zone *zone, 777 struct zone *zone,
778 struct scan_control *sc, 778 struct scan_control *sc,
779 enum ttu_flags ttu_flags, 779 enum ttu_flags ttu_flags,
780 unsigned long *ret_nr_dirty, 780 unsigned long *ret_nr_dirty,
781 unsigned long *ret_nr_unqueued_dirty, 781 unsigned long *ret_nr_unqueued_dirty,
782 unsigned long *ret_nr_congested, 782 unsigned long *ret_nr_congested,
783 unsigned long *ret_nr_writeback, 783 unsigned long *ret_nr_writeback,
784 unsigned long *ret_nr_immediate, 784 unsigned long *ret_nr_immediate,
785 bool force_reclaim) 785 bool force_reclaim)
786 { 786 {
787 LIST_HEAD(ret_pages); 787 LIST_HEAD(ret_pages);
788 LIST_HEAD(free_pages); 788 LIST_HEAD(free_pages);
789 int pgactivate = 0; 789 int pgactivate = 0;
790 unsigned long nr_unqueued_dirty = 0; 790 unsigned long nr_unqueued_dirty = 0;
791 unsigned long nr_dirty = 0; 791 unsigned long nr_dirty = 0;
792 unsigned long nr_congested = 0; 792 unsigned long nr_congested = 0;
793 unsigned long nr_reclaimed = 0; 793 unsigned long nr_reclaimed = 0;
794 unsigned long nr_writeback = 0; 794 unsigned long nr_writeback = 0;
795 unsigned long nr_immediate = 0; 795 unsigned long nr_immediate = 0;
796 796
797 cond_resched(); 797 cond_resched();
798 798
799 mem_cgroup_uncharge_start(); 799 mem_cgroup_uncharge_start();
800 while (!list_empty(page_list)) { 800 while (!list_empty(page_list)) {
801 struct address_space *mapping; 801 struct address_space *mapping;
802 struct page *page; 802 struct page *page;
803 int may_enter_fs; 803 int may_enter_fs;
804 enum page_references references = PAGEREF_RECLAIM_CLEAN; 804 enum page_references references = PAGEREF_RECLAIM_CLEAN;
805 bool dirty, writeback; 805 bool dirty, writeback;
806 806
807 cond_resched(); 807 cond_resched();
808 808
809 page = lru_to_page(page_list); 809 page = lru_to_page(page_list);
810 list_del(&page->lru); 810 list_del(&page->lru);
811 811
812 if (!trylock_page(page)) 812 if (!trylock_page(page))
813 goto keep; 813 goto keep;
814 814
815 VM_BUG_ON(PageActive(page)); 815 VM_BUG_ON(PageActive(page));
816 VM_BUG_ON(page_zone(page) != zone); 816 VM_BUG_ON(page_zone(page) != zone);
817 817
818 sc->nr_scanned++; 818 sc->nr_scanned++;
819 819
820 if (unlikely(!page_evictable(page))) 820 if (unlikely(!page_evictable(page)))
821 goto cull_mlocked; 821 goto cull_mlocked;
822 822
823 if (!sc->may_unmap && page_mapped(page)) 823 if (!sc->may_unmap && page_mapped(page))
824 goto keep_locked; 824 goto keep_locked;
825 825
826 /* Double the slab pressure for mapped and swapcache pages */ 826 /* Double the slab pressure for mapped and swapcache pages */
827 if (page_mapped(page) || PageSwapCache(page)) 827 if (page_mapped(page) || PageSwapCache(page))
828 sc->nr_scanned++; 828 sc->nr_scanned++;
829 829
830 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 830 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
831 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 831 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
832 832
833 /* 833 /*
834 * The number of dirty pages determines if a zone is marked 834 * The number of dirty pages determines if a zone is marked
835 * reclaim_congested which affects wait_iff_congested. kswapd 835 * reclaim_congested which affects wait_iff_congested. kswapd
836 * will stall and start writing pages if the tail of the LRU 836 * will stall and start writing pages if the tail of the LRU
837 * is all dirty unqueued pages. 837 * is all dirty unqueued pages.
838 */ 838 */
839 page_check_dirty_writeback(page, &dirty, &writeback); 839 page_check_dirty_writeback(page, &dirty, &writeback);
840 if (dirty || writeback) 840 if (dirty || writeback)
841 nr_dirty++; 841 nr_dirty++;
842 842
843 if (dirty && !writeback) 843 if (dirty && !writeback)
844 nr_unqueued_dirty++; 844 nr_unqueued_dirty++;
845 845
846 /* 846 /*
847 * Treat this page as congested if the underlying BDI is or if 847 * Treat this page as congested if the underlying BDI is or if
848 * pages are cycling through the LRU so quickly that the 848 * pages are cycling through the LRU so quickly that the
849 * pages marked for immediate reclaim are making it to the 849 * pages marked for immediate reclaim are making it to the
850 * end of the LRU a second time. 850 * end of the LRU a second time.
851 */ 851 */
852 mapping = page_mapping(page); 852 mapping = page_mapping(page);
853 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || 853 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
854 (writeback && PageReclaim(page))) 854 (writeback && PageReclaim(page)))
855 nr_congested++; 855 nr_congested++;
856 856
857 /* 857 /*
858 * If a page at the tail of the LRU is under writeback, there 858 * If a page at the tail of the LRU is under writeback, there
859 * are three cases to consider. 859 * are three cases to consider.
860 * 860 *
861 * 1) If reclaim is encountering an excessive number of pages 861 * 1) If reclaim is encountering an excessive number of pages
862 * under writeback and this page is both under writeback and 862 * under writeback and this page is both under writeback and
863 * PageReclaim then it indicates that pages are being queued 863 * PageReclaim then it indicates that pages are being queued
864 * for IO but are being recycled through the LRU before the 864 * for IO but are being recycled through the LRU before the
865 * IO can complete. Waiting on the page itself risks an 865 * IO can complete. Waiting on the page itself risks an
866 * indefinite stall if it is impossible to writeback the 866 * indefinite stall if it is impossible to writeback the
867 * page due to IO error or disconnected storage so instead 867 * page due to IO error or disconnected storage so instead
868 * note that the LRU is being scanned too quickly and the 868 * note that the LRU is being scanned too quickly and the
869 * caller can stall after page list has been processed. 869 * caller can stall after page list has been processed.
870 * 870 *
871 * 2) Global reclaim encounters a page, memcg encounters a 871 * 2) Global reclaim encounters a page, memcg encounters a
872 * page that is not marked for immediate reclaim or 872 * page that is not marked for immediate reclaim or
873 * the caller does not have __GFP_IO. In this case mark 873 * the caller does not have __GFP_IO. In this case mark
874 * the page for immediate reclaim and continue scanning. 874 * the page for immediate reclaim and continue scanning.
875 * 875 *
876 * __GFP_IO is checked because a loop driver thread might 876 * __GFP_IO is checked because a loop driver thread might
877 * enter reclaim, and deadlock if it waits on a page for 877 * enter reclaim, and deadlock if it waits on a page for
878 * which it is needed to do the write (loop masks off 878 * which it is needed to do the write (loop masks off
879 * __GFP_IO|__GFP_FS for this reason); but more thought 879 * __GFP_IO|__GFP_FS for this reason); but more thought
880 * would probably show more reasons. 880 * would probably show more reasons.
881 * 881 *
882 * Don't require __GFP_FS, since we're not going into the 882 * Don't require __GFP_FS, since we're not going into the
883 * FS, just waiting on its writeback completion. Worryingly, 883 * FS, just waiting on its writeback completion. Worryingly,
884 * ext4 gfs2 and xfs allocate pages with 884 * ext4 gfs2 and xfs allocate pages with
885 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing 885 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
886 * may_enter_fs here is liable to OOM on them. 886 * may_enter_fs here is liable to OOM on them.
887 * 887 *
888 * 3) memcg encounters a page that is not already marked 888 * 3) memcg encounters a page that is not already marked
889 * PageReclaim. memcg does not have any dirty pages 889 * PageReclaim. memcg does not have any dirty pages
890 * throttling so we could easily OOM just because too many 890 * throttling so we could easily OOM just because too many
891 * pages are in writeback and there is nothing else to 891 * pages are in writeback and there is nothing else to
892 * reclaim. Wait for the writeback to complete. 892 * reclaim. Wait for the writeback to complete.
893 */ 893 */
894 if (PageWriteback(page)) { 894 if (PageWriteback(page)) {
895 /* Case 1 above */ 895 /* Case 1 above */
896 if (current_is_kswapd() && 896 if (current_is_kswapd() &&
897 PageReclaim(page) && 897 PageReclaim(page) &&
898 zone_is_reclaim_writeback(zone)) { 898 zone_is_reclaim_writeback(zone)) {
899 nr_immediate++; 899 nr_immediate++;
900 goto keep_locked; 900 goto keep_locked;
901 901
902 /* Case 2 above */ 902 /* Case 2 above */
903 } else if (global_reclaim(sc) || 903 } else if (global_reclaim(sc) ||
904 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { 904 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
905 /* 905 /*
906 * This is slightly racy - end_page_writeback() 906 * This is slightly racy - end_page_writeback()
907 * might have just cleared PageReclaim, then 907 * might have just cleared PageReclaim, then
908 * setting PageReclaim here end up interpreted 908 * setting PageReclaim here end up interpreted
909 * as PageReadahead - but that does not matter 909 * as PageReadahead - but that does not matter
910 * enough to care. What we do want is for this 910 * enough to care. What we do want is for this
911 * page to have PageReclaim set next time memcg 911 * page to have PageReclaim set next time memcg
912 * reclaim reaches the tests above, so it will 912 * reclaim reaches the tests above, so it will
913 * then wait_on_page_writeback() to avoid OOM; 913 * then wait_on_page_writeback() to avoid OOM;
914 * and it's also appropriate in global reclaim. 914 * and it's also appropriate in global reclaim.
915 */ 915 */
916 SetPageReclaim(page); 916 SetPageReclaim(page);
917 nr_writeback++; 917 nr_writeback++;
918 918
919 goto keep_locked; 919 goto keep_locked;
920 920
921 /* Case 3 above */ 921 /* Case 3 above */
922 } else { 922 } else {
923 wait_on_page_writeback(page); 923 wait_on_page_writeback(page);
924 } 924 }
925 } 925 }
926 926
927 if (!force_reclaim) 927 if (!force_reclaim)
928 references = page_check_references(page, sc); 928 references = page_check_references(page, sc);
929 929
930 switch (references) { 930 switch (references) {
931 case PAGEREF_ACTIVATE: 931 case PAGEREF_ACTIVATE:
932 goto activate_locked; 932 goto activate_locked;
933 case PAGEREF_KEEP: 933 case PAGEREF_KEEP:
934 goto keep_locked; 934 goto keep_locked;
935 case PAGEREF_RECLAIM: 935 case PAGEREF_RECLAIM:
936 case PAGEREF_RECLAIM_CLEAN: 936 case PAGEREF_RECLAIM_CLEAN:
937 ; /* try to reclaim the page below */ 937 ; /* try to reclaim the page below */
938 } 938 }
939 939
940 /* 940 /*
941 * Anonymous process memory has backing store? 941 * Anonymous process memory has backing store?
942 * Try to allocate it some swap space here. 942 * Try to allocate it some swap space here.
943 */ 943 */
944 if (PageAnon(page) && !PageSwapCache(page)) { 944 if (PageAnon(page) && !PageSwapCache(page)) {
945 if (!(sc->gfp_mask & __GFP_IO)) 945 if (!(sc->gfp_mask & __GFP_IO))
946 goto keep_locked; 946 goto keep_locked;
947 if (!add_to_swap(page, page_list)) 947 if (!add_to_swap(page, page_list))
948 goto activate_locked; 948 goto activate_locked;
949 may_enter_fs = 1; 949 may_enter_fs = 1;
950 950
951 /* Adding to swap updated mapping */ 951 /* Adding to swap updated mapping */
952 mapping = page_mapping(page); 952 mapping = page_mapping(page);
953 } 953 }
954 954
955 /* 955 /*
956 * The page is mapped into the page tables of one or more 956 * The page is mapped into the page tables of one or more
957 * processes. Try to unmap it here. 957 * processes. Try to unmap it here.
958 */ 958 */
959 if (page_mapped(page) && mapping) { 959 if (page_mapped(page) && mapping) {
960 switch (try_to_unmap(page, ttu_flags)) { 960 switch (try_to_unmap(page, ttu_flags)) {
961 case SWAP_FAIL: 961 case SWAP_FAIL:
962 goto activate_locked; 962 goto activate_locked;
963 case SWAP_AGAIN: 963 case SWAP_AGAIN:
964 goto keep_locked; 964 goto keep_locked;
965 case SWAP_MLOCK: 965 case SWAP_MLOCK:
966 goto cull_mlocked; 966 goto cull_mlocked;
967 case SWAP_SUCCESS: 967 case SWAP_SUCCESS:
968 ; /* try to free the page below */ 968 ; /* try to free the page below */
969 } 969 }
970 } 970 }
971 971
972 if (PageDirty(page)) { 972 if (PageDirty(page)) {
973 /* 973 /*
974 * Only kswapd can writeback filesystem pages to 974 * Only kswapd can writeback filesystem pages to
975 * avoid risk of stack overflow but only writeback 975 * avoid risk of stack overflow but only writeback
976 * if many dirty pages have been encountered. 976 * if many dirty pages have been encountered.
977 */ 977 */
978 if (page_is_file_cache(page) && 978 if (page_is_file_cache(page) &&
979 (!current_is_kswapd() || 979 (!current_is_kswapd() ||
980 !zone_is_reclaim_dirty(zone))) { 980 !zone_is_reclaim_dirty(zone))) {
981 /* 981 /*
982 * Immediately reclaim when written back. 982 * Immediately reclaim when written back.
983 * Similar in principle to deactivate_page() 983 * Similar in principle to deactivate_page()
984 * except we already have the page isolated 984 * except we already have the page isolated
985 * and know it's dirty 985 * and know it's dirty
986 */ 986 */
987 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); 987 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
988 SetPageReclaim(page); 988 SetPageReclaim(page);
989 989
990 goto keep_locked; 990 goto keep_locked;
991 } 991 }
992 992
993 if (references == PAGEREF_RECLAIM_CLEAN) 993 if (references == PAGEREF_RECLAIM_CLEAN)
994 goto keep_locked; 994 goto keep_locked;
995 if (!may_enter_fs) 995 if (!may_enter_fs)
996 goto keep_locked; 996 goto keep_locked;
997 if (!sc->may_writepage) 997 if (!sc->may_writepage)
998 goto keep_locked; 998 goto keep_locked;
999 999
1000 /* Page is dirty, try to write it out here */ 1000 /* Page is dirty, try to write it out here */
1001 switch (pageout(page, mapping, sc)) { 1001 switch (pageout(page, mapping, sc)) {
1002 case PAGE_KEEP: 1002 case PAGE_KEEP:
1003 goto keep_locked; 1003 goto keep_locked;
1004 case PAGE_ACTIVATE: 1004 case PAGE_ACTIVATE:
1005 goto activate_locked; 1005 goto activate_locked;
1006 case PAGE_SUCCESS: 1006 case PAGE_SUCCESS:
1007 if (PageWriteback(page)) 1007 if (PageWriteback(page))
1008 goto keep; 1008 goto keep;
1009 if (PageDirty(page)) 1009 if (PageDirty(page))
1010 goto keep; 1010 goto keep;
1011 1011
1012 /* 1012 /*
1013 * A synchronous write - probably a ramdisk. Go 1013 * A synchronous write - probably a ramdisk. Go
1014 * ahead and try to reclaim the page. 1014 * ahead and try to reclaim the page.
1015 */ 1015 */
1016 if (!trylock_page(page)) 1016 if (!trylock_page(page))
1017 goto keep; 1017 goto keep;
1018 if (PageDirty(page) || PageWriteback(page)) 1018 if (PageDirty(page) || PageWriteback(page))
1019 goto keep_locked; 1019 goto keep_locked;
1020 mapping = page_mapping(page); 1020 mapping = page_mapping(page);
1021 case PAGE_CLEAN: 1021 case PAGE_CLEAN:
1022 ; /* try to free the page below */ 1022 ; /* try to free the page below */
1023 } 1023 }
1024 } 1024 }
1025 1025
1026 /* 1026 /*
1027 * If the page has buffers, try to free the buffer mappings 1027 * If the page has buffers, try to free the buffer mappings
1028 * associated with this page. If we succeed we try to free 1028 * associated with this page. If we succeed we try to free
1029 * the page as well. 1029 * the page as well.
1030 * 1030 *
1031 * We do this even if the page is PageDirty(). 1031 * We do this even if the page is PageDirty().
1032 * try_to_release_page() does not perform I/O, but it is 1032 * try_to_release_page() does not perform I/O, but it is
1033 * possible for a page to have PageDirty set, but it is actually 1033 * possible for a page to have PageDirty set, but it is actually
1034 * clean (all its buffers are clean). This happens if the 1034 * clean (all its buffers are clean). This happens if the
1035 * buffers were written out directly, with submit_bh(). ext3 1035 * buffers were written out directly, with submit_bh(). ext3
1036 * will do this, as well as the blockdev mapping. 1036 * will do this, as well as the blockdev mapping.
1037 * try_to_release_page() will discover that cleanness and will 1037 * try_to_release_page() will discover that cleanness and will
1038 * drop the buffers and mark the page clean - it can be freed. 1038 * drop the buffers and mark the page clean - it can be freed.
1039 * 1039 *
1040 * Rarely, pages can have buffers and no ->mapping. These are 1040 * Rarely, pages can have buffers and no ->mapping. These are
1041 * the pages which were not successfully invalidated in 1041 * the pages which were not successfully invalidated in
1042 * truncate_complete_page(). We try to drop those buffers here 1042 * truncate_complete_page(). We try to drop those buffers here
1043 * and if that worked, and the page is no longer mapped into 1043 * and if that worked, and the page is no longer mapped into
1044 * process address space (page_count == 1) it can be freed. 1044 * process address space (page_count == 1) it can be freed.
1045 * Otherwise, leave the page on the LRU so it is swappable. 1045 * Otherwise, leave the page on the LRU so it is swappable.
1046 */ 1046 */
1047 if (page_has_private(page)) { 1047 if (page_has_private(page)) {
1048 if (!try_to_release_page(page, sc->gfp_mask)) 1048 if (!try_to_release_page(page, sc->gfp_mask))
1049 goto activate_locked; 1049 goto activate_locked;
1050 if (!mapping && page_count(page) == 1) { 1050 if (!mapping && page_count(page) == 1) {
1051 unlock_page(page); 1051 unlock_page(page);
1052 if (put_page_testzero(page)) 1052 if (put_page_testzero(page))
1053 goto free_it; 1053 goto free_it;
1054 else { 1054 else {
1055 /* 1055 /*
1056 * rare race with speculative reference. 1056 * rare race with speculative reference.
1057 * the speculative reference will free 1057 * the speculative reference will free
1058 * this page shortly, so we may 1058 * this page shortly, so we may
1059 * increment nr_reclaimed here (and 1059 * increment nr_reclaimed here (and
1060 * leave it off the LRU). 1060 * leave it off the LRU).
1061 */ 1061 */
1062 nr_reclaimed++; 1062 nr_reclaimed++;
1063 continue; 1063 continue;
1064 } 1064 }
1065 } 1065 }
1066 } 1066 }
1067 1067
1068 if (!mapping || !__remove_mapping(mapping, page)) 1068 if (!mapping || !__remove_mapping(mapping, page))
1069 goto keep_locked; 1069 goto keep_locked;
1070 1070
1071 /* 1071 /*
1072 * At this point, we have no other references and there is 1072 * At this point, we have no other references and there is
1073 * no way to pick any more up (removed from LRU, removed 1073 * no way to pick any more up (removed from LRU, removed
1074 * from pagecache). Can use non-atomic bitops now (and 1074 * from pagecache). Can use non-atomic bitops now (and
1075 * we obviously don't have to worry about waking up a process 1075 * we obviously don't have to worry about waking up a process
1076 * waiting on the page lock, because there are no references. 1076 * waiting on the page lock, because there are no references.
1077 */ 1077 */
1078 __clear_page_locked(page); 1078 __clear_page_locked(page);
1079 free_it: 1079 free_it:
1080 nr_reclaimed++; 1080 nr_reclaimed++;
1081 1081
1082 /* 1082 /*
1083 * Is there need to periodically free_page_list? It would 1083 * Is there need to periodically free_page_list? It would
1084 * appear not, as the counts should be low 1084 * appear not, as the counts should be low
1085 */ 1085 */
1086 list_add(&page->lru, &free_pages); 1086 list_add(&page->lru, &free_pages);
1087 continue; 1087 continue;
1088 1088
1089 cull_mlocked: 1089 cull_mlocked:
1090 if (PageSwapCache(page)) 1090 if (PageSwapCache(page))
1091 try_to_free_swap(page); 1091 try_to_free_swap(page);
1092 unlock_page(page); 1092 unlock_page(page);
1093 putback_lru_page(page); 1093 putback_lru_page(page);
1094 continue; 1094 continue;
1095 1095
1096 activate_locked: 1096 activate_locked:
1097 /* Not a candidate for swapping, so reclaim swap space. */ 1097 /* Not a candidate for swapping, so reclaim swap space. */
1098 if (PageSwapCache(page) && vm_swap_full()) 1098 if (PageSwapCache(page) && vm_swap_full())
1099 try_to_free_swap(page); 1099 try_to_free_swap(page);
1100 VM_BUG_ON(PageActive(page)); 1100 VM_BUG_ON(PageActive(page));
1101 SetPageActive(page); 1101 SetPageActive(page);
1102 pgactivate++; 1102 pgactivate++;
1103 keep_locked: 1103 keep_locked:
1104 unlock_page(page); 1104 unlock_page(page);
1105 keep: 1105 keep:
1106 list_add(&page->lru, &ret_pages); 1106 list_add(&page->lru, &ret_pages);
1107 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 1107 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1108 } 1108 }
1109 1109
1110 free_hot_cold_page_list(&free_pages, true); 1110 free_hot_cold_page_list(&free_pages, true);
1111 1111
1112 list_splice(&ret_pages, page_list); 1112 list_splice(&ret_pages, page_list);
1113 count_vm_events(PGACTIVATE, pgactivate); 1113 count_vm_events(PGACTIVATE, pgactivate);
1114 mem_cgroup_uncharge_end(); 1114 mem_cgroup_uncharge_end();
1115 *ret_nr_dirty += nr_dirty; 1115 *ret_nr_dirty += nr_dirty;
1116 *ret_nr_congested += nr_congested; 1116 *ret_nr_congested += nr_congested;
1117 *ret_nr_unqueued_dirty += nr_unqueued_dirty; 1117 *ret_nr_unqueued_dirty += nr_unqueued_dirty;
1118 *ret_nr_writeback += nr_writeback; 1118 *ret_nr_writeback += nr_writeback;
1119 *ret_nr_immediate += nr_immediate; 1119 *ret_nr_immediate += nr_immediate;
1120 return nr_reclaimed; 1120 return nr_reclaimed;
1121 } 1121 }
1122 1122
1123 unsigned long reclaim_clean_pages_from_list(struct zone *zone, 1123 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1124 struct list_head *page_list) 1124 struct list_head *page_list)
1125 { 1125 {
1126 struct scan_control sc = { 1126 struct scan_control sc = {
1127 .gfp_mask = GFP_KERNEL, 1127 .gfp_mask = GFP_KERNEL,
1128 .priority = DEF_PRIORITY, 1128 .priority = DEF_PRIORITY,
1129 .may_unmap = 1, 1129 .may_unmap = 1,
1130 }; 1130 };
1131 unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; 1131 unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
1132 struct page *page, *next; 1132 struct page *page, *next;
1133 LIST_HEAD(clean_pages); 1133 LIST_HEAD(clean_pages);
1134 1134
1135 list_for_each_entry_safe(page, next, page_list, lru) { 1135 list_for_each_entry_safe(page, next, page_list, lru) {
1136 if (page_is_file_cache(page) && !PageDirty(page) && 1136 if (page_is_file_cache(page) && !PageDirty(page) &&
1137 !isolated_balloon_page(page)) { 1137 !isolated_balloon_page(page)) {
1138 ClearPageActive(page); 1138 ClearPageActive(page);
1139 list_move(&page->lru, &clean_pages); 1139 list_move(&page->lru, &clean_pages);
1140 } 1140 }
1141 } 1141 }
1142 1142
1143 ret = shrink_page_list(&clean_pages, zone, &sc, 1143 ret = shrink_page_list(&clean_pages, zone, &sc,
1144 TTU_UNMAP|TTU_IGNORE_ACCESS, 1144 TTU_UNMAP|TTU_IGNORE_ACCESS,
1145 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); 1145 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
1146 list_splice(&clean_pages, page_list); 1146 list_splice(&clean_pages, page_list);
1147 mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); 1147 mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
1148 return ret; 1148 return ret;
1149 } 1149 }
1150 1150
1151 /* 1151 /*
1152 * Attempt to remove the specified page from its LRU. Only take this page 1152 * Attempt to remove the specified page from its LRU. Only take this page
1153 * if it is of the appropriate PageActive status. Pages which are being 1153 * if it is of the appropriate PageActive status. Pages which are being
1154 * freed elsewhere are also ignored. 1154 * freed elsewhere are also ignored.
1155 * 1155 *
1156 * page: page to consider 1156 * page: page to consider
1157 * mode: one of the LRU isolation modes defined above 1157 * mode: one of the LRU isolation modes defined above
1158 * 1158 *
1159 * returns 0 on success, -ve errno on failure. 1159 * returns 0 on success, -ve errno on failure.
1160 */ 1160 */
1161 int __isolate_lru_page(struct page *page, isolate_mode_t mode) 1161 int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1162 { 1162 {
1163 int ret = -EINVAL; 1163 int ret = -EINVAL;
1164 1164
1165 /* Only take pages on the LRU. */ 1165 /* Only take pages on the LRU. */
1166 if (!PageLRU(page)) 1166 if (!PageLRU(page))
1167 return ret; 1167 return ret;
1168 1168
1169 /* Compaction should not handle unevictable pages but CMA can do so */ 1169 /* Compaction should not handle unevictable pages but CMA can do so */
1170 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) 1170 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1171 return ret; 1171 return ret;
1172 1172
1173 ret = -EBUSY; 1173 ret = -EBUSY;
1174 1174
1175 /* 1175 /*
1176 * To minimise LRU disruption, the caller can indicate that it only 1176 * To minimise LRU disruption, the caller can indicate that it only
1177 * wants to isolate pages it will be able to operate on without 1177 * wants to isolate pages it will be able to operate on without
1178 * blocking - clean pages for the most part. 1178 * blocking - clean pages for the most part.
1179 * 1179 *
1180 * ISOLATE_CLEAN means that only clean pages should be isolated. This 1180 * ISOLATE_CLEAN means that only clean pages should be isolated. This
1181 * is used by reclaim when it cannot write to backing storage 1181 * is used by reclaim when it cannot write to backing storage
1182 * 1182 *
1183 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages 1183 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
1184 * that it is possible to migrate without blocking 1184 * that it is possible to migrate without blocking
1185 */ 1185 */
1186 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { 1186 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1187 /* All the caller can do on PageWriteback is block */ 1187 /* All the caller can do on PageWriteback is block */
1188 if (PageWriteback(page)) 1188 if (PageWriteback(page))
1189 return ret; 1189 return ret;
1190 1190
1191 if (PageDirty(page)) { 1191 if (PageDirty(page)) {
1192 struct address_space *mapping; 1192 struct address_space *mapping;
1193 1193
1194 /* ISOLATE_CLEAN means only clean pages */ 1194 /* ISOLATE_CLEAN means only clean pages */
1195 if (mode & ISOLATE_CLEAN) 1195 if (mode & ISOLATE_CLEAN)
1196 return ret; 1196 return ret;
1197 1197
1198 /* 1198 /*
1199 * Only pages without mappings or that have a 1199 * Only pages without mappings or that have a
1200 * ->migratepage callback are possible to migrate 1200 * ->migratepage callback are possible to migrate
1201 * without blocking 1201 * without blocking
1202 */ 1202 */
1203 mapping = page_mapping(page); 1203 mapping = page_mapping(page);
1204 if (mapping && !mapping->a_ops->migratepage) 1204 if (mapping && !mapping->a_ops->migratepage)
1205 return ret; 1205 return ret;
1206 } 1206 }
1207 } 1207 }
1208 1208
1209 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) 1209 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1210 return ret; 1210 return ret;
1211 1211
1212 if (likely(get_page_unless_zero(page))) { 1212 if (likely(get_page_unless_zero(page))) {
1213 /* 1213 /*
1214 * Be careful not to clear PageLRU until after we're 1214 * Be careful not to clear PageLRU until after we're
1215 * sure the page is not being freed elsewhere -- the 1215 * sure the page is not being freed elsewhere -- the
1216 * page release code relies on it. 1216 * page release code relies on it.
1217 */ 1217 */
1218 ClearPageLRU(page); 1218 ClearPageLRU(page);
1219 ret = 0; 1219 ret = 0;
1220 } 1220 }
1221 1221
1222 return ret; 1222 return ret;
1223 } 1223 }
1224 1224
1225 /* 1225 /*
1226 * zone->lru_lock is heavily contended. Some of the functions that 1226 * zone->lru_lock is heavily contended. Some of the functions that
1227 * shrink the lists perform better by taking out a batch of pages 1227 * shrink the lists perform better by taking out a batch of pages
1228 * and working on them outside the LRU lock. 1228 * and working on them outside the LRU lock.
1229 * 1229 *
1230 * For pagecache intensive workloads, this function is the hottest 1230 * For pagecache intensive workloads, this function is the hottest
1231 * spot in the kernel (apart from copy_*_user functions). 1231 * spot in the kernel (apart from copy_*_user functions).
1232 * 1232 *
1233 * Appropriate locks must be held before calling this function. 1233 * Appropriate locks must be held before calling this function.
1234 * 1234 *
1235 * @nr_to_scan: The number of pages to look through on the list. 1235 * @nr_to_scan: The number of pages to look through on the list.
1236 * @lruvec: The LRU vector to pull pages from. 1236 * @lruvec: The LRU vector to pull pages from.
1237 * @dst: The temp list to put pages on to. 1237 * @dst: The temp list to put pages on to.
1238 * @nr_scanned: The number of pages that were scanned. 1238 * @nr_scanned: The number of pages that were scanned.
1239 * @sc: The scan_control struct for this reclaim session 1239 * @sc: The scan_control struct for this reclaim session
1240 * @mode: One of the LRU isolation modes 1240 * @mode: One of the LRU isolation modes
1241 * @lru: LRU list id for isolating 1241 * @lru: LRU list id for isolating
1242 * 1242 *
1243 * returns how many pages were moved onto *@dst. 1243 * returns how many pages were moved onto *@dst.
1244 */ 1244 */
1245 static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1245 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1246 struct lruvec *lruvec, struct list_head *dst, 1246 struct lruvec *lruvec, struct list_head *dst,
1247 unsigned long *nr_scanned, struct scan_control *sc, 1247 unsigned long *nr_scanned, struct scan_control *sc,
1248 isolate_mode_t mode, enum lru_list lru) 1248 isolate_mode_t mode, enum lru_list lru)
1249 { 1249 {
1250 struct list_head *src = &lruvec->lists[lru]; 1250 struct list_head *src = &lruvec->lists[lru];
1251 unsigned long nr_taken = 0; 1251 unsigned long nr_taken = 0;
1252 unsigned long scan; 1252 unsigned long scan;
1253 1253
1254 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1254 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1255 struct page *page; 1255 struct page *page;
1256 int nr_pages; 1256 int nr_pages;
1257 1257
1258 page = lru_to_page(src); 1258 page = lru_to_page(src);
1259 prefetchw_prev_lru_page(page, src, flags); 1259 prefetchw_prev_lru_page(page, src, flags);
1260 1260
1261 VM_BUG_ON(!PageLRU(page)); 1261 VM_BUG_ON(!PageLRU(page));
1262 1262
1263 switch (__isolate_lru_page(page, mode)) { 1263 switch (__isolate_lru_page(page, mode)) {
1264 case 0: 1264 case 0:
1265 nr_pages = hpage_nr_pages(page); 1265 nr_pages = hpage_nr_pages(page);
1266 mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); 1266 mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
1267 list_move(&page->lru, dst); 1267 list_move(&page->lru, dst);
1268 nr_taken += nr_pages; 1268 nr_taken += nr_pages;
1269 break; 1269 break;
1270 1270
1271 case -EBUSY: 1271 case -EBUSY:
1272 /* else it is being freed elsewhere */ 1272 /* else it is being freed elsewhere */
1273 list_move(&page->lru, src); 1273 list_move(&page->lru, src);
1274 continue; 1274 continue;
1275 1275
1276 default: 1276 default:
1277 BUG(); 1277 BUG();
1278 } 1278 }
1279 } 1279 }
1280 1280
1281 *nr_scanned = scan; 1281 *nr_scanned = scan;
1282 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, 1282 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
1283 nr_taken, mode, is_file_lru(lru)); 1283 nr_taken, mode, is_file_lru(lru));
1284 return nr_taken; 1284 return nr_taken;
1285 } 1285 }
1286 1286
1287 /** 1287 /**
1288 * isolate_lru_page - tries to isolate a page from its LRU list 1288 * isolate_lru_page - tries to isolate a page from its LRU list
1289 * @page: page to isolate from its LRU list 1289 * @page: page to isolate from its LRU list
1290 * 1290 *
1291 * Isolates a @page from an LRU list, clears PageLRU and adjusts the 1291 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
1292 * vmstat statistic corresponding to whatever LRU list the page was on. 1292 * vmstat statistic corresponding to whatever LRU list the page was on.
1293 * 1293 *
1294 * Returns 0 if the page was removed from an LRU list. 1294 * Returns 0 if the page was removed from an LRU list.
1295 * Returns -EBUSY if the page was not on an LRU list. 1295 * Returns -EBUSY if the page was not on an LRU list.
1296 * 1296 *
1297 * The returned page will have PageLRU() cleared. If it was found on 1297 * The returned page will have PageLRU() cleared. If it was found on
1298 * the active list, it will have PageActive set. If it was found on 1298 * the active list, it will have PageActive set. If it was found on
1299 * the unevictable list, it will have the PageUnevictable bit set. That flag 1299 * the unevictable list, it will have the PageUnevictable bit set. That flag
1300 * may need to be cleared by the caller before letting the page go. 1300 * may need to be cleared by the caller before letting the page go.
1301 * 1301 *
1302 * The vmstat statistic corresponding to the list on which the page was 1302 * The vmstat statistic corresponding to the list on which the page was
1303 * found will be decremented. 1303 * found will be decremented.
1304 * 1304 *
1305 * Restrictions: 1305 * Restrictions:
1306 * (1) Must be called with an elevated refcount on the page. This is a 1306 * (1) Must be called with an elevated refcount on the page. This is a
1307 * fundamental difference from isolate_lru_pages (which is called 1307 * fundamental difference from isolate_lru_pages (which is called
1308 * without a stable reference). 1308 * without a stable reference).
1309 * (2) the lru_lock must not be held. 1309 * (2) the lru_lock must not be held.
1310 * (3) interrupts must be enabled. 1310 * (3) interrupts must be enabled.
1311 */ 1311 */
1312 int isolate_lru_page(struct page *page) 1312 int isolate_lru_page(struct page *page)
1313 { 1313 {
1314 int ret = -EBUSY; 1314 int ret = -EBUSY;
1315 1315
1316 VM_BUG_ON(!page_count(page)); 1316 VM_BUG_ON(!page_count(page));
1317 1317
1318 if (PageLRU(page)) { 1318 if (PageLRU(page)) {
1319 struct zone *zone = page_zone(page); 1319 struct zone *zone = page_zone(page);
1320 struct lruvec *lruvec; 1320 struct lruvec *lruvec;
1321 1321
1322 spin_lock_irq(&zone->lru_lock); 1322 spin_lock_irq(&zone->lru_lock);
1323 lruvec = mem_cgroup_page_lruvec(page, zone); 1323 lruvec = mem_cgroup_page_lruvec(page, zone);
1324 if (PageLRU(page)) { 1324 if (PageLRU(page)) {
1325 int lru = page_lru(page); 1325 int lru = page_lru(page);
1326 get_page(page); 1326 get_page(page);
1327 ClearPageLRU(page); 1327 ClearPageLRU(page);
1328 del_page_from_lru_list(page, lruvec, lru); 1328 del_page_from_lru_list(page, lruvec, lru);
1329 ret = 0; 1329 ret = 0;
1330 } 1330 }
1331 spin_unlock_irq(&zone->lru_lock); 1331 spin_unlock_irq(&zone->lru_lock);
1332 } 1332 }
1333 return ret; 1333 return ret;
1334 } 1334 }
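
As a rough illustration of the restrictions listed above (not part of this file or this patch), a typical caller pattern looks like the sketch below: the caller already holds its own reference, isolates the page onto a private list, and unconditionally drops that reference afterwards. The helper name example_isolate and the choice of statistics counter are illustrative assumptions, not taken from this commit.

	/*
	 * Illustrative sketch only. Assumes the caller obtained a reference
	 * (restriction 1), does not hold zone->lru_lock (restriction 2) and
	 * has interrupts enabled (restriction 3).
	 */
	static void example_isolate(struct page *page, struct list_head *pagelist)
	{
		if (isolate_lru_page(page) == 0) {
			/* page is off its LRU; isolation took its own reference */
			list_add_tail(&page->lru, pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
		/* drop the reference the caller took before calling */
		put_page(page);
	}
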
1335 1335
1336 /* 1336 /*
1337 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and 1337 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1338 * then get rescheduled. When there is a massive number of tasks doing page 1338 * then get rescheduled. When there is a massive number of tasks doing page
1339 * allocation, such sleeping direct reclaimers may keep piling up on each CPU, 1339 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1340 * the LRU list will go small and be scanned faster than necessary, leading to 1340 * the LRU list will go small and be scanned faster than necessary, leading to
1341 * unnecessary swapping, thrashing and OOM. 1341 * unnecessary swapping, thrashing and OOM.
1342 */ 1342 */
1343 static int too_many_isolated(struct zone *zone, int file, 1343 static int too_many_isolated(struct zone *zone, int file,
1344 struct scan_control *sc) 1344 struct scan_control *sc)
1345 { 1345 {
1346 unsigned long inactive, isolated; 1346 unsigned long inactive, isolated;
1347 1347
1348 if (current_is_kswapd()) 1348 if (current_is_kswapd())
1349 return 0; 1349 return 0;
1350 1350
1351 if (!global_reclaim(sc)) 1351 if (!global_reclaim(sc))
1352 return 0; 1352 return 0;
1353 1353
1354 if (file) { 1354 if (file) {
1355 inactive = zone_page_state(zone, NR_INACTIVE_FILE); 1355 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1356 isolated = zone_page_state(zone, NR_ISOLATED_FILE); 1356 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1357 } else { 1357 } else {
1358 inactive = zone_page_state(zone, NR_INACTIVE_ANON); 1358 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1359 isolated = zone_page_state(zone, NR_ISOLATED_ANON); 1359 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1360 } 1360 }
1361 1361
1362 /* 1362 /*
1363 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they 1363 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1364 * won't get blocked by normal direct-reclaimers, forming a circular 1364 * won't get blocked by normal direct-reclaimers, forming a circular
1365 * deadlock. 1365 * deadlock.
1366 */ 1366 */
1367 if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) 1367 if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
1368 inactive >>= 3; 1368 inactive >>= 3;
1369 1369
1370 return isolated > inactive; 1370 return isolated > inactive;
1371 } 1371 }
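
The heuristic above can be read on its own: direct reclaim backs off once more pages are isolated from an LRU than remain on its inactive list, and callers that can do both I/O and FS work are throttled against a threshold eight times smaller, so GFP_NOIO/GFP_NOFS reclaimers are never stuck behind them. A minimal standalone model of the check (plain C with made-up numbers, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	/* Simplified model of the check in too_many_isolated(). */
	static bool too_many_isolated_model(unsigned long inactive,
					    unsigned long isolated, bool gfp_io_fs)
	{
		/*
		 * Normal direct reclaimers (full GFP_IOFS) are measured against
		 * 1/8 of the inactive list; NOIO/NOFS callers keep the full
		 * threshold so they cannot deadlock behind them.
		 */
		if (gfp_io_fs)
			inactive >>= 3;
		return isolated > inactive;
	}

	int main(void)
	{
		printf("%d\n", too_many_isolated_model(10000, 1500, true));  /* 1: throttle */
		printf("%d\n", too_many_isolated_model(10000, 1500, false)); /* 0: proceed */
		return 0;
	}
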
1372 1372
1373 static noinline_for_stack void 1373 static noinline_for_stack void
1374 putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) 1374 putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1375 { 1375 {
1376 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1376 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1377 struct zone *zone = lruvec_zone(lruvec); 1377 struct zone *zone = lruvec_zone(lruvec);
1378 LIST_HEAD(pages_to_free); 1378 LIST_HEAD(pages_to_free);
1379 1379
1380 /* 1380 /*
1381 * Put back any unfreeable pages. 1381 * Put back any unfreeable pages.
1382 */ 1382 */
1383 while (!list_empty(page_list)) { 1383 while (!list_empty(page_list)) {
1384 struct page *page = lru_to_page(page_list); 1384 struct page *page = lru_to_page(page_list);
1385 int lru; 1385 int lru;
1386 1386
1387 VM_BUG_ON(PageLRU(page)); 1387 VM_BUG_ON(PageLRU(page));
1388 list_del(&page->lru); 1388 list_del(&page->lru);
1389 if (unlikely(!page_evictable(page))) { 1389 if (unlikely(!page_evictable(page))) {
1390 spin_unlock_irq(&zone->lru_lock); 1390 spin_unlock_irq(&zone->lru_lock);
1391 putback_lru_page(page); 1391 putback_lru_page(page);
1392 spin_lock_irq(&zone->lru_lock); 1392 spin_lock_irq(&zone->lru_lock);
1393 continue; 1393 continue;
1394 } 1394 }
1395 1395
1396 lruvec = mem_cgroup_page_lruvec(page, zone); 1396 lruvec = mem_cgroup_page_lruvec(page, zone);
1397 1397
1398 SetPageLRU(page); 1398 SetPageLRU(page);
1399 lru = page_lru(page); 1399 lru = page_lru(page);
1400 add_page_to_lru_list(page, lruvec, lru); 1400 add_page_to_lru_list(page, lruvec, lru);
1401 1401
1402 if (is_active_lru(lru)) { 1402 if (is_active_lru(lru)) {
1403 int file = is_file_lru(lru); 1403 int file = is_file_lru(lru);
1404 int numpages = hpage_nr_pages(page); 1404 int numpages = hpage_nr_pages(page);
1405 reclaim_stat->recent_rotated[file] += numpages; 1405 reclaim_stat->recent_rotated[file] += numpages;
1406 } 1406 }
1407 if (put_page_testzero(page)) { 1407 if (put_page_testzero(page)) {
1408 __ClearPageLRU(page); 1408 __ClearPageLRU(page);
1409 __ClearPageActive(page); 1409 __ClearPageActive(page);
1410 del_page_from_lru_list(page, lruvec, lru); 1410 del_page_from_lru_list(page, lruvec, lru);
1411 1411
1412 if (unlikely(PageCompound(page))) { 1412 if (unlikely(PageCompound(page))) {
1413 spin_unlock_irq(&zone->lru_lock); 1413 spin_unlock_irq(&zone->lru_lock);
1414 (*get_compound_page_dtor(page))(page); 1414 (*get_compound_page_dtor(page))(page);
1415 spin_lock_irq(&zone->lru_lock); 1415 spin_lock_irq(&zone->lru_lock);
1416 } else 1416 } else
1417 list_add(&page->lru, &pages_to_free); 1417 list_add(&page->lru, &pages_to_free);
1418 } 1418 }
1419 } 1419 }
1420 1420
1421 /* 1421 /*
1422 * To save our caller's stack, now use input list for pages to free. 1422 * To save our caller's stack, now use input list for pages to free.
1423 */ 1423 */
1424 list_splice(&pages_to_free, page_list); 1424 list_splice(&pages_to_free, page_list);
1425 } 1425 }
1426 1426
1427 /* 1427 /*
1428 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1428 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1429 * of reclaimed pages 1429 * of reclaimed pages
1430 */ 1430 */
1431 static noinline_for_stack unsigned long 1431 static noinline_for_stack unsigned long
1432 shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, 1432 shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1433 struct scan_control *sc, enum lru_list lru) 1433 struct scan_control *sc, enum lru_list lru)
1434 { 1434 {
1435 LIST_HEAD(page_list); 1435 LIST_HEAD(page_list);
1436 unsigned long nr_scanned; 1436 unsigned long nr_scanned;
1437 unsigned long nr_reclaimed = 0; 1437 unsigned long nr_reclaimed = 0;
1438 unsigned long nr_taken; 1438 unsigned long nr_taken;
1439 unsigned long nr_dirty = 0; 1439 unsigned long nr_dirty = 0;
1440 unsigned long nr_congested = 0; 1440 unsigned long nr_congested = 0;
1441 unsigned long nr_unqueued_dirty = 0; 1441 unsigned long nr_unqueued_dirty = 0;
1442 unsigned long nr_writeback = 0; 1442 unsigned long nr_writeback = 0;
1443 unsigned long nr_immediate = 0; 1443 unsigned long nr_immediate = 0;
1444 isolate_mode_t isolate_mode = 0; 1444 isolate_mode_t isolate_mode = 0;
1445 int file = is_file_lru(lru); 1445 int file = is_file_lru(lru);
1446 struct zone *zone = lruvec_zone(lruvec); 1446 struct zone *zone = lruvec_zone(lruvec);
1447 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1447 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1448 1448
1449 while (unlikely(too_many_isolated(zone, file, sc))) { 1449 while (unlikely(too_many_isolated(zone, file, sc))) {
1450 congestion_wait(BLK_RW_ASYNC, HZ/10); 1450 congestion_wait(BLK_RW_ASYNC, HZ/10);
1451 1451
1452 /* We are about to die and free our memory. Return now. */ 1452 /* We are about to die and free our memory. Return now. */
1453 if (fatal_signal_pending(current)) 1453 if (fatal_signal_pending(current))
1454 return SWAP_CLUSTER_MAX; 1454 return SWAP_CLUSTER_MAX;
1455 } 1455 }
1456 1456
1457 lru_add_drain(); 1457 lru_add_drain();
1458 1458
1459 if (!sc->may_unmap) 1459 if (!sc->may_unmap)
1460 isolate_mode |= ISOLATE_UNMAPPED; 1460 isolate_mode |= ISOLATE_UNMAPPED;
1461 if (!sc->may_writepage) 1461 if (!sc->may_writepage)
1462 isolate_mode |= ISOLATE_CLEAN; 1462 isolate_mode |= ISOLATE_CLEAN;
1463 1463
1464 spin_lock_irq(&zone->lru_lock); 1464 spin_lock_irq(&zone->lru_lock);
1465 1465
1466 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, 1466 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1467 &nr_scanned, sc, isolate_mode, lru); 1467 &nr_scanned, sc, isolate_mode, lru);
1468 1468
1469 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); 1469 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1470 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1470 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1471 1471
1472 if (global_reclaim(sc)) { 1472 if (global_reclaim(sc)) {
1473 zone->pages_scanned += nr_scanned; 1473 zone->pages_scanned += nr_scanned;
1474 if (current_is_kswapd()) 1474 if (current_is_kswapd())
1475 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); 1475 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1476 else 1476 else
1477 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); 1477 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
1478 } 1478 }
1479 spin_unlock_irq(&zone->lru_lock); 1479 spin_unlock_irq(&zone->lru_lock);
1480 1480
1481 if (nr_taken == 0) 1481 if (nr_taken == 0)
1482 return 0; 1482 return 0;
1483 1483
1484 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, 1484 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1485 &nr_dirty, &nr_unqueued_dirty, &nr_congested, 1485 &nr_dirty, &nr_unqueued_dirty, &nr_congested,
1486 &nr_writeback, &nr_immediate, 1486 &nr_writeback, &nr_immediate,
1487 false); 1487 false);
1488 1488
1489 spin_lock_irq(&zone->lru_lock); 1489 spin_lock_irq(&zone->lru_lock);
1490 1490
1491 reclaim_stat->recent_scanned[file] += nr_taken; 1491 reclaim_stat->recent_scanned[file] += nr_taken;
1492 1492
1493 if (global_reclaim(sc)) { 1493 if (global_reclaim(sc)) {
1494 if (current_is_kswapd()) 1494 if (current_is_kswapd())
1495 __count_zone_vm_events(PGSTEAL_KSWAPD, zone, 1495 __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
1496 nr_reclaimed); 1496 nr_reclaimed);
1497 else 1497 else
1498 __count_zone_vm_events(PGSTEAL_DIRECT, zone, 1498 __count_zone_vm_events(PGSTEAL_DIRECT, zone,
1499 nr_reclaimed); 1499 nr_reclaimed);
1500 } 1500 }
1501 1501
1502 putback_inactive_pages(lruvec, &page_list); 1502 putback_inactive_pages(lruvec, &page_list);
1503 1503
1504 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1504 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1505 1505
1506 spin_unlock_irq(&zone->lru_lock); 1506 spin_unlock_irq(&zone->lru_lock);
1507 1507
1508 free_hot_cold_page_list(&page_list, true); 1508 free_hot_cold_page_list(&page_list, true);
1509 1509
1510 /* 1510 /*
1511 * If reclaim is isolating dirty pages under writeback, it implies 1511 * If reclaim is isolating dirty pages under writeback, it implies
1512 * that the long-lived page allocation rate is exceeding the page 1512 * that the long-lived page allocation rate is exceeding the page
1513 * laundering rate. Either the global limits are not being effective 1513 * laundering rate. Either the global limits are not being effective
1514 * at throttling processes due to the page distribution throughout 1514 * at throttling processes due to the page distribution throughout
1515 * zones or there is heavy usage of a slow backing device. The 1515 * zones or there is heavy usage of a slow backing device. The
1516 * only option is to throttle from reclaim context which is not ideal 1516 * only option is to throttle from reclaim context which is not ideal
1517 * as there is no guarantee the dirtying process is throttled in the 1517 * as there is no guarantee the dirtying process is throttled in the
1518 * same way balance_dirty_pages() manages. 1518 * same way balance_dirty_pages() manages.
1519 * 1519 *
1520 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number 1520 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
1521 * of pages under writeback flagged for immediate reclaim and stall if any 1521 * of pages under writeback flagged for immediate reclaim and stall if any
1522 * are encountered in the nr_immediate check below. 1522 * are encountered in the nr_immediate check below.
1523 */ 1523 */
1524 if (nr_writeback && nr_writeback == nr_taken) 1524 if (nr_writeback && nr_writeback == nr_taken)
1525 zone_set_flag(zone, ZONE_WRITEBACK); 1525 zone_set_flag(zone, ZONE_WRITEBACK);
1526 1526
1527 /* 1527 /*
1528 * memcg will stall in page writeback so only consider forcibly 1528 * memcg will stall in page writeback so only consider forcibly
1529 * stalling for global reclaim 1529 * stalling for global reclaim
1530 */ 1530 */
1531 if (global_reclaim(sc)) { 1531 if (global_reclaim(sc)) {
1532 /* 1532 /*
1533 * Tag a zone as congested if all the dirty pages scanned were 1533 * Tag a zone as congested if all the dirty pages scanned were
1534 * backed by a congested BDI and wait_iff_congested will stall. 1534 * backed by a congested BDI and wait_iff_congested will stall.
1535 */ 1535 */
1536 if (nr_dirty && nr_dirty == nr_congested) 1536 if (nr_dirty && nr_dirty == nr_congested)
1537 zone_set_flag(zone, ZONE_CONGESTED); 1537 zone_set_flag(zone, ZONE_CONGESTED);
1538 1538
1539 /* 1539 /*
1540 * If dirty pages are scanned that are not queued for IO, it 1540 * If dirty pages are scanned that are not queued for IO, it
1541 * implies that flushers are not keeping up. In this case, flag 1541 * implies that flushers are not keeping up. In this case, flag
1542 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing 1542 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
1543 * pages from reclaim context. 1543 * pages from reclaim context.
1544 */ 1544 */
1545 if (nr_unqueued_dirty == nr_taken) 1545 if (nr_unqueued_dirty == nr_taken)
1546 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); 1546 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
1547 1547
1548 /* 1548 /*
1549 * If kswapd scans pages marked for immediate 1549 * If kswapd scans pages marked for immediate
1550 * reclaim and under writeback (nr_immediate), it implies 1550 * reclaim and under writeback (nr_immediate), it implies
1551 * that pages are cycling through the LRU faster than 1551 * that pages are cycling through the LRU faster than
1552 * they are written so also forcibly stall. 1552 * they are written so also forcibly stall.
1553 */ 1553 */
1554 if (nr_immediate) 1554 if (nr_immediate)
1555 congestion_wait(BLK_RW_ASYNC, HZ/10); 1555 congestion_wait(BLK_RW_ASYNC, HZ/10);
1556 } 1556 }
1557 1557
1558 /* 1558 /*
1559 * Stall direct reclaim for IO completions if underlying BDIs or zone 1559 * Stall direct reclaim for IO completions if underlying BDIs or zone
1560 * is congested. Allow kswapd to continue until it starts encountering 1560 * is congested. Allow kswapd to continue until it starts encountering
1561 * unqueued dirty pages or cycling through the LRU too quickly. 1561 * unqueued dirty pages or cycling through the LRU too quickly.
1562 */ 1562 */
1563 if (!sc->hibernation_mode && !current_is_kswapd()) 1563 if (!sc->hibernation_mode && !current_is_kswapd())
1564 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1564 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1565 1565
1566 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1566 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1567 zone_idx(zone), 1567 zone_idx(zone),
1568 nr_scanned, nr_reclaimed, 1568 nr_scanned, nr_reclaimed,
1569 sc->priority, 1569 sc->priority,
1570 trace_shrink_flags(file)); 1570 trace_shrink_flags(file));
1571 return nr_reclaimed; 1571 return nr_reclaimed;
1572 } 1572 }
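
The flag and stall decisions at the end of shrink_inactive_list() reduce to a small decision table over the counters filled in by shrink_page_list(). A standalone sketch of that mapping (plain C, not kernel code; hibernation_mode is ignored for brevity and the report() helper is an illustrative assumption):

	#include <stdbool.h>
	#include <stdio.h>

	struct shrink_stats {
		unsigned long nr_taken, nr_dirty, nr_congested;
		unsigned long nr_unqueued_dirty, nr_writeback, nr_immediate;
	};

	/* Mirrors the zone-flag/stall logic that follows shrink_page_list(). */
	static void report(const struct shrink_stats *s, bool global_reclaim,
			   bool is_kswapd)
	{
		if (s->nr_writeback && s->nr_writeback == s->nr_taken)
			puts("flag ZONE_WRITEBACK");
		if (global_reclaim) {
			if (s->nr_dirty && s->nr_dirty == s->nr_congested)
				puts("flag ZONE_CONGESTED");
			if (s->nr_unqueued_dirty == s->nr_taken)
				puts("flag ZONE_TAIL_LRU_DIRTY");
			if (s->nr_immediate)
				puts("stall in congestion_wait()");
		}
		if (!is_kswapd)
			puts("direct reclaim: wait_iff_congested() may stall");
	}

	int main(void)
	{
		struct shrink_stats s = { .nr_taken = 32, .nr_writeback = 32 };
		report(&s, true, false);
		return 0;
	}
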
1573 1573
1574 /* 1574 /*
1575 * This moves pages from the active list to the inactive list. 1575 * This moves pages from the active list to the inactive list.
1576 * 1576 *
1577 * We move them the other way if the page is referenced by one or more 1577 * We move them the other way if the page is referenced by one or more
1578 * processes, from rmap. 1578 * processes, from rmap.
1579 * 1579 *
1580 * If the pages are mostly unmapped, the processing is fast and it is 1580 * If the pages are mostly unmapped, the processing is fast and it is
1581 * appropriate to hold zone->lru_lock across the whole operation. But if 1581 * appropriate to hold zone->lru_lock across the whole operation. But if
1582 * the pages are mapped, the processing is slow (page_referenced()) so we 1582 * the pages are mapped, the processing is slow (page_referenced()) so we
1583 * should drop zone->lru_lock around each page. It's impossible to balance 1583 * should drop zone->lru_lock around each page. It's impossible to balance
1584 * this, so instead we remove the pages from the LRU while processing them. 1584 * this, so instead we remove the pages from the LRU while processing them.
1585 * It is safe to rely on PG_active against the non-LRU pages in here because 1585 * It is safe to rely on PG_active against the non-LRU pages in here because
1586 * nobody will play with that bit on a non-LRU page. 1586 * nobody will play with that bit on a non-LRU page.
1587 * 1587 *
1588 * The downside is that we have to touch page->_count against each page. 1588 * The downside is that we have to touch page->_count against each page.
1589 * But we had to alter page->flags anyway. 1589 * But we had to alter page->flags anyway.
1590 */ 1590 */
1591 1591
1592 static void move_active_pages_to_lru(struct lruvec *lruvec, 1592 static void move_active_pages_to_lru(struct lruvec *lruvec,
1593 struct list_head *list, 1593 struct list_head *list,
1594 struct list_head *pages_to_free, 1594 struct list_head *pages_to_free,
1595 enum lru_list lru) 1595 enum lru_list lru)
1596 { 1596 {
1597 struct zone *zone = lruvec_zone(lruvec); 1597 struct zone *zone = lruvec_zone(lruvec);
1598 unsigned long pgmoved = 0; 1598 unsigned long pgmoved = 0;
1599 struct page *page; 1599 struct page *page;
1600 int nr_pages; 1600 int nr_pages;
1601 1601
1602 while (!list_empty(list)) { 1602 while (!list_empty(list)) {
1603 page = lru_to_page(list); 1603 page = lru_to_page(list);
1604 lruvec = mem_cgroup_page_lruvec(page, zone); 1604 lruvec = mem_cgroup_page_lruvec(page, zone);
1605 1605
1606 VM_BUG_ON(PageLRU(page)); 1606 VM_BUG_ON(PageLRU(page));
1607 SetPageLRU(page); 1607 SetPageLRU(page);
1608 1608
1609 nr_pages = hpage_nr_pages(page); 1609 nr_pages = hpage_nr_pages(page);
1610 mem_cgroup_update_lru_size(lruvec, lru, nr_pages); 1610 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
1611 list_move(&page->lru, &lruvec->lists[lru]); 1611 list_move(&page->lru, &lruvec->lists[lru]);
1612 pgmoved += nr_pages; 1612 pgmoved += nr_pages;
1613 1613
1614 if (put_page_testzero(page)) { 1614 if (put_page_testzero(page)) {
1615 __ClearPageLRU(page); 1615 __ClearPageLRU(page);
1616 __ClearPageActive(page); 1616 __ClearPageActive(page);
1617 del_page_from_lru_list(page, lruvec, lru); 1617 del_page_from_lru_list(page, lruvec, lru);
1618 1618
1619 if (unlikely(PageCompound(page))) { 1619 if (unlikely(PageCompound(page))) {
1620 spin_unlock_irq(&zone->lru_lock); 1620 spin_unlock_irq(&zone->lru_lock);
1621 (*get_compound_page_dtor(page))(page); 1621 (*get_compound_page_dtor(page))(page);
1622 spin_lock_irq(&zone->lru_lock); 1622 spin_lock_irq(&zone->lru_lock);
1623 } else 1623 } else
1624 list_add(&page->lru, pages_to_free); 1624 list_add(&page->lru, pages_to_free);
1625 } 1625 }
1626 } 1626 }
1627 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1627 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1628 if (!is_active_lru(lru)) 1628 if (!is_active_lru(lru))
1629 __count_vm_events(PGDEACTIVATE, pgmoved); 1629 __count_vm_events(PGDEACTIVATE, pgmoved);
1630 } 1630 }
1631 1631
1632 static void shrink_active_list(unsigned long nr_to_scan, 1632 static void shrink_active_list(unsigned long nr_to_scan,
1633 struct lruvec *lruvec, 1633 struct lruvec *lruvec,
1634 struct scan_control *sc, 1634 struct scan_control *sc,
1635 enum lru_list lru) 1635 enum lru_list lru)
1636 { 1636 {
1637 unsigned long nr_taken; 1637 unsigned long nr_taken;
1638 unsigned long nr_scanned; 1638 unsigned long nr_scanned;
1639 unsigned long vm_flags; 1639 unsigned long vm_flags;
1640 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1640 LIST_HEAD(l_hold); /* The pages which were snipped off */
1641 LIST_HEAD(l_active); 1641 LIST_HEAD(l_active);
1642 LIST_HEAD(l_inactive); 1642 LIST_HEAD(l_inactive);
1643 struct page *page; 1643 struct page *page;
1644 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1644 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1645 unsigned long nr_rotated = 0; 1645 unsigned long nr_rotated = 0;
1646 isolate_mode_t isolate_mode = 0; 1646 isolate_mode_t isolate_mode = 0;
1647 int file = is_file_lru(lru); 1647 int file = is_file_lru(lru);
1648 struct zone *zone = lruvec_zone(lruvec); 1648 struct zone *zone = lruvec_zone(lruvec);
1649 1649
1650 lru_add_drain(); 1650 lru_add_drain();
1651 1651
1652 if (!sc->may_unmap) 1652 if (!sc->may_unmap)
1653 isolate_mode |= ISOLATE_UNMAPPED; 1653 isolate_mode |= ISOLATE_UNMAPPED;
1654 if (!sc->may_writepage) 1654 if (!sc->may_writepage)
1655 isolate_mode |= ISOLATE_CLEAN; 1655 isolate_mode |= ISOLATE_CLEAN;
1656 1656
1657 spin_lock_irq(&zone->lru_lock); 1657 spin_lock_irq(&zone->lru_lock);
1658 1658
1659 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, 1659 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1660 &nr_scanned, sc, isolate_mode, lru); 1660 &nr_scanned, sc, isolate_mode, lru);
1661 if (global_reclaim(sc)) 1661 if (global_reclaim(sc))
1662 zone->pages_scanned += nr_scanned; 1662 zone->pages_scanned += nr_scanned;
1663 1663
1664 reclaim_stat->recent_scanned[file] += nr_taken; 1664 reclaim_stat->recent_scanned[file] += nr_taken;
1665 1665
1666 __count_zone_vm_events(PGREFILL, zone, nr_scanned); 1666 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1667 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); 1667 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1668 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1668 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1669 spin_unlock_irq(&zone->lru_lock); 1669 spin_unlock_irq(&zone->lru_lock);
1670 1670
1671 while (!list_empty(&l_hold)) { 1671 while (!list_empty(&l_hold)) {
1672 cond_resched(); 1672 cond_resched();
1673 page = lru_to_page(&l_hold); 1673 page = lru_to_page(&l_hold);
1674 list_del(&page->lru); 1674 list_del(&page->lru);
1675 1675
1676 if (unlikely(!page_evictable(page))) { 1676 if (unlikely(!page_evictable(page))) {
1677 putback_lru_page(page); 1677 putback_lru_page(page);
1678 continue; 1678 continue;
1679 } 1679 }
1680 1680
1681 if (unlikely(buffer_heads_over_limit)) { 1681 if (unlikely(buffer_heads_over_limit)) {
1682 if (page_has_private(page) && trylock_page(page)) { 1682 if (page_has_private(page) && trylock_page(page)) {
1683 if (page_has_private(page)) 1683 if (page_has_private(page))
1684 try_to_release_page(page, 0); 1684 try_to_release_page(page, 0);
1685 unlock_page(page); 1685 unlock_page(page);
1686 } 1686 }
1687 } 1687 }
1688 1688
1689 if (page_referenced(page, 0, sc->target_mem_cgroup, 1689 if (page_referenced(page, 0, sc->target_mem_cgroup,
1690 &vm_flags)) { 1690 &vm_flags)) {
1691 nr_rotated += hpage_nr_pages(page); 1691 nr_rotated += hpage_nr_pages(page);
1692 /* 1692 /*
1693 * Identify referenced, file-backed active pages and 1693 * Identify referenced, file-backed active pages and
1694 * give them one more trip around the active list. So 1694 * give them one more trip around the active list. So
1695 * that executable code gets a better chance to stay in 1695 * that executable code gets a better chance to stay in
1696 * memory under moderate memory pressure. Anon pages 1696 * memory under moderate memory pressure. Anon pages
1697 * are not likely to be evicted by use-once streaming 1697 * are not likely to be evicted by use-once streaming
1698 * IO, plus JVM can create lots of anon VM_EXEC pages, 1698 * IO, plus JVM can create lots of anon VM_EXEC pages,
1699 * so we ignore them here. 1699 * so we ignore them here.
1700 */ 1700 */
1701 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { 1701 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1702 list_add(&page->lru, &l_active); 1702 list_add(&page->lru, &l_active);
1703 continue; 1703 continue;
1704 } 1704 }
1705 } 1705 }
1706 1706
1707 ClearPageActive(page); /* we are de-activating */ 1707 ClearPageActive(page); /* we are de-activating */
1708 list_add(&page->lru, &l_inactive); 1708 list_add(&page->lru, &l_inactive);
1709 } 1709 }
1710 1710
1711 /* 1711 /*
1712 * Move pages back to the lru list. 1712 * Move pages back to the lru list.
1713 */ 1713 */
1714 spin_lock_irq(&zone->lru_lock); 1714 spin_lock_irq(&zone->lru_lock);
1715 /* 1715 /*
1716 * Count referenced pages from currently used mappings as rotated, 1716 * Count referenced pages from currently used mappings as rotated,
1717 * even though only some of them are actually re-activated. This 1717 * even though only some of them are actually re-activated. This
1718 * helps balance scan pressure between file and anonymous pages in 1718 * helps balance scan pressure between file and anonymous pages in
1719 * get_scan_ratio. 1719 * get_scan_ratio.
1720 */ 1720 */
1721 reclaim_stat->recent_rotated[file] += nr_rotated; 1721 reclaim_stat->recent_rotated[file] += nr_rotated;
1722 1722
1723 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); 1723 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1724 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); 1724 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1725 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1725 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1726 spin_unlock_irq(&zone->lru_lock); 1726 spin_unlock_irq(&zone->lru_lock);
1727 1727
1728 free_hot_cold_page_list(&l_hold, true); 1728 free_hot_cold_page_list(&l_hold, true);
1729 } 1729 }
1730 1730
1731 #ifdef CONFIG_SWAP 1731 #ifdef CONFIG_SWAP
1732 static int inactive_anon_is_low_global(struct zone *zone) 1732 static int inactive_anon_is_low_global(struct zone *zone)
1733 { 1733 {
1734 unsigned long active, inactive; 1734 unsigned long active, inactive;
1735 1735
1736 active = zone_page_state(zone, NR_ACTIVE_ANON); 1736 active = zone_page_state(zone, NR_ACTIVE_ANON);
1737 inactive = zone_page_state(zone, NR_INACTIVE_ANON); 1737 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1738 1738
1739 if (inactive * zone->inactive_ratio < active) 1739 if (inactive * zone->inactive_ratio < active)
1740 return 1; 1740 return 1;
1741 1741
1742 return 0; 1742 return 0;
1743 } 1743 }
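
The check above fires when inactive * inactive_ratio < active, i.e. when less than roughly 1/(inactive_ratio + 1) of the anon pages sit on the inactive list. A tiny standalone illustration (the inactive_ratio value of 3 is a made-up example; in the kernel it is derived from zone size):

	#include <stdbool.h>
	#include <stdio.h>

	/* Simplified model of inactive_anon_is_low_global(). */
	static bool inactive_anon_is_low_model(unsigned long active,
					       unsigned long inactive,
					       unsigned int inactive_ratio)
	{
		return inactive * inactive_ratio < active;
	}

	int main(void)
	{
		printf("%d\n", inactive_anon_is_low_model(300, 90, 3));  /* 1: 270 < 300 */
		printf("%d\n", inactive_anon_is_low_model(300, 120, 3)); /* 0: 360 >= 300 */
		return 0;
	}
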
1744 1744
1745 /** 1745 /**
1746 * inactive_anon_is_low - check if anonymous pages need to be deactivated 1746 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1747 * @lruvec: LRU vector to check 1747 * @lruvec: LRU vector to check
1748 * 1748 *
1749 * Returns true if the zone does not have enough inactive anon pages, 1749 * Returns true if the zone does not have enough inactive anon pages,
1750 * meaning some active anon pages need to be deactivated. 1750 * meaning some active anon pages need to be deactivated.
1751 */ 1751 */
1752 static int inactive_anon_is_low(struct lruvec *lruvec) 1752 static int inactive_anon_is_low(struct lruvec *lruvec)
1753 { 1753 {
1754 /* 1754 /*
1755 * If we don't have swap space, anonymous page deactivation 1755 * If we don't have swap space, anonymous page deactivation
1756 * is pointless. 1756 * is pointless.
1757 */ 1757 */
1758 if (!total_swap_pages) 1758 if (!total_swap_pages)
1759 return 0; 1759 return 0;
1760 1760
1761 if (!mem_cgroup_disabled()) 1761 if (!mem_cgroup_disabled())
1762 return mem_cgroup_inactive_anon_is_low(lruvec); 1762 return mem_cgroup_inactive_anon_is_low(lruvec);
1763 1763
1764 return inactive_anon_is_low_global(lruvec_zone(lruvec)); 1764 return inactive_anon_is_low_global(lruvec_zone(lruvec));
1765 } 1765 }
1766 #else 1766 #else
1767 static inline int inactive_anon_is_low(struct lruvec *lruvec) 1767 static inline int inactive_anon_is_low(struct lruvec *lruvec)
1768 { 1768 {
1769 return 0; 1769 return 0;
1770 } 1770 }
1771 #endif 1771 #endif
1772 1772
1773 /** 1773 /**
1774 * inactive_file_is_low - check if file pages need to be deactivated 1774 * inactive_file_is_low - check if file pages need to be deactivated
1775 * @lruvec: LRU vector to check 1775 * @lruvec: LRU vector to check
1776 * 1776 *
1777 * When the system is doing streaming IO, memory pressure here 1777 * When the system is doing streaming IO, memory pressure here
1778 * ensures that active file pages get deactivated, until more 1778 * ensures that active file pages get deactivated, until more
1779 * than half of the file pages are on the inactive list. 1779 * than half of the file pages are on the inactive list.
1780 * 1780 *
1781 * Once we get to that situation, protect the system's working 1781 * Once we get to that situation, protect the system's working
1782 * set from being evicted by disabling active file page aging. 1782 * set from being evicted by disabling active file page aging.
1783 * 1783 *
1784 * This uses a different ratio than the anonymous pages, because 1784 * This uses a different ratio than the anonymous pages, because
1785 * the page cache uses a use-once replacement algorithm. 1785 * the page cache uses a use-once replacement algorithm.
1786 */ 1786 */
1787 static int inactive_file_is_low(struct lruvec *lruvec) 1787 static int inactive_file_is_low(struct lruvec *lruvec)
1788 { 1788 {
1789 unsigned long inactive; 1789 unsigned long inactive;
1790 unsigned long active; 1790 unsigned long active;
1791 1791
1792 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1792 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1793 active = get_lru_size(lruvec, LRU_ACTIVE_FILE); 1793 active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
1794 1794
1795 return active > inactive; 1795 return active > inactive;
1796 } 1796 }
1797 1797
1798 static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) 1798 static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1799 { 1799 {
1800 if (is_file_lru(lru)) 1800 if (is_file_lru(lru))
1801 return inactive_file_is_low(lruvec); 1801 return inactive_file_is_low(lruvec);
1802 else 1802 else
1803 return inactive_anon_is_low(lruvec); 1803 return inactive_anon_is_low(lruvec);
1804 } 1804 }
1805 1805
1806 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1806 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1807 struct lruvec *lruvec, struct scan_control *sc) 1807 struct lruvec *lruvec, struct scan_control *sc)
1808 { 1808 {
1809 if (is_active_lru(lru)) { 1809 if (is_active_lru(lru)) {
1810 if (inactive_list_is_low(lruvec, lru)) 1810 if (inactive_list_is_low(lruvec, lru))
1811 shrink_active_list(nr_to_scan, lruvec, sc, lru); 1811 shrink_active_list(nr_to_scan, lruvec, sc, lru);
1812 return 0; 1812 return 0;
1813 } 1813 }
1814 1814
1815 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); 1815 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
1816 } 1816 }
1817 1817
1818 static int vmscan_swappiness(struct scan_control *sc) 1818 static int vmscan_swappiness(struct scan_control *sc)
1819 { 1819 {
1820 if (global_reclaim(sc)) 1820 if (global_reclaim(sc))
1821 return vm_swappiness; 1821 return vm_swappiness;
1822 return mem_cgroup_swappiness(sc->target_mem_cgroup); 1822 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1823 } 1823 }
1824 1824
1825 enum scan_balance { 1825 enum scan_balance {
1826 SCAN_EQUAL, 1826 SCAN_EQUAL,
1827 SCAN_FRACT, 1827 SCAN_FRACT,
1828 SCAN_ANON, 1828 SCAN_ANON,
1829 SCAN_FILE, 1829 SCAN_FILE,
1830 }; 1830 };
1831 1831
1832 /* 1832 /*
1833 * Determine how aggressively the anon and file LRU lists should be 1833 * Determine how aggressively the anon and file LRU lists should be
1834 * scanned. The relative value of each set of LRU lists is determined 1834 * scanned. The relative value of each set of LRU lists is determined
1835 * by looking at the fraction of the scanned pages that we rotated back 1835 * by looking at the fraction of the scanned pages that we rotated back
1836 * onto the active list instead of evicting. 1836 * onto the active list instead of evicting.
1837 * 1837 *
1838 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan 1838 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1839 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan 1839 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1840 */ 1840 */
1841 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 1841 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1842 unsigned long *nr) 1842 unsigned long *nr)
1843 { 1843 {
1844 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1844 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1845 u64 fraction[2]; 1845 u64 fraction[2];
1846 u64 denominator = 0; /* gcc */ 1846 u64 denominator = 0; /* gcc */
1847 struct zone *zone = lruvec_zone(lruvec); 1847 struct zone *zone = lruvec_zone(lruvec);
1848 unsigned long anon_prio, file_prio; 1848 unsigned long anon_prio, file_prio;
1849 enum scan_balance scan_balance; 1849 enum scan_balance scan_balance;
1850 unsigned long anon, file, free; 1850 unsigned long anon, file;
1851 bool force_scan = false; 1851 bool force_scan = false;
1852 unsigned long ap, fp; 1852 unsigned long ap, fp;
1853 enum lru_list lru; 1853 enum lru_list lru;
1854 1854
1855 /* 1855 /*
1856 * If the zone or memcg is small, nr[l] can be 0. This 1856 * If the zone or memcg is small, nr[l] can be 0. This
1857 * results in no scanning on this priority and a potential 1857 * results in no scanning on this priority and a potential
1858 * priority drop. Global direct reclaim can go to the next 1858 * priority drop. Global direct reclaim can go to the next
1859 * zone and tends to have no problems. Global kswapd is for 1859 * zone and tends to have no problems. Global kswapd is for
1860 * zone balancing and it needs to scan a minimum amount. When 1860 * zone balancing and it needs to scan a minimum amount. When
1861 * reclaiming for a memcg, a priority drop can cause high 1861 * reclaiming for a memcg, a priority drop can cause high
1862 * latencies, so it's better to scan a minimum amount there as 1862 * latencies, so it's better to scan a minimum amount there as
1863 * well. 1863 * well.
1864 */ 1864 */
1865 if (current_is_kswapd() && !zone_reclaimable(zone)) 1865 if (current_is_kswapd() && !zone_reclaimable(zone))
1866 force_scan = true; 1866 force_scan = true;
1867 if (!global_reclaim(sc)) 1867 if (!global_reclaim(sc))
1868 force_scan = true; 1868 force_scan = true;
1869 1869
1870 /* If we have no swap space, do not bother scanning anon pages. */ 1870 /* If we have no swap space, do not bother scanning anon pages. */
1871 if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { 1871 if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
1872 scan_balance = SCAN_FILE; 1872 scan_balance = SCAN_FILE;
1873 goto out; 1873 goto out;
1874 } 1874 }
1875 1875
1876 /* 1876 /*
1877 * Global reclaim will swap to prevent OOM even with no 1877 * Global reclaim will swap to prevent OOM even with no
1878 * swappiness, but memcg users want to use this knob to 1878 * swappiness, but memcg users want to use this knob to
1879 * disable swapping for individual groups completely when 1879 * disable swapping for individual groups completely when
1880 * using the memory controller's swap limit feature would be 1880 * using the memory controller's swap limit feature would be
1881 * too expensive. 1881 * too expensive.
1882 */ 1882 */
1883 if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { 1883 if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
1884 scan_balance = SCAN_FILE; 1884 scan_balance = SCAN_FILE;
1885 goto out; 1885 goto out;
1886 } 1886 }
1887 1887
1888 /* 1888 /*
1889 * Do not apply any pressure balancing cleverness when the 1889 * Do not apply any pressure balancing cleverness when the
1890 * system is close to OOM, scan both anon and file equally 1890 * system is close to OOM, scan both anon and file equally
1891 * (unless the swappiness setting disagrees with swapping). 1891 * (unless the swappiness setting disagrees with swapping).
1892 */ 1892 */
1893 if (!sc->priority && vmscan_swappiness(sc)) { 1893 if (!sc->priority && vmscan_swappiness(sc)) {
1894 scan_balance = SCAN_EQUAL; 1894 scan_balance = SCAN_EQUAL;
1895 goto out; 1895 goto out;
1896 } 1896 }
1897 1897
1898 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1899 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1900 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1901 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1902
1903 /* 1898 /*
1904 * If it's foreseeable that reclaiming the file cache won't be 1899 * If it's foreseeable that reclaiming the file cache won't be
1905 * enough to get the zone back into a desirable shape, we have 1900 * enough to get the zone back into a desirable shape, we have
1906 * to swap. Better start now and leave the - probably heavily 1901 * to swap. Better start now and leave the - probably heavily
1907 * thrashing - remaining file pages alone. 1902 * thrashing - remaining file pages alone.
1908 */ 1903 */
1909 if (global_reclaim(sc)) { 1904 if (global_reclaim(sc)) {
1910 free = zone_page_state(zone, NR_FREE_PAGES); 1905 unsigned long zonefile;
1911 if (unlikely(file + free <= high_wmark_pages(zone))) { 1906 unsigned long zonefree;
1907
1908 zonefree = zone_page_state(zone, NR_FREE_PAGES);
1909 zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
1910 zone_page_state(zone, NR_INACTIVE_FILE);
1911
1912 if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
1912 scan_balance = SCAN_ANON; 1913 scan_balance = SCAN_ANON;
1913 goto out; 1914 goto out;
1914 } 1915 }
1915 } 1916 }
1916 1917
1917 /* 1918 /*
1918 * There is enough inactive page cache, do not reclaim 1919 * There is enough inactive page cache, do not reclaim
1919 * anything from the anonymous working set right now. 1920 * anything from the anonymous working set right now.
1920 */ 1921 */
1921 if (!inactive_file_is_low(lruvec)) { 1922 if (!inactive_file_is_low(lruvec)) {
1922 scan_balance = SCAN_FILE; 1923 scan_balance = SCAN_FILE;
1923 goto out; 1924 goto out;
1924 } 1925 }
1925 1926
1926 scan_balance = SCAN_FRACT; 1927 scan_balance = SCAN_FRACT;
1927 1928
1928 /* 1929 /*
1929 * With swappiness at 100, anonymous and file have the same priority. 1930 * With swappiness at 100, anonymous and file have the same priority.
1930 * This scanning priority is essentially the inverse of IO cost. 1931 * This scanning priority is essentially the inverse of IO cost.
1931 */ 1932 */
1932 anon_prio = vmscan_swappiness(sc); 1933 anon_prio = vmscan_swappiness(sc);
1933 file_prio = 200 - anon_prio; 1934 file_prio = 200 - anon_prio;
1934 1935
1935 /* 1936 /*
1936 * OK, so we have swap space and a fair amount of page cache 1937 * OK, so we have swap space and a fair amount of page cache
1937 * pages. We use the recently rotated / recently scanned 1938 * pages. We use the recently rotated / recently scanned
1938 * ratios to determine how valuable each cache is. 1939 * ratios to determine how valuable each cache is.
1939 * 1940 *
1940 * Because workloads change over time (and to avoid overflow) 1941 * Because workloads change over time (and to avoid overflow)
1941 * we keep these statistics as a floating average, which ends 1942 * we keep these statistics as a floating average, which ends
1942 * up weighing recent references more than old ones. 1943 * up weighing recent references more than old ones.
1943 * 1944 *
1944 * anon in [0], file in [1] 1945 * anon in [0], file in [1]
1945 */ 1946 */
1947
1948 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1949 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1950 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1951 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1952
1946 spin_lock_irq(&zone->lru_lock); 1953 spin_lock_irq(&zone->lru_lock);
1947 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1954 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1948 reclaim_stat->recent_scanned[0] /= 2; 1955 reclaim_stat->recent_scanned[0] /= 2;
1949 reclaim_stat->recent_rotated[0] /= 2; 1956 reclaim_stat->recent_rotated[0] /= 2;
1950 } 1957 }
1951 1958
1952 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { 1959 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1953 reclaim_stat->recent_scanned[1] /= 2; 1960 reclaim_stat->recent_scanned[1] /= 2;
1954 reclaim_stat->recent_rotated[1] /= 2; 1961 reclaim_stat->recent_rotated[1] /= 2;
1955 } 1962 }
1956 1963
1957 /* 1964 /*
1958 * The amount of pressure on anon vs file pages is inversely 1965 * The amount of pressure on anon vs file pages is inversely
1959 * proportional to the fraction of recently scanned pages on 1966 * proportional to the fraction of recently scanned pages on
1960 * each list that were recently referenced and in active use. 1967 * each list that were recently referenced and in active use.
1961 */ 1968 */
1962 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); 1969 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
1963 ap /= reclaim_stat->recent_rotated[0] + 1; 1970 ap /= reclaim_stat->recent_rotated[0] + 1;
1964 1971
1965 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); 1972 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1966 fp /= reclaim_stat->recent_rotated[1] + 1; 1973 fp /= reclaim_stat->recent_rotated[1] + 1;
1967 spin_unlock_irq(&zone->lru_lock); 1974 spin_unlock_irq(&zone->lru_lock);
1968 1975
1969 fraction[0] = ap; 1976 fraction[0] = ap;
1970 fraction[1] = fp; 1977 fraction[1] = fp;
1971 denominator = ap + fp + 1; 1978 denominator = ap + fp + 1;
1972 out: 1979 out:
1973 for_each_evictable_lru(lru) { 1980 for_each_evictable_lru(lru) {
1974 int file = is_file_lru(lru); 1981 int file = is_file_lru(lru);
1975 unsigned long size; 1982 unsigned long size;
1976 unsigned long scan; 1983 unsigned long scan;
1977 1984
1978 size = get_lru_size(lruvec, lru); 1985 size = get_lru_size(lruvec, lru);
1979 scan = size >> sc->priority; 1986 scan = size >> sc->priority;
1980 1987
1981 if (!scan && force_scan) 1988 if (!scan && force_scan)
1982 scan = min(size, SWAP_CLUSTER_MAX); 1989 scan = min(size, SWAP_CLUSTER_MAX);
1983 1990
1984 switch (scan_balance) { 1991 switch (scan_balance) {
1985 case SCAN_EQUAL: 1992 case SCAN_EQUAL:
1986 /* Scan lists relative to size */ 1993 /* Scan lists relative to size */
1987 break; 1994 break;
1988 case SCAN_FRACT: 1995 case SCAN_FRACT:
1989 /* 1996 /*
1990 * Scan types proportional to swappiness and 1997 * Scan types proportional to swappiness and
1991 * their relative recent reclaim efficiency. 1998 * their relative recent reclaim efficiency.
1992 */ 1999 */
1993 scan = div64_u64(scan * fraction[file], denominator); 2000 scan = div64_u64(scan * fraction[file], denominator);
1994 break; 2001 break;
1995 case SCAN_FILE: 2002 case SCAN_FILE:
1996 case SCAN_ANON: 2003 case SCAN_ANON:
1997 /* Scan one type exclusively */ 2004 /* Scan one type exclusively */
1998 if ((scan_balance == SCAN_FILE) != file) 2005 if ((scan_balance == SCAN_FILE) != file)
1999 scan = 0; 2006 scan = 0;
2000 break; 2007 break;
2001 default: 2008 default:
2002 /* Look ma, no brain */ 2009 /* Look ma, no brain */
2003 BUG(); 2010 BUG();
2004 } 2011 }
2005 nr[lru] = scan; 2012 nr[lru] = scan;
2006 } 2013 }
2007 } 2014 }
2008 2015
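The hunk above is where this commit's fix lands: the forced-anon decision now compares zone-wide file and free pages against the zone's high watermark instead of a memcg-local file count. A minimal, self-contained sketch of just that decision follows; struct zone_counts, pick_balance() and the sample numbers are invented for illustration and are not kernel API, and the no-swap and priority-0 cases are omitted.

#include <stdio.h>

/* Simplified stand-ins for the zone-wide counters used above. */
struct zone_counts {
	unsigned long zonefree;   /* NR_FREE_PAGES for the whole zone  */
	unsigned long zonefile;   /* NR_ACTIVE_FILE + NR_INACTIVE_FILE */
	unsigned long high_wmark; /* high_wmark_pages(zone)            */
};

enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

/*
 * Mirror of the forced-anon check: only when the file pages of the
 * *whole zone* plus the free pages cannot reach the high watermark do
 * we fall back to scanning anon exclusively.  Before this commit, a
 * memcg-local file count was compared against the global watermark.
 */
static enum scan_balance pick_balance(const struct zone_counts *zc,
				      int global_reclaim, int swappiness)
{
	if (!global_reclaim && !swappiness)
		return SCAN_FILE;	/* memcg asked for no swapping */
	if (global_reclaim &&
	    zc->zonefile + zc->zonefree <= zc->high_wmark)
		return SCAN_ANON;	/* file cache alone cannot help */
	return SCAN_FRACT;		/* balance by recent rotation stats */
}

int main(void)
{
	struct zone_counts zc = { .zonefree = 1000, .zonefile = 2000,
				  .high_wmark = 4000 };

	/* zonefile + zonefree (3000) <= high_wmark (4000): force anon. */
	printf("balance = %d\n", pick_balance(&zc, 1, 60));
	return 0;
}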
2009 /* 2016 /*
2010 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2017 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2011 */ 2018 */
2012 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 2019 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2013 { 2020 {
2014 unsigned long nr[NR_LRU_LISTS]; 2021 unsigned long nr[NR_LRU_LISTS];
2015 unsigned long targets[NR_LRU_LISTS]; 2022 unsigned long targets[NR_LRU_LISTS];
2016 unsigned long nr_to_scan; 2023 unsigned long nr_to_scan;
2017 enum lru_list lru; 2024 enum lru_list lru;
2018 unsigned long nr_reclaimed = 0; 2025 unsigned long nr_reclaimed = 0;
2019 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2026 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2020 struct blk_plug plug; 2027 struct blk_plug plug;
2021 bool scan_adjusted; 2028 bool scan_adjusted;
2022 2029
2023 get_scan_count(lruvec, sc, nr); 2030 get_scan_count(lruvec, sc, nr);
2024 2031
2025 /* Record the original scan target for proportional adjustments later */ 2032 /* Record the original scan target for proportional adjustments later */
2026 memcpy(targets, nr, sizeof(nr)); 2033 memcpy(targets, nr, sizeof(nr));
2027 2034
2028 /* 2035 /*
2029 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal 2036 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
2030 * event that can occur when there is little memory pressure e.g. 2037 * event that can occur when there is little memory pressure e.g.
2031 * multiple streaming readers/writers. Hence, we do not abort scanning 2038 * multiple streaming readers/writers. Hence, we do not abort scanning
2032 * when the requested number of pages has been reclaimed while scanning at 2039 * when the requested number of pages has been reclaimed while scanning at
2033 * DEF_PRIORITY, on the assumption that the fact that we are direct 2040 * DEF_PRIORITY, on the assumption that the fact that we are direct
2034 * reclaiming implies that kswapd is not keeping up and it is best to 2041 * reclaiming implies that kswapd is not keeping up and it is best to
2035 * do a batch of work at once. For memcg reclaim one check is made to 2042 * do a batch of work at once. For memcg reclaim one check is made to
2036 * abort proportional reclaim if either the file or anon lru has already 2043 * abort proportional reclaim if either the file or anon lru has already
2037 * dropped to zero at the first pass. 2044 * dropped to zero at the first pass.
2038 */ 2045 */
2039 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && 2046 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2040 sc->priority == DEF_PRIORITY); 2047 sc->priority == DEF_PRIORITY);
2041 2048
2042 blk_start_plug(&plug); 2049 blk_start_plug(&plug);
2043 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2050 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2044 nr[LRU_INACTIVE_FILE]) { 2051 nr[LRU_INACTIVE_FILE]) {
2045 unsigned long nr_anon, nr_file, percentage; 2052 unsigned long nr_anon, nr_file, percentage;
2046 unsigned long nr_scanned; 2053 unsigned long nr_scanned;
2047 2054
2048 for_each_evictable_lru(lru) { 2055 for_each_evictable_lru(lru) {
2049 if (nr[lru]) { 2056 if (nr[lru]) {
2050 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 2057 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2051 nr[lru] -= nr_to_scan; 2058 nr[lru] -= nr_to_scan;
2052 2059
2053 nr_reclaimed += shrink_list(lru, nr_to_scan, 2060 nr_reclaimed += shrink_list(lru, nr_to_scan,
2054 lruvec, sc); 2061 lruvec, sc);
2055 } 2062 }
2056 } 2063 }
2057 2064
2058 if (nr_reclaimed < nr_to_reclaim || scan_adjusted) 2065 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2059 continue; 2066 continue;
2060 2067
2061 /* 2068 /*
2062 * For kswapd and memcg, reclaim at least the number of pages 2069 * For kswapd and memcg, reclaim at least the number of pages
2063 * requested. Ensure that the anon and file LRUs are scanned 2070 * requested. Ensure that the anon and file LRUs are scanned
2064 * proportionally to what was requested by get_scan_count(). We 2071 * proportionally to what was requested by get_scan_count(). We
2065 * stop reclaiming one LRU and reduce the amount of scanning 2072 * stop reclaiming one LRU and reduce the amount of scanning
2066 * proportionally to the original scan target. 2073 * proportionally to the original scan target.
2067 */ 2074 */
2068 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; 2075 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2069 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; 2076 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2070 2077
2071 /* 2078 /*
2072 * It's just vindictive to attack the larger once the smaller 2079 * It's just vindictive to attack the larger once the smaller
2073 * has gone to zero. And given the way we stop scanning the 2080 * has gone to zero. And given the way we stop scanning the
2074 * smaller below, this makes sure that we only make one nudge 2081 * smaller below, this makes sure that we only make one nudge
2075 * towards proportionality once we've got nr_to_reclaim. 2082 * towards proportionality once we've got nr_to_reclaim.
2076 */ 2083 */
2077 if (!nr_file || !nr_anon) 2084 if (!nr_file || !nr_anon)
2078 break; 2085 break;
2079 2086
2080 if (nr_file > nr_anon) { 2087 if (nr_file > nr_anon) {
2081 unsigned long scan_target = targets[LRU_INACTIVE_ANON] + 2088 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2082 targets[LRU_ACTIVE_ANON] + 1; 2089 targets[LRU_ACTIVE_ANON] + 1;
2083 lru = LRU_BASE; 2090 lru = LRU_BASE;
2084 percentage = nr_anon * 100 / scan_target; 2091 percentage = nr_anon * 100 / scan_target;
2085 } else { 2092 } else {
2086 unsigned long scan_target = targets[LRU_INACTIVE_FILE] + 2093 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2087 targets[LRU_ACTIVE_FILE] + 1; 2094 targets[LRU_ACTIVE_FILE] + 1;
2088 lru = LRU_FILE; 2095 lru = LRU_FILE;
2089 percentage = nr_file * 100 / scan_target; 2096 percentage = nr_file * 100 / scan_target;
2090 } 2097 }
2091 2098
2092 /* Stop scanning the smaller of the two LRUs */ 2099 /* Stop scanning the smaller of the two LRUs */
2093 nr[lru] = 0; 2100 nr[lru] = 0;
2094 nr[lru + LRU_ACTIVE] = 0; 2101 nr[lru + LRU_ACTIVE] = 0;
2095 2102
2096 /* 2103 /*
2097 * Recalculate the other LRU scan count based on its original 2104 * Recalculate the other LRU scan count based on its original
2098 * scan target and the percentage scanning already complete 2105 * scan target and the percentage scanning already complete
2099 */ 2106 */
2100 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; 2107 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2101 nr_scanned = targets[lru] - nr[lru]; 2108 nr_scanned = targets[lru] - nr[lru];
2102 nr[lru] = targets[lru] * (100 - percentage) / 100; 2109 nr[lru] = targets[lru] * (100 - percentage) / 100;
2103 nr[lru] -= min(nr[lru], nr_scanned); 2110 nr[lru] -= min(nr[lru], nr_scanned);
2104 2111
2105 lru += LRU_ACTIVE; 2112 lru += LRU_ACTIVE;
2106 nr_scanned = targets[lru] - nr[lru]; 2113 nr_scanned = targets[lru] - nr[lru];
2107 nr[lru] = targets[lru] * (100 - percentage) / 100; 2114 nr[lru] = targets[lru] * (100 - percentage) / 100;
2108 nr[lru] -= min(nr[lru], nr_scanned); 2115 nr[lru] -= min(nr[lru], nr_scanned);
2109 2116
2110 scan_adjusted = true; 2117 scan_adjusted = true;
2111 } 2118 }
2112 blk_finish_plug(&plug); 2119 blk_finish_plug(&plug);
2113 sc->nr_reclaimed += nr_reclaimed; 2120 sc->nr_reclaimed += nr_reclaimed;
2114 2121
2115 /* 2122 /*
2116 * Even if we did not try to evict anon pages at all, we want to 2123 * Even if we did not try to evict anon pages at all, we want to
2117 * rebalance the anon lru active/inactive ratio. 2124 * rebalance the anon lru active/inactive ratio.
2118 */ 2125 */
2119 if (inactive_anon_is_low(lruvec)) 2126 if (inactive_anon_is_low(lruvec))
2120 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 2127 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2121 sc, LRU_ACTIVE_ANON); 2128 sc, LRU_ACTIVE_ANON);
2122 2129
2123 throttle_vm_writeout(sc->gfp_mask); 2130 throttle_vm_writeout(sc->gfp_mask);
2124 } 2131 }
2125 2132
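The percentage arithmetic above (recomputing the remaining scan targets once one LRU type stops being scanned) is easier to see in isolation. A small stand-alone sketch, where rebalance() and the two-element arrays are simplified stand-ins for the kernel's nr[]/targets[] bookkeeping:

#include <stdio.h>

#define INACTIVE 0
#define ACTIVE   1

/*
 * Shrink the remaining scan targets of the surviving LRU type: keep only
 * (100 - percentage)% of the original target, minus what has already
 * been scanned, clamped at zero.
 */
static void rebalance(unsigned long nr[2], const unsigned long targets[2],
		      unsigned long percentage)
{
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long scanned = targets[i] - nr[i];
		unsigned long left = targets[i] * (100 - percentage) / 100;

		nr[i] = left > scanned ? left - scanned : 0;
	}
}

int main(void)
{
	unsigned long file_targets[2] = { 800, 200 };	/* original targets */
	unsigned long nr_file[2]      = { 500, 150 };	/* still to scan    */
	/* Suppose the anon side has 30% of its original target left. */
	unsigned long percentage = 30;

	rebalance(nr_file, file_targets, percentage);
	/* Prints: file: inactive=260 active=90 */
	printf("file: inactive=%lu active=%lu\n",
	       nr_file[INACTIVE], nr_file[ACTIVE]);
	return 0;
}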
2126 /* Use reclaim/compaction for costly allocs or under memory pressure */ 2133 /* Use reclaim/compaction for costly allocs or under memory pressure */
2127 static bool in_reclaim_compaction(struct scan_control *sc) 2134 static bool in_reclaim_compaction(struct scan_control *sc)
2128 { 2135 {
2129 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 2136 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2130 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 2137 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2131 sc->priority < DEF_PRIORITY - 2)) 2138 sc->priority < DEF_PRIORITY - 2))
2132 return true; 2139 return true;
2133 2140
2134 return false; 2141 return false;
2135 } 2142 }
2136 2143
2137 /* 2144 /*
2138 * Reclaim/compaction is used for high-order allocation requests. It reclaims 2145 * Reclaim/compaction is used for high-order allocation requests. It reclaims
2139 * order-0 pages before compacting the zone. should_continue_reclaim() returns 2146 * order-0 pages before compacting the zone. should_continue_reclaim() returns
2140 * true if more pages should be reclaimed such that when the page allocator 2147 * true if more pages should be reclaimed such that when the page allocator
2141 * calls try_to_compact_zone() that it will have enough free pages to succeed. 2148 * calls try_to_compact_zone() that it will have enough free pages to succeed.
2142 * It will give up earlier than that if there is difficulty reclaiming pages. 2149 * It will give up earlier than that if there is difficulty reclaiming pages.
2143 */ 2150 */
2144 static inline bool should_continue_reclaim(struct zone *zone, 2151 static inline bool should_continue_reclaim(struct zone *zone,
2145 unsigned long nr_reclaimed, 2152 unsigned long nr_reclaimed,
2146 unsigned long nr_scanned, 2153 unsigned long nr_scanned,
2147 struct scan_control *sc) 2154 struct scan_control *sc)
2148 { 2155 {
2149 unsigned long pages_for_compaction; 2156 unsigned long pages_for_compaction;
2150 unsigned long inactive_lru_pages; 2157 unsigned long inactive_lru_pages;
2151 2158
2152 /* If not in reclaim/compaction mode, stop */ 2159 /* If not in reclaim/compaction mode, stop */
2153 if (!in_reclaim_compaction(sc)) 2160 if (!in_reclaim_compaction(sc))
2154 return false; 2161 return false;
2155 2162
2156 /* Consider stopping depending on scan and reclaim activity */ 2163 /* Consider stopping depending on scan and reclaim activity */
2157 if (sc->gfp_mask & __GFP_REPEAT) { 2164 if (sc->gfp_mask & __GFP_REPEAT) {
2158 /* 2165 /*
2159 * For __GFP_REPEAT allocations, stop reclaiming if the 2166 * For __GFP_REPEAT allocations, stop reclaiming if the
2160 * full LRU list has been scanned and we are still failing 2167 * full LRU list has been scanned and we are still failing
2161 * to reclaim pages. This full LRU scan is potentially 2168 * to reclaim pages. This full LRU scan is potentially
2162 * expensive but a __GFP_REPEAT caller really wants to succeed 2169 * expensive but a __GFP_REPEAT caller really wants to succeed
2163 */ 2170 */
2164 if (!nr_reclaimed && !nr_scanned) 2171 if (!nr_reclaimed && !nr_scanned)
2165 return false; 2172 return false;
2166 } else { 2173 } else {
2167 /* 2174 /*
2168 * For non-__GFP_REPEAT allocations which can presumably 2175 * For non-__GFP_REPEAT allocations which can presumably
2169 * fail without consequence, stop if we failed to reclaim 2176 * fail without consequence, stop if we failed to reclaim
2170 * any pages from the last SWAP_CLUSTER_MAX number of 2177 * any pages from the last SWAP_CLUSTER_MAX number of
2171 * pages that were scanned. This will return to the 2178 * pages that were scanned. This will return to the
2172 * caller faster at the risk that reclaim/compaction and 2179 * caller faster at the risk that reclaim/compaction and
2173 * the resulting allocation attempt fail 2180 * the resulting allocation attempt fail
2174 */ 2181 */
2175 if (!nr_reclaimed) 2182 if (!nr_reclaimed)
2176 return false; 2183 return false;
2177 } 2184 }
2178 2185
2179 /* 2186 /*
2180 * If we have not reclaimed enough pages for compaction and the 2187 * If we have not reclaimed enough pages for compaction and the
2181 * inactive lists are large enough, continue reclaiming 2188 * inactive lists are large enough, continue reclaiming
2182 */ 2189 */
2183 pages_for_compaction = (2UL << sc->order); 2190 pages_for_compaction = (2UL << sc->order);
2184 inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); 2191 inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
2185 if (get_nr_swap_pages() > 0) 2192 if (get_nr_swap_pages() > 0)
2186 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); 2193 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
2187 if (sc->nr_reclaimed < pages_for_compaction && 2194 if (sc->nr_reclaimed < pages_for_compaction &&
2188 inactive_lru_pages > pages_for_compaction) 2195 inactive_lru_pages > pages_for_compaction)
2189 return true; 2196 return true;
2190 2197
2191 /* If compaction would go ahead or the allocation would succeed, stop */ 2198 /* If compaction would go ahead or the allocation would succeed, stop */
2192 switch (compaction_suitable(zone, sc->order)) { 2199 switch (compaction_suitable(zone, sc->order)) {
2193 case COMPACT_PARTIAL: 2200 case COMPACT_PARTIAL:
2194 case COMPACT_CONTINUE: 2201 case COMPACT_CONTINUE:
2195 return false; 2202 return false;
2196 default: 2203 default:
2197 return true; 2204 return true;
2198 } 2205 }
2199 } 2206 }
2200 2207
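A rough illustration of the page-count half of should_continue_reclaim(): reclaim keeps going while fewer than 2 << order pages have been reclaimed and the inactive lists still hold more than that. The helper name want_more_reclaim() and the sample numbers are assumptions made for this sketch; the compaction_suitable() check is omitted.

#include <stdio.h>

/*
 * Keep reclaiming while we have freed fewer than 2 << order pages and
 * the inactive lists still hold more than that to work with.
 */
static int want_more_reclaim(unsigned int order,
			     unsigned long nr_reclaimed,
			     unsigned long inactive_lru_pages)
{
	unsigned long pages_for_compaction = 2UL << order;

	return nr_reclaimed < pages_for_compaction &&
	       inactive_lru_pages > pages_for_compaction;
}

int main(void)
{
	/* order-9 (a 2MB huge page on x86): wants 1024 base pages spare. */
	printf("%d\n", want_more_reclaim(9, 300, 5000));  /* 1: continue */
	printf("%d\n", want_more_reclaim(9, 1500, 5000)); /* 0: enough   */
	return 0;
}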
2201 static void shrink_zone(struct zone *zone, struct scan_control *sc) 2208 static void shrink_zone(struct zone *zone, struct scan_control *sc)
2202 { 2209 {
2203 unsigned long nr_reclaimed, nr_scanned; 2210 unsigned long nr_reclaimed, nr_scanned;
2204 2211
2205 do { 2212 do {
2206 struct mem_cgroup *root = sc->target_mem_cgroup; 2213 struct mem_cgroup *root = sc->target_mem_cgroup;
2207 struct mem_cgroup_reclaim_cookie reclaim = { 2214 struct mem_cgroup_reclaim_cookie reclaim = {
2208 .zone = zone, 2215 .zone = zone,
2209 .priority = sc->priority, 2216 .priority = sc->priority,
2210 }; 2217 };
2211 struct mem_cgroup *memcg; 2218 struct mem_cgroup *memcg;
2212 2219
2213 nr_reclaimed = sc->nr_reclaimed; 2220 nr_reclaimed = sc->nr_reclaimed;
2214 nr_scanned = sc->nr_scanned; 2221 nr_scanned = sc->nr_scanned;
2215 2222
2216 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2223 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2217 do { 2224 do {
2218 struct lruvec *lruvec; 2225 struct lruvec *lruvec;
2219 2226
2220 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2227 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2221 2228
2222 shrink_lruvec(lruvec, sc); 2229 shrink_lruvec(lruvec, sc);
2223 2230
2224 /* 2231 /*
2225 * Direct reclaim and kswapd have to scan all memory 2232 * Direct reclaim and kswapd have to scan all memory
2226 * cgroups to fulfill the overall scan target for the 2233 * cgroups to fulfill the overall scan target for the
2227 * zone. 2234 * zone.
2228 * 2235 *
2229 * Limit reclaim, on the other hand, only cares about 2236 * Limit reclaim, on the other hand, only cares about
2230 * nr_to_reclaim pages to be reclaimed and it will 2237 * nr_to_reclaim pages to be reclaimed and it will
2231 * retry with decreasing priority if one round over the 2238 * retry with decreasing priority if one round over the
2232 * whole hierarchy is not sufficient. 2239 * whole hierarchy is not sufficient.
2233 */ 2240 */
2234 if (!global_reclaim(sc) && 2241 if (!global_reclaim(sc) &&
2235 sc->nr_reclaimed >= sc->nr_to_reclaim) { 2242 sc->nr_reclaimed >= sc->nr_to_reclaim) {
2236 mem_cgroup_iter_break(root, memcg); 2243 mem_cgroup_iter_break(root, memcg);
2237 break; 2244 break;
2238 } 2245 }
2239 memcg = mem_cgroup_iter(root, memcg, &reclaim); 2246 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2240 } while (memcg); 2247 } while (memcg);
2241 2248
2242 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2249 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2243 sc->nr_scanned - nr_scanned, 2250 sc->nr_scanned - nr_scanned,
2244 sc->nr_reclaimed - nr_reclaimed); 2251 sc->nr_reclaimed - nr_reclaimed);
2245 2252
2246 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 2253 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
2247 sc->nr_scanned - nr_scanned, sc)); 2254 sc->nr_scanned - nr_scanned, sc));
2248 } 2255 }
2249 2256
2250 /* Returns true if compaction should go ahead for a high-order request */ 2257 /* Returns true if compaction should go ahead for a high-order request */
2251 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) 2258 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2252 { 2259 {
2253 unsigned long balance_gap, watermark; 2260 unsigned long balance_gap, watermark;
2254 bool watermark_ok; 2261 bool watermark_ok;
2255 2262
2256 /* Do not consider compaction for orders reclaim is meant to satisfy */ 2263 /* Do not consider compaction for orders reclaim is meant to satisfy */
2257 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) 2264 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2258 return false; 2265 return false;
2259 2266
2260 /* 2267 /*
2261 * Compaction takes time to run and there are potentially other 2268 * Compaction takes time to run and there are potentially other
2262 * callers using the pages just freed. Continue reclaiming until 2269 * callers using the pages just freed. Continue reclaiming until
2263 * there is a buffer of free pages available to give compaction 2270 * there is a buffer of free pages available to give compaction
2264 * a reasonable chance of completing and allocating the page 2271 * a reasonable chance of completing and allocating the page
2265 */ 2272 */
2266 balance_gap = min(low_wmark_pages(zone), 2273 balance_gap = min(low_wmark_pages(zone),
2267 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2274 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2268 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2275 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2269 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); 2276 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2270 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); 2277 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2271 2278
2272 /* 2279 /*
2273 * If compaction is deferred, reclaim up to a point where 2280 * If compaction is deferred, reclaim up to a point where
2274 * compaction will have a chance of success when re-enabled 2281 * compaction will have a chance of success when re-enabled
2275 */ 2282 */
2276 if (compaction_deferred(zone, sc->order)) 2283 if (compaction_deferred(zone, sc->order))
2277 return watermark_ok; 2284 return watermark_ok;
2278 2285
2279 /* If compaction is not ready to start, keep reclaiming */ 2286 /* If compaction is not ready to start, keep reclaiming */
2280 if (!compaction_suitable(zone, sc->order)) 2287 if (!compaction_suitable(zone, sc->order))
2281 return false; 2288 return false;
2282 2289
2283 return watermark_ok; 2290 return watermark_ok;
2284 } 2291 }
2285 2292
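The free-page target that compaction_ready() compares against is high_wmark + balance_gap + (2 << order). A numeric sketch with made-up zone values, taking the balance-gap ratio as 100 (the value this kernel uses for KSWAPD_ZONE_BALANCE_GAP_RATIO); compaction_watermark() is an ad-hoc name, not a kernel function.

#include <stdio.h>

#define BALANCE_GAP_RATIO 100	/* stand-in for KSWAPD_ZONE_BALANCE_GAP_RATIO */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* Free-page level reclaim aims for before handing over to compaction. */
static unsigned long compaction_watermark(unsigned long low_wmark,
					  unsigned long high_wmark,
					  unsigned long managed_pages,
					  unsigned int order)
{
	unsigned long balance_gap;

	balance_gap = min_ul(low_wmark,
			     (managed_pages + BALANCE_GAP_RATIO - 1) /
			     BALANCE_GAP_RATIO);
	return high_wmark + balance_gap + (2UL << order);
}

int main(void)
{
	/* Hypothetical 1 GB zone: 262144 pages, low/high wmarks 2000/3000. */
	/* Prints: watermark = 6024 pages */
	printf("watermark = %lu pages\n",
	       compaction_watermark(2000, 3000, 262144, 9));
	return 0;
}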
2286 /* 2293 /*
2287 * This is the direct reclaim path, for page-allocating processes. We only 2294 * This is the direct reclaim path, for page-allocating processes. We only
2288 * try to reclaim pages from zones which will satisfy the caller's allocation 2295 * try to reclaim pages from zones which will satisfy the caller's allocation
2289 * request. 2296 * request.
2290 * 2297 *
2291 * We reclaim from a zone even if that zone is over high_wmark_pages(zone). 2298 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
2292 * Because: 2299 * Because:
2293 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 2300 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
2294 * allocation or 2301 * allocation or
2295 * b) The target zone may be at high_wmark_pages(zone) but the lower zones 2302 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
2296 * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' 2303 * must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
2297 * zone defense algorithm. 2304 * zone defense algorithm.
2298 * 2305 *
2299 * If a zone is deemed to be full of pinned pages then just give it a light 2306 * If a zone is deemed to be full of pinned pages then just give it a light
2300 * scan and then give up on it. 2307 * scan and then give up on it.
2301 * 2308 *
2302 * This function returns true if a zone is being reclaimed for a costly 2309 * This function returns true if a zone is being reclaimed for a costly
2303 * high-order allocation and compaction is ready to begin. This indicates to 2310 * high-order allocation and compaction is ready to begin. This indicates to
2304 * the caller that it should consider retrying the allocation instead of 2311 * the caller that it should consider retrying the allocation instead of
2305 * further reclaim. 2312 * further reclaim.
2306 */ 2313 */
2307 static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) 2314 static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2308 { 2315 {
2309 struct zoneref *z; 2316 struct zoneref *z;
2310 struct zone *zone; 2317 struct zone *zone;
2311 unsigned long nr_soft_reclaimed; 2318 unsigned long nr_soft_reclaimed;
2312 unsigned long nr_soft_scanned; 2319 unsigned long nr_soft_scanned;
2313 bool aborted_reclaim = false; 2320 bool aborted_reclaim = false;
2314 2321
2315 /* 2322 /*
2316 * If the number of buffer_heads in the machine exceeds the maximum 2323 * If the number of buffer_heads in the machine exceeds the maximum
2317 * allowed level, force direct reclaim to scan the highmem zone as 2324 * allowed level, force direct reclaim to scan the highmem zone as
2318 * highmem pages could be pinning lowmem pages storing buffer_heads 2325 * highmem pages could be pinning lowmem pages storing buffer_heads
2319 */ 2326 */
2320 if (buffer_heads_over_limit) 2327 if (buffer_heads_over_limit)
2321 sc->gfp_mask |= __GFP_HIGHMEM; 2328 sc->gfp_mask |= __GFP_HIGHMEM;
2322 2329
2323 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2330 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2324 gfp_zone(sc->gfp_mask), sc->nodemask) { 2331 gfp_zone(sc->gfp_mask), sc->nodemask) {
2325 if (!populated_zone(zone)) 2332 if (!populated_zone(zone))
2326 continue; 2333 continue;
2327 /* 2334 /*
2328 * Take care that memory controller reclaim has only a small influence 2335 * Take care that memory controller reclaim has only a small influence
2329 * on the global LRU. 2336 * on the global LRU.
2330 */ 2337 */
2331 if (global_reclaim(sc)) { 2338 if (global_reclaim(sc)) {
2332 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2339 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2333 continue; 2340 continue;
2334 if (sc->priority != DEF_PRIORITY && 2341 if (sc->priority != DEF_PRIORITY &&
2335 !zone_reclaimable(zone)) 2342 !zone_reclaimable(zone))
2336 continue; /* Let kswapd poll it */ 2343 continue; /* Let kswapd poll it */
2337 if (IS_ENABLED(CONFIG_COMPACTION)) { 2344 if (IS_ENABLED(CONFIG_COMPACTION)) {
2338 /* 2345 /*
2339 * If we already have plenty of memory free for 2346 * If we already have plenty of memory free for
2340 * compaction in this zone, don't free any more. 2347 * compaction in this zone, don't free any more.
2341 * Even though compaction is invoked for any 2348 * Even though compaction is invoked for any
2342 * non-zero order, only frequent costly order 2349 * non-zero order, only frequent costly order
2343 * reclamation is disruptive enough to become a 2350 * reclamation is disruptive enough to become a
2344 * noticeable problem, like transparent huge 2351 * noticeable problem, like transparent huge
2345 * page allocations. 2352 * page allocations.
2346 */ 2353 */
2347 if (compaction_ready(zone, sc)) { 2354 if (compaction_ready(zone, sc)) {
2348 aborted_reclaim = true; 2355 aborted_reclaim = true;
2349 continue; 2356 continue;
2350 } 2357 }
2351 } 2358 }
2352 /* 2359 /*
2353 * This steals pages from memory cgroups over softlimit 2360 * This steals pages from memory cgroups over softlimit
2354 * and returns the number of reclaimed pages and 2361 * and returns the number of reclaimed pages and
2355 * scanned pages. This works for global memory pressure 2362 * scanned pages. This works for global memory pressure
2356 * and balancing, not for a memcg's limit. 2363 * and balancing, not for a memcg's limit.
2357 */ 2364 */
2358 nr_soft_scanned = 0; 2365 nr_soft_scanned = 0;
2359 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 2366 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2360 sc->order, sc->gfp_mask, 2367 sc->order, sc->gfp_mask,
2361 &nr_soft_scanned); 2368 &nr_soft_scanned);
2362 sc->nr_reclaimed += nr_soft_reclaimed; 2369 sc->nr_reclaimed += nr_soft_reclaimed;
2363 sc->nr_scanned += nr_soft_scanned; 2370 sc->nr_scanned += nr_soft_scanned;
2364 /* need some check to avoid more shrink_zone() */ 2371 /* need some check to avoid more shrink_zone() */
2365 } 2372 }
2366 2373
2367 shrink_zone(zone, sc); 2374 shrink_zone(zone, sc);
2368 } 2375 }
2369 2376
2370 return aborted_reclaim; 2377 return aborted_reclaim;
2371 } 2378 }
2372 2379
2373 /* All zones in zonelist are unreclaimable? */ 2380 /* All zones in zonelist are unreclaimable? */
2374 static bool all_unreclaimable(struct zonelist *zonelist, 2381 static bool all_unreclaimable(struct zonelist *zonelist,
2375 struct scan_control *sc) 2382 struct scan_control *sc)
2376 { 2383 {
2377 struct zoneref *z; 2384 struct zoneref *z;
2378 struct zone *zone; 2385 struct zone *zone;
2379 2386
2380 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2387 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2381 gfp_zone(sc->gfp_mask), sc->nodemask) { 2388 gfp_zone(sc->gfp_mask), sc->nodemask) {
2382 if (!populated_zone(zone)) 2389 if (!populated_zone(zone))
2383 continue; 2390 continue;
2384 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2391 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2385 continue; 2392 continue;
2386 if (zone_reclaimable(zone)) 2393 if (zone_reclaimable(zone))
2387 return false; 2394 return false;
2388 } 2395 }
2389 2396
2390 return true; 2397 return true;
2391 } 2398 }
2392 2399
2393 /* 2400 /*
2394 * This is the main entry point to direct page reclaim. 2401 * This is the main entry point to direct page reclaim.
2395 * 2402 *
2396 * If a full scan of the inactive list fails to free enough memory then we 2403 * If a full scan of the inactive list fails to free enough memory then we
2397 * are "out of memory" and something needs to be killed. 2404 * are "out of memory" and something needs to be killed.
2398 * 2405 *
2399 * If the caller is !__GFP_FS then the probability of a failure is reasonably 2406 * If the caller is !__GFP_FS then the probability of a failure is reasonably
2400 * high - the zone may be full of dirty or under-writeback pages, which this 2407 * high - the zone may be full of dirty or under-writeback pages, which this
2401 * caller can't do much about. We kick the writeback threads and take explicit 2408 * caller can't do much about. We kick the writeback threads and take explicit
2402 * naps in the hope that some of these pages can be written. But if the 2409 * naps in the hope that some of these pages can be written. But if the
2403 * allocating task holds filesystem locks which prevent writeout this might not 2410 * allocating task holds filesystem locks which prevent writeout this might not
2404 * work, and the allocation attempt will fail. 2411 * work, and the allocation attempt will fail.
2405 * 2412 *
2406 * returns: 0, if no pages reclaimed 2413 * returns: 0, if no pages reclaimed
2407 * else, the number of pages reclaimed 2414 * else, the number of pages reclaimed
2408 */ 2415 */
2409 static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2416 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2410 struct scan_control *sc, 2417 struct scan_control *sc,
2411 struct shrink_control *shrink) 2418 struct shrink_control *shrink)
2412 { 2419 {
2413 unsigned long total_scanned = 0; 2420 unsigned long total_scanned = 0;
2414 struct reclaim_state *reclaim_state = current->reclaim_state; 2421 struct reclaim_state *reclaim_state = current->reclaim_state;
2415 struct zoneref *z; 2422 struct zoneref *z;
2416 struct zone *zone; 2423 struct zone *zone;
2417 unsigned long writeback_threshold; 2424 unsigned long writeback_threshold;
2418 bool aborted_reclaim; 2425 bool aborted_reclaim;
2419 2426
2420 delayacct_freepages_start(); 2427 delayacct_freepages_start();
2421 2428
2422 if (global_reclaim(sc)) 2429 if (global_reclaim(sc))
2423 count_vm_event(ALLOCSTALL); 2430 count_vm_event(ALLOCSTALL);
2424 2431
2425 do { 2432 do {
2426 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, 2433 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2427 sc->priority); 2434 sc->priority);
2428 sc->nr_scanned = 0; 2435 sc->nr_scanned = 0;
2429 aborted_reclaim = shrink_zones(zonelist, sc); 2436 aborted_reclaim = shrink_zones(zonelist, sc);
2430 2437
2431 /* 2438 /*
2432 * Don't shrink slabs when reclaiming memory from over limit 2439 * Don't shrink slabs when reclaiming memory from over limit
2433 * cgroups but do shrink slab at least once when aborting 2440 * cgroups but do shrink slab at least once when aborting
2434 * reclaim for compaction to avoid unevenly scanning file/anon 2441 * reclaim for compaction to avoid unevenly scanning file/anon
2435 * LRU pages over slab pages. 2442 * LRU pages over slab pages.
2436 */ 2443 */
2437 if (global_reclaim(sc)) { 2444 if (global_reclaim(sc)) {
2438 unsigned long lru_pages = 0; 2445 unsigned long lru_pages = 0;
2439 2446
2440 nodes_clear(shrink->nodes_to_scan); 2447 nodes_clear(shrink->nodes_to_scan);
2441 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2448 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2442 gfp_zone(sc->gfp_mask), sc->nodemask) { 2449 gfp_zone(sc->gfp_mask), sc->nodemask) {
2443 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2450 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2444 continue; 2451 continue;
2445 2452
2446 lru_pages += zone_reclaimable_pages(zone); 2453 lru_pages += zone_reclaimable_pages(zone);
2447 node_set(zone_to_nid(zone), 2454 node_set(zone_to_nid(zone),
2448 shrink->nodes_to_scan); 2455 shrink->nodes_to_scan);
2449 } 2456 }
2450 2457
2451 shrink_slab(shrink, sc->nr_scanned, lru_pages); 2458 shrink_slab(shrink, sc->nr_scanned, lru_pages);
2452 if (reclaim_state) { 2459 if (reclaim_state) {
2453 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2460 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2454 reclaim_state->reclaimed_slab = 0; 2461 reclaim_state->reclaimed_slab = 0;
2455 } 2462 }
2456 } 2463 }
2457 total_scanned += sc->nr_scanned; 2464 total_scanned += sc->nr_scanned;
2458 if (sc->nr_reclaimed >= sc->nr_to_reclaim) 2465 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2459 goto out; 2466 goto out;
2460 2467
2461 /* 2468 /*
2462 * If we're having trouble reclaiming, start doing 2469 * If we're having trouble reclaiming, start doing
2463 * writepage even in laptop mode. 2470 * writepage even in laptop mode.
2464 */ 2471 */
2465 if (sc->priority < DEF_PRIORITY - 2) 2472 if (sc->priority < DEF_PRIORITY - 2)
2466 sc->may_writepage = 1; 2473 sc->may_writepage = 1;
2467 2474
2468 /* 2475 /*
2469 * Try to write back as many pages as we just scanned. This 2476 * Try to write back as many pages as we just scanned. This
2470 * tends to cause slow streaming writers to write data to the 2477 * tends to cause slow streaming writers to write data to the
2471 * disk smoothly, at the dirtying rate, which is nice. But 2478 * disk smoothly, at the dirtying rate, which is nice. But
2472 * that's undesirable in laptop mode, where we *want* lumpy 2479 * that's undesirable in laptop mode, where we *want* lumpy
2473 * writeout. So in laptop mode, write out the whole world. 2480 * writeout. So in laptop mode, write out the whole world.
2474 */ 2481 */
2475 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; 2482 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2476 if (total_scanned > writeback_threshold) { 2483 if (total_scanned > writeback_threshold) {
2477 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, 2484 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2478 WB_REASON_TRY_TO_FREE_PAGES); 2485 WB_REASON_TRY_TO_FREE_PAGES);
2479 sc->may_writepage = 1; 2486 sc->may_writepage = 1;
2480 } 2487 }
2481 } while (--sc->priority >= 0 && !aborted_reclaim); 2488 } while (--sc->priority >= 0 && !aborted_reclaim);
2482 2489
2483 out: 2490 out:
2484 delayacct_freepages_end(); 2491 delayacct_freepages_end();
2485 2492
2486 if (sc->nr_reclaimed) 2493 if (sc->nr_reclaimed)
2487 return sc->nr_reclaimed; 2494 return sc->nr_reclaimed;
2488 2495
2489 /* 2496 /*
2490 * As hibernation is going on, kswapd is frozen so that it can't mark 2497 * As hibernation is going on, kswapd is frozen so that it can't mark
2491 * the zone as all_unreclaimable. Thus we bypass the all_unreclaimable 2498 * the zone as all_unreclaimable. Thus we bypass the all_unreclaimable
2492 * check. 2499 * check.
2493 */ 2500 */
2494 if (oom_killer_disabled) 2501 if (oom_killer_disabled)
2495 return 0; 2502 return 0;
2496 2503
2497 /* Aborted reclaim to try compaction? don't OOM, then */ 2504 /* Aborted reclaim to try compaction? don't OOM, then */
2498 if (aborted_reclaim) 2505 if (aborted_reclaim)
2499 return 1; 2506 return 1;
2500 2507
2501 /* top priority shrink_zones still had more to do? don't OOM, then */ 2508 /* top priority shrink_zones still had more to do? don't OOM, then */
2502 if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) 2509 if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
2503 return 1; 2510 return 1;
2504 2511
2505 return 0; 2512 return 0;
2506 } 2513 }
2507 2514
2508 static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) 2515 static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2509 { 2516 {
2510 struct zone *zone; 2517 struct zone *zone;
2511 unsigned long pfmemalloc_reserve = 0; 2518 unsigned long pfmemalloc_reserve = 0;
2512 unsigned long free_pages = 0; 2519 unsigned long free_pages = 0;
2513 int i; 2520 int i;
2514 bool wmark_ok; 2521 bool wmark_ok;
2515 2522
2516 for (i = 0; i <= ZONE_NORMAL; i++) { 2523 for (i = 0; i <= ZONE_NORMAL; i++) {
2517 zone = &pgdat->node_zones[i]; 2524 zone = &pgdat->node_zones[i];
2518 if (!populated_zone(zone)) 2525 if (!populated_zone(zone))
2519 continue; 2526 continue;
2520 2527
2521 pfmemalloc_reserve += min_wmark_pages(zone); 2528 pfmemalloc_reserve += min_wmark_pages(zone);
2522 free_pages += zone_page_state(zone, NR_FREE_PAGES); 2529 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2523 } 2530 }
2524 2531
2525 /* If there are no reserves (unexpected config) then do not throttle */ 2532 /* If there are no reserves (unexpected config) then do not throttle */
2526 if (!pfmemalloc_reserve) 2533 if (!pfmemalloc_reserve)
2527 return true; 2534 return true;
2528 2535
2529 wmark_ok = free_pages > pfmemalloc_reserve / 2; 2536 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2530 2537
2531 /* kswapd must be awake if processes are being throttled */ 2538 /* kswapd must be awake if processes are being throttled */
2532 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { 2539 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2533 pgdat->classzone_idx = min(pgdat->classzone_idx, 2540 pgdat->classzone_idx = min(pgdat->classzone_idx,
2534 (enum zone_type)ZONE_NORMAL); 2541 (enum zone_type)ZONE_NORMAL);
2535 wake_up_interruptible(&pgdat->kswapd_wait); 2542 wake_up_interruptible(&pgdat->kswapd_wait);
2536 } 2543 }
2537 2544
2538 return wmark_ok; 2545 return wmark_ok;
2539 } 2546 }
2540 2547
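pfmemalloc_watermark_ok() above reduces to one comparison: the free pages of the usable zones must exceed half of their summed min watermarks. A toy version with invented per-zone numbers; pfmemalloc_ok() and the flat arrays are stand-ins for the pgdat walk.

#include <stdio.h>

/*
 * Sum the min watermarks (the PFMEMALLOC reserve) and the free pages of
 * the usable zones; direct reclaimers get throttled once free pages drop
 * to half of the reserve or below.
 */
static int pfmemalloc_ok(const unsigned long *min_wmark,
			 const unsigned long *zone_free, int nr_zones)
{
	unsigned long reserve = 0, nr_free = 0;
	int i;

	for (i = 0; i < nr_zones; i++) {
		reserve += min_wmark[i];
		nr_free += zone_free[i];
	}
	if (!reserve)		/* no reserves configured: never throttle */
		return 1;
	return nr_free > reserve / 2;
}

int main(void)
{
	unsigned long wmark[2]     = { 1000, 4000 };	/* ZONE_DMA, ZONE_NORMAL */
	unsigned long zone_free[2] = { 300, 1500 };

	/* 1800 free vs 5000 / 2 = 2500 reserve: throttle (prints 0). */
	printf("%d\n", pfmemalloc_ok(wmark, zone_free, 2));
	return 0;
}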
2541 /* 2548 /*
2542 * Throttle direct reclaimers if backing storage is backed by the network 2549 * Throttle direct reclaimers if backing storage is backed by the network
2543 * and the PFMEMALLOC reserve for the preferred node is getting dangerously 2550 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2544 * depleted. kswapd will continue to make progress and wake the processes 2551 * depleted. kswapd will continue to make progress and wake the processes
2545 * when the low watermark is reached. 2552 * when the low watermark is reached.
2546 * 2553 *
2547 * Returns true if a fatal signal was delivered during throttling. If this 2554 * Returns true if a fatal signal was delivered during throttling. If this
2548 * happens, the page allocator should not consider triggering the OOM killer. 2555 * happens, the page allocator should not consider triggering the OOM killer.
2549 */ 2556 */
2550 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 2557 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2551 nodemask_t *nodemask) 2558 nodemask_t *nodemask)
2552 { 2559 {
2553 struct zoneref *z; 2560 struct zoneref *z;
2554 struct zone *zone; 2561 struct zone *zone;
2555 pg_data_t *pgdat = NULL; 2562 pg_data_t *pgdat = NULL;
2556 2563
2557 /* 2564 /*
2558 * Kernel threads should not be throttled as they may be indirectly 2565 * Kernel threads should not be throttled as they may be indirectly
2559 * responsible for cleaning pages necessary for reclaim to make forward 2566 * responsible for cleaning pages necessary for reclaim to make forward
2560 * progress. kjournald for example may enter direct reclaim while 2567 * progress. kjournald for example may enter direct reclaim while
2561 * committing a transaction, where throttling it could force other 2568 * committing a transaction, where throttling it could force other
2562 * processes to block on log_wait_commit(). 2569 * processes to block on log_wait_commit().
2563 */ 2570 */
2564 if (current->flags & PF_KTHREAD) 2571 if (current->flags & PF_KTHREAD)
2565 goto out; 2572 goto out;
2566 2573
2567 /* 2574 /*
2568 * If a fatal signal is pending, this process should not throttle. 2575 * If a fatal signal is pending, this process should not throttle.
2569 * It should return quickly so it can exit and free its memory 2576 * It should return quickly so it can exit and free its memory
2570 */ 2577 */
2571 if (fatal_signal_pending(current)) 2578 if (fatal_signal_pending(current))
2572 goto out; 2579 goto out;
2573 2580
2574 /* 2581 /*
2575 * Check if the pfmemalloc reserves are ok by finding the first node 2582 * Check if the pfmemalloc reserves are ok by finding the first node
2576 * with a usable ZONE_NORMAL or lower zone. The expectation is that 2583 * with a usable ZONE_NORMAL or lower zone. The expectation is that
2577 * GFP_KERNEL will be required for allocating network buffers when 2584 * GFP_KERNEL will be required for allocating network buffers when
2578 * swapping over the network so ZONE_HIGHMEM is unusable. 2585 * swapping over the network so ZONE_HIGHMEM is unusable.
2579 * 2586 *
2580 * Throttling is based on the first usable node and throttled processes 2587 * Throttling is based on the first usable node and throttled processes
2581 * wait on a queue until kswapd makes progress and wakes them. There 2588 * wait on a queue until kswapd makes progress and wakes them. There
2582 * is an affinity then between processes waking up and where reclaim 2589 * is an affinity then between processes waking up and where reclaim
2583 * progress has been made assuming the process wakes on the same node. 2590 * progress has been made assuming the process wakes on the same node.
2584 * More importantly, processes running on remote nodes will not compete 2591 * More importantly, processes running on remote nodes will not compete
2585 * for remote pfmemalloc reserves and processes on different nodes 2592 * for remote pfmemalloc reserves and processes on different nodes
2586 * should make reasonable progress. 2593 * should make reasonable progress.
2587 */ 2594 */
2588 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2595 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2589 gfp_mask, nodemask) { 2596 gfp_mask, nodemask) {
2590 if (zone_idx(zone) > ZONE_NORMAL) 2597 if (zone_idx(zone) > ZONE_NORMAL)
2591 continue; 2598 continue;
2592 2599
2593 /* Throttle based on the first usable node */ 2600 /* Throttle based on the first usable node */
2594 pgdat = zone->zone_pgdat; 2601 pgdat = zone->zone_pgdat;
2595 if (pfmemalloc_watermark_ok(pgdat)) 2602 if (pfmemalloc_watermark_ok(pgdat))
2596 goto out; 2603 goto out;
2597 break; 2604 break;
2598 } 2605 }
2599 2606
2600 /* If no zone was usable by the allocation flags then do not throttle */ 2607 /* If no zone was usable by the allocation flags then do not throttle */
2601 if (!pgdat) 2608 if (!pgdat)
2602 goto out; 2609 goto out;
2603 2610
2604 /* Account for the throttling */ 2611 /* Account for the throttling */
2605 count_vm_event(PGSCAN_DIRECT_THROTTLE); 2612 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2606 2613
2607 /* 2614 /*
2608 * If the caller cannot enter the filesystem, it's possible that it 2615 * If the caller cannot enter the filesystem, it's possible that it
2609 * is due to the caller holding an FS lock or performing a journal 2616 * is due to the caller holding an FS lock or performing a journal
2610 * transaction in the case of a filesystem like ext[3|4]. In this case, 2617 * transaction in the case of a filesystem like ext[3|4]. In this case,
2611 * it is not safe to block on pfmemalloc_wait as kswapd could be 2618 * it is not safe to block on pfmemalloc_wait as kswapd could be
2612 * blocked waiting on the same lock. Instead, throttle for up to a 2619 * blocked waiting on the same lock. Instead, throttle for up to a
2613 * second before continuing. 2620 * second before continuing.
2614 */ 2621 */
2615 if (!(gfp_mask & __GFP_FS)) { 2622 if (!(gfp_mask & __GFP_FS)) {
2616 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, 2623 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2617 pfmemalloc_watermark_ok(pgdat), HZ); 2624 pfmemalloc_watermark_ok(pgdat), HZ);
2618 2625
2619 goto check_pending; 2626 goto check_pending;
2620 } 2627 }
2621 2628
2622 /* Throttle until kswapd wakes the process */ 2629 /* Throttle until kswapd wakes the process */
2623 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, 2630 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2624 pfmemalloc_watermark_ok(pgdat)); 2631 pfmemalloc_watermark_ok(pgdat));
2625 2632
2626 check_pending: 2633 check_pending:
2627 if (fatal_signal_pending(current)) 2634 if (fatal_signal_pending(current))
2628 return true; 2635 return true;
2629 2636
2630 out: 2637 out:
2631 return false; 2638 return false;
2632 } 2639 }
2633 2640
2634 unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2641 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2635 gfp_t gfp_mask, nodemask_t *nodemask) 2642 gfp_t gfp_mask, nodemask_t *nodemask)
2636 { 2643 {
2637 unsigned long nr_reclaimed; 2644 unsigned long nr_reclaimed;
2638 struct scan_control sc = { 2645 struct scan_control sc = {
2639 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), 2646 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2640 .may_writepage = !laptop_mode, 2647 .may_writepage = !laptop_mode,
2641 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2648 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2642 .may_unmap = 1, 2649 .may_unmap = 1,
2643 .may_swap = 1, 2650 .may_swap = 1,
2644 .order = order, 2651 .order = order,
2645 .priority = DEF_PRIORITY, 2652 .priority = DEF_PRIORITY,
2646 .target_mem_cgroup = NULL, 2653 .target_mem_cgroup = NULL,
2647 .nodemask = nodemask, 2654 .nodemask = nodemask,
2648 }; 2655 };
2649 struct shrink_control shrink = { 2656 struct shrink_control shrink = {
2650 .gfp_mask = sc.gfp_mask, 2657 .gfp_mask = sc.gfp_mask,
2651 }; 2658 };
2652 2659
2653 /* 2660 /*
2654 * Do not enter reclaim if fatal signal was delivered while throttled. 2661 * Do not enter reclaim if fatal signal was delivered while throttled.
2655 * 1 is returned so that the page allocator does not OOM kill at this 2662 * 1 is returned so that the page allocator does not OOM kill at this
2656 * point. 2663 * point.
2657 */ 2664 */
2658 if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) 2665 if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
2659 return 1; 2666 return 1;
2660 2667
2661 trace_mm_vmscan_direct_reclaim_begin(order, 2668 trace_mm_vmscan_direct_reclaim_begin(order,
2662 sc.may_writepage, 2669 sc.may_writepage,
2663 gfp_mask); 2670 gfp_mask);
2664 2671
2665 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2672 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2666 2673
2667 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 2674 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2668 2675
2669 return nr_reclaimed; 2676 return nr_reclaimed;
2670 } 2677 }
2671 2678
2672 #ifdef CONFIG_MEMCG 2679 #ifdef CONFIG_MEMCG
2673 2680
2674 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2681 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2675 gfp_t gfp_mask, bool noswap, 2682 gfp_t gfp_mask, bool noswap,
2676 struct zone *zone, 2683 struct zone *zone,
2677 unsigned long *nr_scanned) 2684 unsigned long *nr_scanned)
2678 { 2685 {
2679 struct scan_control sc = { 2686 struct scan_control sc = {
2680 .nr_scanned = 0, 2687 .nr_scanned = 0,
2681 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2688 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2682 .may_writepage = !laptop_mode, 2689 .may_writepage = !laptop_mode,
2683 .may_unmap = 1, 2690 .may_unmap = 1,
2684 .may_swap = !noswap, 2691 .may_swap = !noswap,
2685 .order = 0, 2692 .order = 0,
2686 .priority = 0, 2693 .priority = 0,
2687 .target_mem_cgroup = memcg, 2694 .target_mem_cgroup = memcg,
2688 }; 2695 };
2689 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2696 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2690 2697
2691 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2698 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2692 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2699 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2693 2700
2694 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, 2701 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2695 sc.may_writepage, 2702 sc.may_writepage,
2696 sc.gfp_mask); 2703 sc.gfp_mask);
2697 2704
2698 /* 2705 /*
2699 * NOTE: Although we can get the priority field, using it 2706 * NOTE: Although we can get the priority field, using it
2700 * here is not a good idea, since it limits the pages we can scan. 2707 * here is not a good idea, since it limits the pages we can scan.
2701 * If we don't reclaim here, the shrink_zone from balance_pgdat 2708 * If we don't reclaim here, the shrink_zone from balance_pgdat
2702 * will pick up pages from other mem cgroups as well. We hack 2709 * will pick up pages from other mem cgroups as well. We hack
2703 * the priority and make it zero. 2710 * the priority and make it zero.
2704 */ 2711 */
2705 shrink_lruvec(lruvec, &sc); 2712 shrink_lruvec(lruvec, &sc);
2706 2713
2707 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2714 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2708 2715
2709 *nr_scanned = sc.nr_scanned; 2716 *nr_scanned = sc.nr_scanned;
2710 return sc.nr_reclaimed; 2717 return sc.nr_reclaimed;
2711 } 2718 }
2712 2719
2713 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 2720 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2714 gfp_t gfp_mask, 2721 gfp_t gfp_mask,
2715 bool noswap) 2722 bool noswap)
2716 { 2723 {
2717 struct zonelist *zonelist; 2724 struct zonelist *zonelist;
2718 unsigned long nr_reclaimed; 2725 unsigned long nr_reclaimed;
2719 int nid; 2726 int nid;
2720 struct scan_control sc = { 2727 struct scan_control sc = {
2721 .may_writepage = !laptop_mode, 2728 .may_writepage = !laptop_mode,
2722 .may_unmap = 1, 2729 .may_unmap = 1,
2723 .may_swap = !noswap, 2730 .may_swap = !noswap,
2724 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2731 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2725 .order = 0, 2732 .order = 0,
2726 .priority = DEF_PRIORITY, 2733 .priority = DEF_PRIORITY,
2727 .target_mem_cgroup = memcg, 2734 .target_mem_cgroup = memcg,
2728 .nodemask = NULL, /* we don't care about the placement */ 2735 .nodemask = NULL, /* we don't care about the placement */
2729 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2736 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2730 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2737 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2731 }; 2738 };
2732 struct shrink_control shrink = { 2739 struct shrink_control shrink = {
2733 .gfp_mask = sc.gfp_mask, 2740 .gfp_mask = sc.gfp_mask,
2734 }; 2741 };
2735 2742
2736 /* 2743 /*
2737 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't 2744 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2738 * care where we get pages from. So the node where we start the 2745 * care where we get pages from. So the node where we start the
2739 * scan does not need to be the current node. 2746 * scan does not need to be the current node.
2740 */ 2747 */
2741 nid = mem_cgroup_select_victim_node(memcg); 2748 nid = mem_cgroup_select_victim_node(memcg);
2742 2749
2743 zonelist = NODE_DATA(nid)->node_zonelists; 2750 zonelist = NODE_DATA(nid)->node_zonelists;
2744 2751
2745 trace_mm_vmscan_memcg_reclaim_begin(0, 2752 trace_mm_vmscan_memcg_reclaim_begin(0,
2746 sc.may_writepage, 2753 sc.may_writepage,
2747 sc.gfp_mask); 2754 sc.gfp_mask);
2748 2755
2749 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2756 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2750 2757
2751 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2758 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2752 2759
2753 return nr_reclaimed; 2760 return nr_reclaimed;
2754 } 2761 }
2755 #endif 2762 #endif
2756 2763
2757 static void age_active_anon(struct zone *zone, struct scan_control *sc) 2764 static void age_active_anon(struct zone *zone, struct scan_control *sc)
2758 { 2765 {
2759 struct mem_cgroup *memcg; 2766 struct mem_cgroup *memcg;
2760 2767
2761 if (!total_swap_pages) 2768 if (!total_swap_pages)
2762 return; 2769 return;
2763 2770
2764 memcg = mem_cgroup_iter(NULL, NULL, NULL); 2771 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2765 do { 2772 do {
2766 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2773 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2767 2774
2768 if (inactive_anon_is_low(lruvec)) 2775 if (inactive_anon_is_low(lruvec))
2769 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 2776 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2770 sc, LRU_ACTIVE_ANON); 2777 sc, LRU_ACTIVE_ANON);
2771 2778
2772 memcg = mem_cgroup_iter(NULL, memcg, NULL); 2779 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2773 } while (memcg); 2780 } while (memcg);
2774 } 2781 }
2775 2782
2776 static bool zone_balanced(struct zone *zone, int order, 2783 static bool zone_balanced(struct zone *zone, int order,
2777 unsigned long balance_gap, int classzone_idx) 2784 unsigned long balance_gap, int classzone_idx)
2778 { 2785 {
2779 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + 2786 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
2780 balance_gap, classzone_idx, 0)) 2787 balance_gap, classzone_idx, 0))
2781 return false; 2788 return false;
2782 2789
2783 if (IS_ENABLED(CONFIG_COMPACTION) && order && 2790 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2784 !compaction_suitable(zone, order)) 2791 !compaction_suitable(zone, order))
2785 return false; 2792 return false;
2786 2793
2787 return true; 2794 return true;
2788 } 2795 }
2789 2796
2790 /* 2797 /*
2791 * pgdat_balanced() is used when checking if a node is balanced. 2798 * pgdat_balanced() is used when checking if a node is balanced.
2792 * 2799 *
2793 * For order-0, all zones must be balanced! 2800 * For order-0, all zones must be balanced!
2794 * 2801 *
2795 * For high-order allocations only zones that meet watermarks and are in a 2802 * For high-order allocations only zones that meet watermarks and are in a
2796 * zone allowed by the caller's classzone_idx are added to balanced_pages. The 2803 * zone allowed by the caller's classzone_idx are added to balanced_pages. The
2797 * total of balanced pages must be at least 25% of the zones allowed by 2804 * total of balanced pages must be at least 25% of the zones allowed by
2798 * classzone_idx for the node to be considered balanced. Forcing all zones to 2805 * classzone_idx for the node to be considered balanced. Forcing all zones to
2799 * be balanced for high orders can cause excessive reclaim when there are 2806 * be balanced for high orders can cause excessive reclaim when there are
2800 * imbalanced zones. 2807 * imbalanced zones.
2801 * The choice of 25% is due to 2808 * The choice of 25% is due to
2802 * o a 16M DMA zone that is balanced will not balance a zone on any 2809 * o a 16M DMA zone that is balanced will not balance a zone on any
2803 * reasonable sized machine 2810 * reasonable sized machine
2804 * o On all other machines, the top zone must be at least a reasonable 2811 * o On all other machines, the top zone must be at least a reasonable
2805 * percentage of the middle zones. For example, on 32-bit x86, highmem 2812 * percentage of the middle zones. For example, on 32-bit x86, highmem
2806 * would need to be at least 256M for it to balance a whole node. 2813 * would need to be at least 256M for it to balance a whole node.
2807 * Similarly, on x86-64 the Normal zone would need to be at least 1G 2814 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2808 * to balance a node on its own. These seemed like reasonable ratios. 2815 * to balance a node on its own. These seemed like reasonable ratios.
2809 */ 2816 */
2810 static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) 2817 static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2811 { 2818 {
2812 unsigned long managed_pages = 0; 2819 unsigned long managed_pages = 0;
2813 unsigned long balanced_pages = 0; 2820 unsigned long balanced_pages = 0;
2814 int i; 2821 int i;
2815 2822
2816 /* Check the watermark levels */ 2823 /* Check the watermark levels */
2817 for (i = 0; i <= classzone_idx; i++) { 2824 for (i = 0; i <= classzone_idx; i++) {
2818 struct zone *zone = pgdat->node_zones + i; 2825 struct zone *zone = pgdat->node_zones + i;
2819 2826
2820 if (!populated_zone(zone)) 2827 if (!populated_zone(zone))
2821 continue; 2828 continue;
2822 2829
2823 managed_pages += zone->managed_pages; 2830 managed_pages += zone->managed_pages;
2824 2831
2825 /* 2832 /*
2826 * A special case here: 2833 * A special case here:
2827 * 2834 *
2828 * balance_pgdat() skips over all_unreclaimable after 2835 * balance_pgdat() skips over all_unreclaimable after
2829 * DEF_PRIORITY. Effectively, it considers them balanced so 2836 * DEF_PRIORITY. Effectively, it considers them balanced so
2830 * they must be considered balanced here as well! 2837 * they must be considered balanced here as well!
2831 */ 2838 */
2832 if (!zone_reclaimable(zone)) { 2839 if (!zone_reclaimable(zone)) {
2833 balanced_pages += zone->managed_pages; 2840 balanced_pages += zone->managed_pages;
2834 continue; 2841 continue;
2835 } 2842 }
2836 2843
2837 if (zone_balanced(zone, order, 0, i)) 2844 if (zone_balanced(zone, order, 0, i))
2838 balanced_pages += zone->managed_pages; 2845 balanced_pages += zone->managed_pages;
2839 else if (!order) 2846 else if (!order)
2840 return false; 2847 return false;
2841 } 2848 }
2842 2849
2843 if (order) 2850 if (order)
2844 return balanced_pages >= (managed_pages >> 2); 2851 return balanced_pages >= (managed_pages >> 2);
2845 else 2852 else
2846 return true; 2853 return true;
2847 } 2854 }
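The 25% threshold described in the comment above reduces to a single shift: with a nonzero order, the node counts as balanced once balanced_pages >= managed_pages >> 2. A minimal userspace sketch of that arithmetic, using made-up zone sizes and assumed per-zone watermark results rather than real kernel data:

#include <stdio.h>

/* Model of the pgdat_balanced() order > 0 check: a node counts as
 * balanced when at least a quarter of its managed pages sit in zones
 * that individually meet their watermarks. Values are illustrative. */
int main(void)
{
	/* hypothetical zone sizes in pages: DMA, Normal, HighMem */
	unsigned long managed[] = { 4096, 221184, 32768 };
	int meets_watermark[] = { 1, 0, 1 };	/* assumed per-zone results */
	unsigned long managed_pages = 0, balanced_pages = 0;

	for (int i = 0; i < 3; i++) {
		managed_pages += managed[i];
		if (meets_watermark[i])
			balanced_pages += managed[i];
	}

	printf("balanced %lu of %lu pages, need %lu -> %s\n",
	       balanced_pages, managed_pages, managed_pages >> 2,
	       balanced_pages >= (managed_pages >> 2) ? "balanced" : "not balanced");
	return 0;
}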
2848 2855
2849 /* 2856 /*
2850 * Prepare kswapd for sleeping. This verifies that there are no processes 2857 * Prepare kswapd for sleeping. This verifies that there are no processes
2851 * waiting in throttle_direct_reclaim() and that watermarks have been met. 2858 * waiting in throttle_direct_reclaim() and that watermarks have been met.
2852 * 2859 *
2853 * Returns true if kswapd is ready to sleep 2860 * Returns true if kswapd is ready to sleep
2854 */ 2861 */
2855 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, 2862 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2856 int classzone_idx) 2863 int classzone_idx)
2857 { 2864 {
2858 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2865 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2859 if (remaining) 2866 if (remaining)
2860 return false; 2867 return false;
2861 2868
2862 /* 2869 /*
2863 * There is a potential race between when kswapd checks its watermarks 2870 * There is a potential race between when kswapd checks its watermarks
2864 * and a process gets throttled. There is also a potential race if 2871 * and a process gets throttled. There is also a potential race if
2865 * processes get throttled, kswapd wakes, a large process exits thereby 2872 * processes get throttled, kswapd wakes, a large process exits thereby
2866 * balancing the zones that causes kswapd to miss a wakeup. If kswapd 2873 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
2867 * is going to sleep, no process should be sleeping on pfmemalloc_wait 2874 * is going to sleep, no process should be sleeping on pfmemalloc_wait
2868 * so wake them now if necessary. If necessary, processes will wake 2875 * so wake them now if necessary. If necessary, processes will wake
2869 * kswapd and get throttled again 2876 * kswapd and get throttled again
2870 */ 2877 */
2871 if (waitqueue_active(&pgdat->pfmemalloc_wait)) { 2878 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2872 wake_up(&pgdat->pfmemalloc_wait); 2879 wake_up(&pgdat->pfmemalloc_wait);
2873 return false; 2880 return false;
2874 } 2881 }
2875 2882
2876 return pgdat_balanced(pgdat, order, classzone_idx); 2883 return pgdat_balanced(pgdat, order, classzone_idx);
2877 } 2884 }
2878 2885
2879 /* 2886 /*
2880 * kswapd shrinks the zone by the number of pages required to reach 2887 * kswapd shrinks the zone by the number of pages required to reach
2881 * the high watermark. 2888 * the high watermark.
2882 * 2889 *
2883 * Returns true if kswapd scanned at least the requested number of pages to 2890 * Returns true if kswapd scanned at least the requested number of pages to
2884 * reclaim or if the lack of progress was due to pages under writeback. 2891 * reclaim or if the lack of progress was due to pages under writeback.
2885 * This is used to determine if the scanning priority needs to be raised. 2892 * This is used to determine if the scanning priority needs to be raised.
2886 */ 2893 */
2887 static bool kswapd_shrink_zone(struct zone *zone, 2894 static bool kswapd_shrink_zone(struct zone *zone,
2888 int classzone_idx, 2895 int classzone_idx,
2889 struct scan_control *sc, 2896 struct scan_control *sc,
2890 unsigned long lru_pages, 2897 unsigned long lru_pages,
2891 unsigned long *nr_attempted) 2898 unsigned long *nr_attempted)
2892 { 2899 {
2893 int testorder = sc->order; 2900 int testorder = sc->order;
2894 unsigned long balance_gap; 2901 unsigned long balance_gap;
2895 struct reclaim_state *reclaim_state = current->reclaim_state; 2902 struct reclaim_state *reclaim_state = current->reclaim_state;
2896 struct shrink_control shrink = { 2903 struct shrink_control shrink = {
2897 .gfp_mask = sc->gfp_mask, 2904 .gfp_mask = sc->gfp_mask,
2898 }; 2905 };
2899 bool lowmem_pressure; 2906 bool lowmem_pressure;
2900 2907
2901 /* Reclaim above the high watermark. */ 2908 /* Reclaim above the high watermark. */
2902 sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); 2909 sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
2903 2910
2904 /* 2911 /*
2905 * Kswapd reclaims only single pages with compaction enabled. Trying 2912 * Kswapd reclaims only single pages with compaction enabled. Trying
2906 * too hard to reclaim until contiguous free pages have become 2913 * too hard to reclaim until contiguous free pages have become
2907 * available can hurt performance by evicting too much useful data 2914 * available can hurt performance by evicting too much useful data
2908 * from memory. Do not reclaim more than needed for compaction. 2915 * from memory. Do not reclaim more than needed for compaction.
2909 */ 2916 */
2910 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 2917 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2911 compaction_suitable(zone, sc->order) != 2918 compaction_suitable(zone, sc->order) !=
2912 COMPACT_SKIPPED) 2919 COMPACT_SKIPPED)
2913 testorder = 0; 2920 testorder = 0;
2914 2921
2915 /* 2922 /*
2916 * We put equal pressure on every zone, unless one zone has way too 2923 * We put equal pressure on every zone, unless one zone has way too
2917 * many pages free already. The "too many pages" is defined as the 2924 * many pages free already. The "too many pages" is defined as the
2918 * high wmark plus a "gap" where the gap is either the low 2925 * high wmark plus a "gap" where the gap is either the low
2919 * watermark or 1% of the zone, whichever is smaller. 2926 * watermark or 1% of the zone, whichever is smaller.
2920 */ 2927 */
2921 balance_gap = min(low_wmark_pages(zone), 2928 balance_gap = min(low_wmark_pages(zone),
2922 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2929 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2923 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2930 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2924 2931
2925 /* 2932 /*
2926 * If there is no low memory pressure or the zone is balanced then no 2933 * If there is no low memory pressure or the zone is balanced then no
2927 * reclaim is necessary 2934 * reclaim is necessary
2928 */ 2935 */
2929 lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); 2936 lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
2930 if (!lowmem_pressure && zone_balanced(zone, testorder, 2937 if (!lowmem_pressure && zone_balanced(zone, testorder,
2931 balance_gap, classzone_idx)) 2938 balance_gap, classzone_idx))
2932 return true; 2939 return true;
2933 2940
2934 shrink_zone(zone, sc); 2941 shrink_zone(zone, sc);
2935 nodes_clear(shrink.nodes_to_scan); 2942 nodes_clear(shrink.nodes_to_scan);
2936 node_set(zone_to_nid(zone), shrink.nodes_to_scan); 2943 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
2937 2944
2938 reclaim_state->reclaimed_slab = 0; 2945 reclaim_state->reclaimed_slab = 0;
2939 shrink_slab(&shrink, sc->nr_scanned, lru_pages); 2946 shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2940 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2947 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2941 2948
2942 /* Account for the number of pages attempted to reclaim */ 2949 /* Account for the number of pages attempted to reclaim */
2943 *nr_attempted += sc->nr_to_reclaim; 2950 *nr_attempted += sc->nr_to_reclaim;
2944 2951
2945 zone_clear_flag(zone, ZONE_WRITEBACK); 2952 zone_clear_flag(zone, ZONE_WRITEBACK);
2946 2953
2947 /* 2954 /*
2948 * If a zone reaches its high watermark, consider it to be no longer 2955 * If a zone reaches its high watermark, consider it to be no longer
2949 * congested. It's possible there are dirty pages backed by congested 2956 * congested. It's possible there are dirty pages backed by congested
2950 * BDIs but as pressure is relieved, speculatively avoid congestion 2957 * BDIs but as pressure is relieved, speculatively avoid congestion
2951 * waits. 2958 * waits.
2952 */ 2959 */
2953 if (zone_reclaimable(zone) && 2960 if (zone_reclaimable(zone) &&
2954 zone_balanced(zone, testorder, 0, classzone_idx)) { 2961 zone_balanced(zone, testorder, 0, classzone_idx)) {
2955 zone_clear_flag(zone, ZONE_CONGESTED); 2962 zone_clear_flag(zone, ZONE_CONGESTED);
2956 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 2963 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2957 } 2964 }
2958 2965
2959 return sc->nr_scanned >= sc->nr_to_reclaim; 2966 return sc->nr_scanned >= sc->nr_to_reclaim;
2960 } 2967 }
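The balance gap computed above is the smaller of the low watermark and roughly 1% of the zone, using a rounding-up divide. A small userspace sketch of that calculation with assumed numbers (KSWAPD_ZONE_BALANCE_GAP_RATIO is taken to be 100 here, consistent with the "1%" wording in the comment):

#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100	/* assumed: ~1% of the zone */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* hypothetical zone: 1 GiB of 4 KiB pages, low watermark of 1500 pages */
	unsigned long managed_pages = 262144;
	unsigned long low_wmark = 1500;

	/* round the 1% share up, then take the smaller of the two bounds */
	unsigned long one_percent =
		(managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
		KSWAPD_ZONE_BALANCE_GAP_RATIO;
	unsigned long balance_gap = min_ul(low_wmark, one_percent);

	printf("gap = min(%lu, %lu) = %lu pages\n",
	       low_wmark, one_percent, balance_gap);
	return 0;
}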
2961 2968
2962 /* 2969 /*
2963 * For kswapd, balance_pgdat() will work across all this node's zones until 2970 * For kswapd, balance_pgdat() will work across all this node's zones until
2964 * they are all at high_wmark_pages(zone). 2971 * they are all at high_wmark_pages(zone).
2965 * 2972 *
2966 * Returns the final order kswapd was reclaiming at 2973 * Returns the final order kswapd was reclaiming at
2967 * 2974 *
2968 * There is special handling here for zones which are full of pinned pages. 2975 * There is special handling here for zones which are full of pinned pages.
2969 * This can happen if the pages are all mlocked, or if they are all used by 2976 * This can happen if the pages are all mlocked, or if they are all used by
2970 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. 2977 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
2971 * What we do is to detect the case where all pages in the zone have been 2978 * What we do is to detect the case where all pages in the zone have been
2972 * scanned twice and there has been zero successful reclaim. Mark the zone as 2979 * scanned twice and there has been zero successful reclaim. Mark the zone as
2973 * dead and from now on, only perform a short scan. Basically we're polling 2980 * dead and from now on, only perform a short scan. Basically we're polling
2974 * the zone for when the problem goes away. 2981 * the zone for when the problem goes away.
2975 * 2982 *
2976 * kswapd scans the zones in the highmem->normal->dma direction. It skips 2983 * kswapd scans the zones in the highmem->normal->dma direction. It skips
2977 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 2984 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
2978 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the 2985 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
2979 * lower zones regardless of the number of free pages in the lower zones. This 2986 * lower zones regardless of the number of free pages in the lower zones. This
2980 * interoperates with the page allocator fallback scheme to ensure that aging 2987 * interoperates with the page allocator fallback scheme to ensure that aging
2981 * of pages is balanced across the zones. 2988 * of pages is balanced across the zones.
2982 */ 2989 */
2983 static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2990 static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2984 int *classzone_idx) 2991 int *classzone_idx)
2985 { 2992 {
2986 int i; 2993 int i;
2987 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2994 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2988 unsigned long nr_soft_reclaimed; 2995 unsigned long nr_soft_reclaimed;
2989 unsigned long nr_soft_scanned; 2996 unsigned long nr_soft_scanned;
2990 struct scan_control sc = { 2997 struct scan_control sc = {
2991 .gfp_mask = GFP_KERNEL, 2998 .gfp_mask = GFP_KERNEL,
2992 .priority = DEF_PRIORITY, 2999 .priority = DEF_PRIORITY,
2993 .may_unmap = 1, 3000 .may_unmap = 1,
2994 .may_swap = 1, 3001 .may_swap = 1,
2995 .may_writepage = !laptop_mode, 3002 .may_writepage = !laptop_mode,
2996 .order = order, 3003 .order = order,
2997 .target_mem_cgroup = NULL, 3004 .target_mem_cgroup = NULL,
2998 }; 3005 };
2999 count_vm_event(PAGEOUTRUN); 3006 count_vm_event(PAGEOUTRUN);
3000 3007
3001 do { 3008 do {
3002 unsigned long lru_pages = 0; 3009 unsigned long lru_pages = 0;
3003 unsigned long nr_attempted = 0; 3010 unsigned long nr_attempted = 0;
3004 bool raise_priority = true; 3011 bool raise_priority = true;
3005 bool pgdat_needs_compaction = (order > 0); 3012 bool pgdat_needs_compaction = (order > 0);
3006 3013
3007 sc.nr_reclaimed = 0; 3014 sc.nr_reclaimed = 0;
3008 3015
3009 /* 3016 /*
3010 * Scan in the highmem->dma direction for the highest 3017 * Scan in the highmem->dma direction for the highest
3011 * zone which needs scanning 3018 * zone which needs scanning
3012 */ 3019 */
3013 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 3020 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
3014 struct zone *zone = pgdat->node_zones + i; 3021 struct zone *zone = pgdat->node_zones + i;
3015 3022
3016 if (!populated_zone(zone)) 3023 if (!populated_zone(zone))
3017 continue; 3024 continue;
3018 3025
3019 if (sc.priority != DEF_PRIORITY && 3026 if (sc.priority != DEF_PRIORITY &&
3020 !zone_reclaimable(zone)) 3027 !zone_reclaimable(zone))
3021 continue; 3028 continue;
3022 3029
3023 /* 3030 /*
3024 * Do some background aging of the anon list, to give 3031 * Do some background aging of the anon list, to give
3025 * pages a chance to be referenced before reclaiming. 3032 * pages a chance to be referenced before reclaiming.
3026 */ 3033 */
3027 age_active_anon(zone, &sc); 3034 age_active_anon(zone, &sc);
3028 3035
3029 /* 3036 /*
3030 * If the number of buffer_heads in the machine 3037 * If the number of buffer_heads in the machine
3031 * exceeds the maximum allowed level and this node 3038 * exceeds the maximum allowed level and this node
3032 * has a highmem zone, force kswapd to reclaim from 3039 * has a highmem zone, force kswapd to reclaim from
3033 * it to relieve lowmem pressure. 3040 * it to relieve lowmem pressure.
3034 */ 3041 */
3035 if (buffer_heads_over_limit && is_highmem_idx(i)) { 3042 if (buffer_heads_over_limit && is_highmem_idx(i)) {
3036 end_zone = i; 3043 end_zone = i;
3037 break; 3044 break;
3038 } 3045 }
3039 3046
3040 if (!zone_balanced(zone, order, 0, 0)) { 3047 if (!zone_balanced(zone, order, 0, 0)) {
3041 end_zone = i; 3048 end_zone = i;
3042 break; 3049 break;
3043 } else { 3050 } else {
3044 /* 3051 /*
3045 * If balanced, clear the dirty and congested 3052 * If balanced, clear the dirty and congested
3046 * flags 3053 * flags
3047 */ 3054 */
3048 zone_clear_flag(zone, ZONE_CONGESTED); 3055 zone_clear_flag(zone, ZONE_CONGESTED);
3049 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 3056 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
3050 } 3057 }
3051 } 3058 }
3052 3059
3053 if (i < 0) 3060 if (i < 0)
3054 goto out; 3061 goto out;
3055 3062
3056 for (i = 0; i <= end_zone; i++) { 3063 for (i = 0; i <= end_zone; i++) {
3057 struct zone *zone = pgdat->node_zones + i; 3064 struct zone *zone = pgdat->node_zones + i;
3058 3065
3059 if (!populated_zone(zone)) 3066 if (!populated_zone(zone))
3060 continue; 3067 continue;
3061 3068
3062 lru_pages += zone_reclaimable_pages(zone); 3069 lru_pages += zone_reclaimable_pages(zone);
3063 3070
3064 /* 3071 /*
3065 * If any zone is currently balanced then kswapd will 3072 * If any zone is currently balanced then kswapd will
3066 * not call compaction as it is expected that the 3073 * not call compaction as it is expected that the
3067 * necessary pages are already available. 3074 * necessary pages are already available.
3068 */ 3075 */
3069 if (pgdat_needs_compaction && 3076 if (pgdat_needs_compaction &&
3070 zone_watermark_ok(zone, order, 3077 zone_watermark_ok(zone, order,
3071 low_wmark_pages(zone), 3078 low_wmark_pages(zone),
3072 *classzone_idx, 0)) 3079 *classzone_idx, 0))
3073 pgdat_needs_compaction = false; 3080 pgdat_needs_compaction = false;
3074 } 3081 }
3075 3082
3076 /* 3083 /*
3077 * If we're having trouble reclaiming, start doing writepage 3084 * If we're having trouble reclaiming, start doing writepage
3078 * even in laptop mode. 3085 * even in laptop mode.
3079 */ 3086 */
3080 if (sc.priority < DEF_PRIORITY - 2) 3087 if (sc.priority < DEF_PRIORITY - 2)
3081 sc.may_writepage = 1; 3088 sc.may_writepage = 1;
3082 3089
3083 /* 3090 /*
3084 * Now scan the zone in the dma->highmem direction, stopping 3091 * Now scan the zone in the dma->highmem direction, stopping
3085 * at the last zone which needs scanning. 3092 * at the last zone which needs scanning.
3086 * 3093 *
3087 * We do this because the page allocator works in the opposite 3094 * We do this because the page allocator works in the opposite
3088 * direction. This prevents the page allocator from allocating 3095 * direction. This prevents the page allocator from allocating
3089 * pages behind kswapd's direction of progress, which would 3096 * pages behind kswapd's direction of progress, which would
3090 * cause too much scanning of the lower zones. 3097 * cause too much scanning of the lower zones.
3091 */ 3098 */
3092 for (i = 0; i <= end_zone; i++) { 3099 for (i = 0; i <= end_zone; i++) {
3093 struct zone *zone = pgdat->node_zones + i; 3100 struct zone *zone = pgdat->node_zones + i;
3094 3101
3095 if (!populated_zone(zone)) 3102 if (!populated_zone(zone))
3096 continue; 3103 continue;
3097 3104
3098 if (sc.priority != DEF_PRIORITY && 3105 if (sc.priority != DEF_PRIORITY &&
3099 !zone_reclaimable(zone)) 3106 !zone_reclaimable(zone))
3100 continue; 3107 continue;
3101 3108
3102 sc.nr_scanned = 0; 3109 sc.nr_scanned = 0;
3103 3110
3104 nr_soft_scanned = 0; 3111 nr_soft_scanned = 0;
3105 /* 3112 /*
3106 * Call soft limit reclaim before calling shrink_zone. 3113 * Call soft limit reclaim before calling shrink_zone.
3107 */ 3114 */
3108 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 3115 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
3109 order, sc.gfp_mask, 3116 order, sc.gfp_mask,
3110 &nr_soft_scanned); 3117 &nr_soft_scanned);
3111 sc.nr_reclaimed += nr_soft_reclaimed; 3118 sc.nr_reclaimed += nr_soft_reclaimed;
3112 3119
3113 /* 3120 /*
3114 * There should be no need to raise the scanning 3121 * There should be no need to raise the scanning
3115 * priority if enough pages are already being scanned 3122 * priority if enough pages are already being scanned
3116 * that the high watermark would be met at 100% 3123 * that the high watermark would be met at 100%
3117 * efficiency. 3124 * efficiency.
3118 */ 3125 */
3119 if (kswapd_shrink_zone(zone, end_zone, &sc, 3126 if (kswapd_shrink_zone(zone, end_zone, &sc,
3120 lru_pages, &nr_attempted)) 3127 lru_pages, &nr_attempted))
3121 raise_priority = false; 3128 raise_priority = false;
3122 } 3129 }
3123 3130
3124 /* 3131 /*
3125 * If the low watermark is met there is no need for processes 3132 * If the low watermark is met there is no need for processes
3126 * to be throttled on pfmemalloc_wait as they should now be 3133 * to be throttled on pfmemalloc_wait as they should now be
3127 * able to safely make forward progress. Wake them 3134 * able to safely make forward progress. Wake them
3128 */ 3135 */
3129 if (waitqueue_active(&pgdat->pfmemalloc_wait) && 3136 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3130 pfmemalloc_watermark_ok(pgdat)) 3137 pfmemalloc_watermark_ok(pgdat))
3131 wake_up(&pgdat->pfmemalloc_wait); 3138 wake_up(&pgdat->pfmemalloc_wait);
3132 3139
3133 /* 3140 /*
3134 * Fragmentation may mean that the system cannot be rebalanced 3141 * Fragmentation may mean that the system cannot be rebalanced
3135 * for high-order allocations in all zones. If twice the 3142 * for high-order allocations in all zones. If twice the
3136 * allocation size has been reclaimed and the zones are still 3143 * allocation size has been reclaimed and the zones are still
3137 * not balanced then recheck the watermarks at order-0 to 3144 * not balanced then recheck the watermarks at order-0 to
3138 * prevent kswapd reclaiming excessively. Assume that a 3145 * prevent kswapd reclaiming excessively. Assume that a
3139 * process that requested a high-order allocation can direct reclaim/compact. 3146 * process that requested a high-order allocation can direct reclaim/compact.
3140 */ 3147 */
3141 if (order && sc.nr_reclaimed >= 2UL << order) 3148 if (order && sc.nr_reclaimed >= 2UL << order)
3142 order = sc.order = 0; 3149 order = sc.order = 0;
3143 3150
3144 /* Check if kswapd should be suspending */ 3151 /* Check if kswapd should be suspending */
3145 if (try_to_freeze() || kthread_should_stop()) 3152 if (try_to_freeze() || kthread_should_stop())
3146 break; 3153 break;
3147 3154
3148 /* 3155 /*
3149 * Compact if necessary and kswapd is reclaiming at least the 3156 * Compact if necessary and kswapd is reclaiming at least the
3150 * high watermark number of pages as requested 3157 * high watermark number of pages as requested
3151 */ 3158 */
3152 if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) 3159 if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
3153 compact_pgdat(pgdat, order); 3160 compact_pgdat(pgdat, order);
3154 3161
3155 /* 3162 /*
3156 * Raise priority if scanning rate is too low or there was no 3163 * Raise priority if scanning rate is too low or there was no
3157 * progress in reclaiming pages 3164 * progress in reclaiming pages
3158 */ 3165 */
3159 if (raise_priority || !sc.nr_reclaimed) 3166 if (raise_priority || !sc.nr_reclaimed)
3160 sc.priority--; 3167 sc.priority--;
3161 } while (sc.priority >= 1 && 3168 } while (sc.priority >= 1 &&
3162 !pgdat_balanced(pgdat, order, *classzone_idx)); 3169 !pgdat_balanced(pgdat, order, *classzone_idx));
3163 3170
3164 out: 3171 out:
3165 /* 3172 /*
3166 * Return the order we were reclaiming at so prepare_kswapd_sleep() 3173 * Return the order we were reclaiming at so prepare_kswapd_sleep()
3167 * makes a decision on the order we were last reclaiming at. However, 3174 * makes a decision on the order we were last reclaiming at. However,
3168 * if another caller entered the allocator slow path while kswapd 3175 * if another caller entered the allocator slow path while kswapd
3169 * was awake, order will remain at the higher level 3176 * was awake, order will remain at the higher level
3170 */ 3177 */
3171 *classzone_idx = end_zone; 3178 *classzone_idx = end_zone;
3172 return order; 3179 return order;
3173 } 3180 }
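The high-order back-off near the end of the loop above compares progress against twice the requested allocation size, i.e. 2UL << order pages, before dropping back to order-0. A trivial sketch of what that threshold works out to for a few orders (illustrative only, not kernel code):

#include <stdio.h>

int main(void)
{
	/* "twice the allocation size" used by balance_pgdat()'s back-off:
	 * once this many pages have been reclaimed for a high-order
	 * request, kswapd rechecks the watermarks at order-0. */
	for (int order = 1; order <= 4; order++)
		printf("order %d: allocation %2d pages, back-off after %2lu pages\n",
		       order, 1 << order, 2UL << order);
	return 0;
}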
3174 3181
3175 static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) 3182 static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3176 { 3183 {
3177 long remaining = 0; 3184 long remaining = 0;
3178 DEFINE_WAIT(wait); 3185 DEFINE_WAIT(wait);
3179 3186
3180 if (freezing(current) || kthread_should_stop()) 3187 if (freezing(current) || kthread_should_stop())
3181 return; 3188 return;
3182 3189
3183 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 3190 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3184 3191
3185 /* Try to sleep for a short interval */ 3192 /* Try to sleep for a short interval */
3186 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { 3193 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3187 remaining = schedule_timeout(HZ/10); 3194 remaining = schedule_timeout(HZ/10);
3188 finish_wait(&pgdat->kswapd_wait, &wait); 3195 finish_wait(&pgdat->kswapd_wait, &wait);
3189 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 3196 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3190 } 3197 }
3191 3198
3192 /* 3199 /*
3193 * After a short sleep, check if it was a premature sleep. If not, then 3200 * After a short sleep, check if it was a premature sleep. If not, then
3194 * go fully to sleep until explicitly woken up. 3201 * go fully to sleep until explicitly woken up.
3195 */ 3202 */
3196 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { 3203 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3197 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 3204 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3198 3205
3199 /* 3206 /*
3200 * vmstat counters are not perfectly accurate and the estimated 3207 * vmstat counters are not perfectly accurate and the estimated
3201 * value for counters such as NR_FREE_PAGES can deviate from the 3208 * value for counters such as NR_FREE_PAGES can deviate from the
3202 * true value by nr_online_cpus * threshold. To avoid the zone 3209 * true value by nr_online_cpus * threshold. To avoid the zone
3203 * watermarks being breached while under pressure, we reduce the 3210 * watermarks being breached while under pressure, we reduce the
3204 * per-cpu vmstat threshold while kswapd is awake and restore 3211 * per-cpu vmstat threshold while kswapd is awake and restore
3205 * them before going back to sleep. 3212 * them before going back to sleep.
3206 */ 3213 */
3207 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 3214 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3208 3215
3209 /* 3216 /*
3210 * Compaction records what page blocks it recently failed to 3217 * Compaction records what page blocks it recently failed to
3211 * isolate pages from and skips them in the future scanning. 3218 * isolate pages from and skips them in the future scanning.
3212 * When kswapd is going to sleep, it is reasonable to assume 3219 * When kswapd is going to sleep, it is reasonable to assume
3213 * that pages and compaction may succeed so reset the cache. 3220 * that pages and compaction may succeed so reset the cache.
3214 */ 3221 */
3215 reset_isolation_suitable(pgdat); 3222 reset_isolation_suitable(pgdat);
3216 3223
3217 if (!kthread_should_stop()) 3224 if (!kthread_should_stop())
3218 schedule(); 3225 schedule();
3219 3226
3220 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 3227 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3221 } else { 3228 } else {
3222 if (remaining) 3229 if (remaining)
3223 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 3230 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3224 else 3231 else
3225 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); 3232 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3226 } 3233 }
3227 finish_wait(&pgdat->kswapd_wait, &wait); 3234 finish_wait(&pgdat->kswapd_wait, &wait);
3228 } 3235 }
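The worst-case drift mentioned in the vmstat comment above is bounded by nr_online_cpus * threshold, since each CPU may hold up to one threshold's worth of unflushed counter deltas. A rough userspace illustration with assumed values (the real threshold is computed per zone, so these numbers are only indicative):

#include <stdio.h>

int main(void)
{
	/* Each online CPU may cache up to 'threshold' unflushed page deltas,
	 * so a counter such as NR_FREE_PAGES can be off by
	 * nr_online_cpus * threshold in the worst case. Assumed values. */
	unsigned int nr_online_cpus = 16;
	unsigned int threshold = 125;	/* assumed per-cpu stat threshold */

	printf("worst-case counter error: %u pages (~%u KiB with 4 KiB pages)\n",
	       nr_online_cpus * threshold,
	       nr_online_cpus * threshold * 4);
	return 0;
}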
3229 3236
3230 /* 3237 /*
3231 * The background pageout daemon, started as a kernel thread 3238 * The background pageout daemon, started as a kernel thread
3232 * from the init process. 3239 * from the init process.
3233 * 3240 *
3234 * This basically trickles out pages so that we have _some_ 3241 * This basically trickles out pages so that we have _some_
3235 * free memory available even if there is no other activity 3242 * free memory available even if there is no other activity
3236 * that frees anything up. This is needed for things like routing 3243 * that frees anything up. This is needed for things like routing
3237 * etc, where we otherwise might have all activity going on in 3244 * etc, where we otherwise might have all activity going on in
3238 * asynchronous contexts that cannot page things out. 3245 * asynchronous contexts that cannot page things out.
3239 * 3246 *
3240 * If there are applications that are active memory-allocators 3247 * If there are applications that are active memory-allocators
3241 * (most normal use), this basically shouldn't matter. 3248 * (most normal use), this basically shouldn't matter.
3242 */ 3249 */
3243 static int kswapd(void *p) 3250 static int kswapd(void *p)
3244 { 3251 {
3245 unsigned long order, new_order; 3252 unsigned long order, new_order;
3246 unsigned balanced_order; 3253 unsigned balanced_order;
3247 int classzone_idx, new_classzone_idx; 3254 int classzone_idx, new_classzone_idx;
3248 int balanced_classzone_idx; 3255 int balanced_classzone_idx;
3249 pg_data_t *pgdat = (pg_data_t*)p; 3256 pg_data_t *pgdat = (pg_data_t*)p;
3250 struct task_struct *tsk = current; 3257 struct task_struct *tsk = current;
3251 3258
3252 struct reclaim_state reclaim_state = { 3259 struct reclaim_state reclaim_state = {
3253 .reclaimed_slab = 0, 3260 .reclaimed_slab = 0,
3254 }; 3261 };
3255 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 3262 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3256 3263
3257 lockdep_set_current_reclaim_state(GFP_KERNEL); 3264 lockdep_set_current_reclaim_state(GFP_KERNEL);
3258 3265
3259 if (!cpumask_empty(cpumask)) 3266 if (!cpumask_empty(cpumask))
3260 set_cpus_allowed_ptr(tsk, cpumask); 3267 set_cpus_allowed_ptr(tsk, cpumask);
3261 current->reclaim_state = &reclaim_state; 3268 current->reclaim_state = &reclaim_state;
3262 3269
3263 /* 3270 /*
3264 * Tell the memory management that we're a "memory allocator", 3271 * Tell the memory management that we're a "memory allocator",
3265 * and that if we need more memory we should get access to it 3272 * and that if we need more memory we should get access to it
3266 * regardless (see "__alloc_pages()"). "kswapd" should 3273 * regardless (see "__alloc_pages()"). "kswapd" should
3267 * never get caught in the normal page freeing logic. 3274 * never get caught in the normal page freeing logic.
3268 * 3275 *
3269 * (Kswapd normally doesn't need memory anyway, but sometimes 3276 * (Kswapd normally doesn't need memory anyway, but sometimes
3270 * you need a small amount of memory in order to be able to 3277 * you need a small amount of memory in order to be able to
3271 * page out something else, and this flag essentially protects 3278 * page out something else, and this flag essentially protects
3272 * us from recursively trying to free more memory as we're 3279 * us from recursively trying to free more memory as we're
3273 * trying to free the first piece of memory in the first place). 3280 * trying to free the first piece of memory in the first place).
3274 */ 3281 */
3275 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 3282 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3276 set_freezable(); 3283 set_freezable();
3277 3284
3278 order = new_order = 0; 3285 order = new_order = 0;
3279 balanced_order = 0; 3286 balanced_order = 0;
3280 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 3287 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
3281 balanced_classzone_idx = classzone_idx; 3288 balanced_classzone_idx = classzone_idx;
3282 for ( ; ; ) { 3289 for ( ; ; ) {
3283 bool ret; 3290 bool ret;
3284 3291
3285 /* 3292 /*
3286 * If the last balance_pgdat was unsuccessful it's unlikely a 3293 * If the last balance_pgdat was unsuccessful it's unlikely a
3287 * new request of a similar or harder type will succeed soon 3294 * new request of a similar or harder type will succeed soon
3288 * so consider going to sleep on the basis we reclaimed at 3295 * so consider going to sleep on the basis we reclaimed at
3289 */ 3296 */
3290 if (balanced_classzone_idx >= new_classzone_idx && 3297 if (balanced_classzone_idx >= new_classzone_idx &&
3291 balanced_order == new_order) { 3298 balanced_order == new_order) {
3292 new_order = pgdat->kswapd_max_order; 3299 new_order = pgdat->kswapd_max_order;
3293 new_classzone_idx = pgdat->classzone_idx; 3300 new_classzone_idx = pgdat->classzone_idx;
3294 pgdat->kswapd_max_order = 0; 3301 pgdat->kswapd_max_order = 0;
3295 pgdat->classzone_idx = pgdat->nr_zones - 1; 3302 pgdat->classzone_idx = pgdat->nr_zones - 1;
3296 } 3303 }
3297 3304
3298 if (order < new_order || classzone_idx > new_classzone_idx) { 3305 if (order < new_order || classzone_idx > new_classzone_idx) {
3299 /* 3306 /*
3300 * Don't sleep if someone wants a larger 'order' 3307 * Don't sleep if someone wants a larger 'order'
3301 * allocation or has tighter zone constraints 3308 * allocation or has tighter zone constraints
3302 */ 3309 */
3303 order = new_order; 3310 order = new_order;
3304 classzone_idx = new_classzone_idx; 3311 classzone_idx = new_classzone_idx;
3305 } else { 3312 } else {
3306 kswapd_try_to_sleep(pgdat, balanced_order, 3313 kswapd_try_to_sleep(pgdat, balanced_order,
3307 balanced_classzone_idx); 3314 balanced_classzone_idx);
3308 order = pgdat->kswapd_max_order; 3315 order = pgdat->kswapd_max_order;
3309 classzone_idx = pgdat->classzone_idx; 3316 classzone_idx = pgdat->classzone_idx;
3310 new_order = order; 3317 new_order = order;
3311 new_classzone_idx = classzone_idx; 3318 new_classzone_idx = classzone_idx;
3312 pgdat->kswapd_max_order = 0; 3319 pgdat->kswapd_max_order = 0;
3313 pgdat->classzone_idx = pgdat->nr_zones - 1; 3320 pgdat->classzone_idx = pgdat->nr_zones - 1;
3314 } 3321 }
3315 3322
3316 ret = try_to_freeze(); 3323 ret = try_to_freeze();
3317 if (kthread_should_stop()) 3324 if (kthread_should_stop())
3318 break; 3325 break;
3319 3326
3320 /* 3327 /*
3321 * We can speed up thawing tasks if we don't call balance_pgdat 3328 * We can speed up thawing tasks if we don't call balance_pgdat
3322 * after returning from the refrigerator 3329 * after returning from the refrigerator
3323 */ 3330 */
3324 if (!ret) { 3331 if (!ret) {
3325 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 3332 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
3326 balanced_classzone_idx = classzone_idx; 3333 balanced_classzone_idx = classzone_idx;
3327 balanced_order = balance_pgdat(pgdat, order, 3334 balanced_order = balance_pgdat(pgdat, order,
3328 &balanced_classzone_idx); 3335 &balanced_classzone_idx);
3329 } 3336 }
3330 } 3337 }
3331 3338
3332 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); 3339 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3333 current->reclaim_state = NULL; 3340 current->reclaim_state = NULL;
3334 lockdep_clear_current_reclaim_state(); 3341 lockdep_clear_current_reclaim_state();
3335 3342
3336 return 0; 3343 return 0;
3337 } 3344 }
3338 3345
3339 /* 3346 /*
3340 * A zone is low on free memory, so wake its kswapd task to service it. 3347 * A zone is low on free memory, so wake its kswapd task to service it.
3341 */ 3348 */
3342 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) 3349 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3343 { 3350 {
3344 pg_data_t *pgdat; 3351 pg_data_t *pgdat;
3345 3352
3346 if (!populated_zone(zone)) 3353 if (!populated_zone(zone))
3347 return; 3354 return;
3348 3355
3349 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 3356 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
3350 return; 3357 return;
3351 pgdat = zone->zone_pgdat; 3358 pgdat = zone->zone_pgdat;
3352 if (pgdat->kswapd_max_order < order) { 3359 if (pgdat->kswapd_max_order < order) {
3353 pgdat->kswapd_max_order = order; 3360 pgdat->kswapd_max_order = order;
3354 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); 3361 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
3355 } 3362 }
3356 if (!waitqueue_active(&pgdat->kswapd_wait)) 3363 if (!waitqueue_active(&pgdat->kswapd_wait))
3357 return; 3364 return;
3358 if (zone_balanced(zone, order, 0, 0)) 3365 if (zone_balanced(zone, order, 0, 0))
3359 return; 3366 return;
3360 3367
3361 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); 3368 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
3362 wake_up_interruptible(&pgdat->kswapd_wait); 3369 wake_up_interruptible(&pgdat->kswapd_wait);
3363 } 3370 }
3364 3371
3365 #ifdef CONFIG_HIBERNATION 3372 #ifdef CONFIG_HIBERNATION
3366 /* 3373 /*
3367 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 3374 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
3368 * freed pages. 3375 * freed pages.
3369 * 3376 *
3370 * Rather than trying to age LRUs the aim is to preserve the overall 3377 * Rather than trying to age LRUs the aim is to preserve the overall
3371 * LRU order by reclaiming preferentially 3378 * LRU order by reclaiming preferentially
3372 * inactive > active > active referenced > active mapped 3379 * inactive > active > active referenced > active mapped
3373 */ 3380 */
3374 unsigned long shrink_all_memory(unsigned long nr_to_reclaim) 3381 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3375 { 3382 {
3376 struct reclaim_state reclaim_state; 3383 struct reclaim_state reclaim_state;
3377 struct scan_control sc = { 3384 struct scan_control sc = {
3378 .gfp_mask = GFP_HIGHUSER_MOVABLE, 3385 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3379 .may_swap = 1, 3386 .may_swap = 1,
3380 .may_unmap = 1, 3387 .may_unmap = 1,
3381 .may_writepage = 1, 3388 .may_writepage = 1,
3382 .nr_to_reclaim = nr_to_reclaim, 3389 .nr_to_reclaim = nr_to_reclaim,
3383 .hibernation_mode = 1, 3390 .hibernation_mode = 1,
3384 .order = 0, 3391 .order = 0,
3385 .priority = DEF_PRIORITY, 3392 .priority = DEF_PRIORITY,
3386 }; 3393 };
3387 struct shrink_control shrink = { 3394 struct shrink_control shrink = {
3388 .gfp_mask = sc.gfp_mask, 3395 .gfp_mask = sc.gfp_mask,
3389 }; 3396 };
3390 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 3397 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3391 struct task_struct *p = current; 3398 struct task_struct *p = current;
3392 unsigned long nr_reclaimed; 3399 unsigned long nr_reclaimed;
3393 3400
3394 p->flags |= PF_MEMALLOC; 3401 p->flags |= PF_MEMALLOC;
3395 lockdep_set_current_reclaim_state(sc.gfp_mask); 3402 lockdep_set_current_reclaim_state(sc.gfp_mask);
3396 reclaim_state.reclaimed_slab = 0; 3403 reclaim_state.reclaimed_slab = 0;
3397 p->reclaim_state = &reclaim_state; 3404 p->reclaim_state = &reclaim_state;
3398 3405
3399 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 3406 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
3400 3407
3401 p->reclaim_state = NULL; 3408 p->reclaim_state = NULL;
3402 lockdep_clear_current_reclaim_state(); 3409 lockdep_clear_current_reclaim_state();
3403 p->flags &= ~PF_MEMALLOC; 3410 p->flags &= ~PF_MEMALLOC;
3404 3411
3405 return nr_reclaimed; 3412 return nr_reclaimed;
3406 } 3413 }
3407 #endif /* CONFIG_HIBERNATION */ 3414 #endif /* CONFIG_HIBERNATION */
3408 3415
3409 /* It's optimal to keep kswapds on the same CPUs as their memory, but 3416 /* It's optimal to keep kswapds on the same CPUs as their memory, but
3410 not required for correctness. So if the last cpu in a node goes 3417 not required for correctness. So if the last cpu in a node goes
3411 away, we get changed to run anywhere: as the first one comes back, 3418 away, we get changed to run anywhere: as the first one comes back,
3412 restore their cpu bindings. */ 3419 restore their cpu bindings. */
3413 static int cpu_callback(struct notifier_block *nfb, unsigned long action, 3420 static int cpu_callback(struct notifier_block *nfb, unsigned long action,
3414 void *hcpu) 3421 void *hcpu)
3415 { 3422 {
3416 int nid; 3423 int nid;
3417 3424
3418 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 3425 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3419 for_each_node_state(nid, N_MEMORY) { 3426 for_each_node_state(nid, N_MEMORY) {
3420 pg_data_t *pgdat = NODE_DATA(nid); 3427 pg_data_t *pgdat = NODE_DATA(nid);
3421 const struct cpumask *mask; 3428 const struct cpumask *mask;
3422 3429
3423 mask = cpumask_of_node(pgdat->node_id); 3430 mask = cpumask_of_node(pgdat->node_id);
3424 3431
3425 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) 3432 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
3426 /* One of our CPUs online: restore mask */ 3433 /* One of our CPUs online: restore mask */
3427 set_cpus_allowed_ptr(pgdat->kswapd, mask); 3434 set_cpus_allowed_ptr(pgdat->kswapd, mask);
3428 } 3435 }
3429 } 3436 }
3430 return NOTIFY_OK; 3437 return NOTIFY_OK;
3431 } 3438 }
3432 3439
3433 /* 3440 /*
3434 * This kswapd start function will be called by init and node-hot-add. 3441 * This kswapd start function will be called by init and node-hot-add.
3435 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added. 3442 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
3436 */ 3443 */
3437 int kswapd_run(int nid) 3444 int kswapd_run(int nid)
3438 { 3445 {
3439 pg_data_t *pgdat = NODE_DATA(nid); 3446 pg_data_t *pgdat = NODE_DATA(nid);
3440 int ret = 0; 3447 int ret = 0;
3441 3448
3442 if (pgdat->kswapd) 3449 if (pgdat->kswapd)
3443 return 0; 3450 return 0;
3444 3451
3445 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); 3452 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3446 if (IS_ERR(pgdat->kswapd)) { 3453 if (IS_ERR(pgdat->kswapd)) {
3447 /* failure at boot is fatal */ 3454 /* failure at boot is fatal */
3448 BUG_ON(system_state == SYSTEM_BOOTING); 3455 BUG_ON(system_state == SYSTEM_BOOTING);
3449 pr_err("Failed to start kswapd on node %d\n", nid); 3456 pr_err("Failed to start kswapd on node %d\n", nid);
3450 ret = PTR_ERR(pgdat->kswapd); 3457 ret = PTR_ERR(pgdat->kswapd);
3451 pgdat->kswapd = NULL; 3458 pgdat->kswapd = NULL;
3452 } 3459 }
3453 return ret; 3460 return ret;
3454 } 3461 }
3455 3462
3456 /* 3463 /*
3457 * Called by memory hotplug when all memory in a node is offlined. Caller must 3464 * Called by memory hotplug when all memory in a node is offlined. Caller must
3458 * hold lock_memory_hotplug(). 3465 * hold lock_memory_hotplug().
3459 */ 3466 */
3460 void kswapd_stop(int nid) 3467 void kswapd_stop(int nid)
3461 { 3468 {
3462 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 3469 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3463 3470
3464 if (kswapd) { 3471 if (kswapd) {
3465 kthread_stop(kswapd); 3472 kthread_stop(kswapd);
3466 NODE_DATA(nid)->kswapd = NULL; 3473 NODE_DATA(nid)->kswapd = NULL;
3467 } 3474 }
3468 } 3475 }
3469 3476
3470 static int __init kswapd_init(void) 3477 static int __init kswapd_init(void)
3471 { 3478 {
3472 int nid; 3479 int nid;
3473 3480
3474 swap_setup(); 3481 swap_setup();
3475 for_each_node_state(nid, N_MEMORY) 3482 for_each_node_state(nid, N_MEMORY)
3476 kswapd_run(nid); 3483 kswapd_run(nid);
3477 hotcpu_notifier(cpu_callback, 0); 3484 hotcpu_notifier(cpu_callback, 0);
3478 return 0; 3485 return 0;
3479 } 3486 }
3480 3487
3481 module_init(kswapd_init) 3488 module_init(kswapd_init)
3482 3489
3483 #ifdef CONFIG_NUMA 3490 #ifdef CONFIG_NUMA
3484 /* 3491 /*
3485 * Zone reclaim mode 3492 * Zone reclaim mode
3486 * 3493 *
3487 * If non-zero call zone_reclaim when the number of free pages falls below 3494 * If non-zero call zone_reclaim when the number of free pages falls below
3488 * the watermarks. 3495 * the watermarks.
3489 */ 3496 */
3490 int zone_reclaim_mode __read_mostly; 3497 int zone_reclaim_mode __read_mostly;
3491 3498
3492 #define RECLAIM_OFF 0 3499 #define RECLAIM_OFF 0
3493 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ 3500 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
3494 #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 3501 #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
3495 #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 3502 #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
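These bits are what the vm.zone_reclaim_mode sysctl stores, so a mode value is a small bitmask rather than an enum; for example a value of 3 means RECLAIM_ZONE | RECLAIM_WRITE. A short userspace sketch that decodes a few values (illustrative only, not kernel code):

#include <stdio.h>

#define RECLAIM_ZONE	(1 << 0)
#define RECLAIM_WRITE	(1 << 1)
#define RECLAIM_SWAP	(1 << 2)

/* Print which reclaim behaviours a zone_reclaim_mode value enables. */
static void decode(int mode)
{
	printf("mode %d:%s%s%s%s\n", mode,
	       mode == 0 ? " off" : "",
	       (mode & RECLAIM_ZONE)  ? " zone-reclaim" : "",
	       (mode & RECLAIM_WRITE) ? " write-dirty"  : "",
	       (mode & RECLAIM_SWAP)  ? " swap"         : "");
}

int main(void)
{
	decode(0);
	decode(1);
	decode(3);
	decode(7);
	return 0;
}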
3496 3503
3497 /* 3504 /*
3498 * Priority for ZONE_RECLAIM. This determines the fraction of pages 3505 * Priority for ZONE_RECLAIM. This determines the fraction of pages
3499 * of a node considered for each zone_reclaim. 4 scans 1/16th of 3506 * of a node considered for each zone_reclaim. 4 scans 1/16th of
3500 * a zone. 3507 * a zone.
3501 */ 3508 */
3502 #define ZONE_RECLAIM_PRIORITY 4 3509 #define ZONE_RECLAIM_PRIORITY 4
3503 3510
3504 /* 3511 /*
3505 * Percentage of pages in a zone that must be unmapped for zone_reclaim to 3512 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
3506 * occur. 3513 * occur.
3507 */ 3514 */
3508 int sysctl_min_unmapped_ratio = 1; 3515 int sysctl_min_unmapped_ratio = 1;
3509 3516
3510 /* 3517 /*
3511 * If the number of slab pages in a zone grows beyond this percentage then 3518 * If the number of slab pages in a zone grows beyond this percentage then
3512 * slab reclaim needs to occur. 3519 * slab reclaim needs to occur.
3513 */ 3520 */
3514 int sysctl_min_slab_ratio = 5; 3521 int sysctl_min_slab_ratio = 5;
3515 3522
3516 static inline unsigned long zone_unmapped_file_pages(struct zone *zone) 3523 static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3517 { 3524 {
3518 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); 3525 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
3519 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + 3526 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
3520 zone_page_state(zone, NR_ACTIVE_FILE); 3527 zone_page_state(zone, NR_ACTIVE_FILE);
3521 3528
3522 /* 3529 /*
3523 * It's possible for there to be more file mapped pages than 3530 * It's possible for there to be more file mapped pages than
3524 * accounted for by the pages on the file LRU lists because 3531 * accounted for by the pages on the file LRU lists because
3525 * tmpfs pages accounted for as ANON can also be FILE_MAPPED 3532 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
3526 */ 3533 */
3527 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; 3534 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
3528 } 3535 }
3529 3536
3530 /* Work out how many page cache pages we can reclaim in this reclaim_mode */ 3537 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
3531 static long zone_pagecache_reclaimable(struct zone *zone) 3538 static long zone_pagecache_reclaimable(struct zone *zone)
3532 { 3539 {
3533 long nr_pagecache_reclaimable; 3540 long nr_pagecache_reclaimable;
3534 long delta = 0; 3541 long delta = 0;
3535 3542
3536 /* 3543 /*
3537 * If RECLAIM_SWAP is set, then all file pages are considered 3544 * If RECLAIM_SWAP is set, then all file pages are considered
3538 * potentially reclaimable. Otherwise, we have to worry about 3545 * potentially reclaimable. Otherwise, we have to worry about
3539 * pages like swapcache and zone_unmapped_file_pages() provides 3546 * pages like swapcache and zone_unmapped_file_pages() provides
3540 * a better estimate 3547 * a better estimate
3541 */ 3548 */
3542 if (zone_reclaim_mode & RECLAIM_SWAP) 3549 if (zone_reclaim_mode & RECLAIM_SWAP)
3543 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); 3550 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3544 else 3551 else
3545 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); 3552 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3546 3553
3547 /* If we can't clean pages, remove dirty pages from consideration */ 3554 /* If we can't clean pages, remove dirty pages from consideration */
3548 if (!(zone_reclaim_mode & RECLAIM_WRITE)) 3555 if (!(zone_reclaim_mode & RECLAIM_WRITE))
3549 delta += zone_page_state(zone, NR_FILE_DIRTY); 3556 delta += zone_page_state(zone, NR_FILE_DIRTY);
3550 3557
3551 /* Watch for any possible underflows due to delta */ 3558 /* Watch for any possible underflows due to delta */
3552 if (unlikely(delta > nr_pagecache_reclaimable)) 3559 if (unlikely(delta > nr_pagecache_reclaimable))
3553 delta = nr_pagecache_reclaimable; 3560 delta = nr_pagecache_reclaimable;
3554 3561
3555 return nr_pagecache_reclaimable - delta; 3562 return nr_pagecache_reclaimable - delta;
3556 } 3563 }
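/*
 * Worked example with illustrative numbers, assuming RECLAIM_SWAP and
 * RECLAIM_WRITE are both clear: 10000 pages on the file LRUs, 3000 of them
 * mapped, 500 dirty.
 *
 *	nr_pagecache_reclaimable = 10000 - 3000 = 7000	(unmapped file pages)
 *	delta                    = 500			(dirty, no writeback allowed)
 *	return value             = 7000 - 500   = 6500
 *
 * With RECLAIM_SWAP set, the starting point would instead be the zone's
 * full NR_FILE_PAGES count.
 */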
3557 3564
3558 /* 3565 /*
3559 * Try to free up some pages from this zone through reclaim. 3566 * Try to free up some pages from this zone through reclaim.
3560 */ 3567 */
3561 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3568 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3562 { 3569 {
3563 /* Minimum pages needed in order to stay on node */ 3570 /* Minimum pages needed in order to stay on node */
3564 const unsigned long nr_pages = 1 << order; 3571 const unsigned long nr_pages = 1 << order;
3565 struct task_struct *p = current; 3572 struct task_struct *p = current;
3566 struct reclaim_state reclaim_state; 3573 struct reclaim_state reclaim_state;
3567 struct scan_control sc = { 3574 struct scan_control sc = {
3568 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3575 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3569 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3576 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3570 .may_swap = 1, 3577 .may_swap = 1,
3571 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 3578 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3572 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), 3579 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3573 .order = order, 3580 .order = order,
3574 .priority = ZONE_RECLAIM_PRIORITY, 3581 .priority = ZONE_RECLAIM_PRIORITY,
3575 }; 3582 };
3576 struct shrink_control shrink = { 3583 struct shrink_control shrink = {
3577 .gfp_mask = sc.gfp_mask, 3584 .gfp_mask = sc.gfp_mask,
3578 }; 3585 };
3579 unsigned long nr_slab_pages0, nr_slab_pages1; 3586 unsigned long nr_slab_pages0, nr_slab_pages1;
3580 3587
3581 cond_resched(); 3588 cond_resched();
3582 /* 3589 /*
3583 * We need to be able to allocate from the reserves for RECLAIM_SWAP 3590 * We need to be able to allocate from the reserves for RECLAIM_SWAP
3584 * and we also need to be able to write out pages for RECLAIM_WRITE 3591 * and we also need to be able to write out pages for RECLAIM_WRITE
3585 * and RECLAIM_SWAP. 3592 * and RECLAIM_SWAP.
3586 */ 3593 */
3587 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 3594 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
3588 lockdep_set_current_reclaim_state(gfp_mask); 3595 lockdep_set_current_reclaim_state(gfp_mask);
3589 reclaim_state.reclaimed_slab = 0; 3596 reclaim_state.reclaimed_slab = 0;
3590 p->reclaim_state = &reclaim_state; 3597 p->reclaim_state = &reclaim_state;
3591 3598
3592 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { 3599 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
3593 /* 3600 /*
3594 * Free memory by calling shrink zone with increasing 3601 * Free memory by calling shrink zone with increasing
3595 * priorities until we have enough memory freed. 3602 * priorities until we have enough memory freed.
3596 */ 3603 */
3597 do { 3604 do {
3598 shrink_zone(zone, &sc); 3605 shrink_zone(zone, &sc);
3599 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); 3606 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3600 } 3607 }
3601 3608
3602 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3609 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3603 if (nr_slab_pages0 > zone->min_slab_pages) { 3610 if (nr_slab_pages0 > zone->min_slab_pages) {
3604 /* 3611 /*
3605 * shrink_slab() does not currently allow us to determine how 3612 * shrink_slab() does not currently allow us to determine how
3606 * many pages were freed in this zone. So we take the current 3613 * many pages were freed in this zone. So we take the current
3607 * number of slab pages and shake the slab until it is reduced 3614 * number of slab pages and shake the slab until it is reduced
3608 * by the same nr_pages that we used for reclaiming unmapped 3615 * by the same nr_pages that we used for reclaiming unmapped
3609 * pages. 3616 * pages.
3610 */ 3617 */
3611 nodes_clear(shrink.nodes_to_scan); 3618 nodes_clear(shrink.nodes_to_scan);
3612 node_set(zone_to_nid(zone), shrink.nodes_to_scan); 3619 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
3613 for (;;) { 3620 for (;;) {
3614 unsigned long lru_pages = zone_reclaimable_pages(zone); 3621 unsigned long lru_pages = zone_reclaimable_pages(zone);
3615 3622
3616 /* No reclaimable slab or very low memory pressure */ 3623 /* No reclaimable slab or very low memory pressure */
3617 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) 3624 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3618 break; 3625 break;
3619 3626
3620 /* Freed enough memory */ 3627 /* Freed enough memory */
3621 nr_slab_pages1 = zone_page_state(zone, 3628 nr_slab_pages1 = zone_page_state(zone,
3622 NR_SLAB_RECLAIMABLE); 3629 NR_SLAB_RECLAIMABLE);
3623 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) 3630 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3624 break; 3631 break;
3625 } 3632 }
3626 3633
3627 /* 3634 /*
3628 * Update nr_reclaimed by the number of slab pages we 3635 * Update nr_reclaimed by the number of slab pages we
3629 * reclaimed from this zone. 3636 * reclaimed from this zone.
3630 */ 3637 */
3631 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3638 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3632 if (nr_slab_pages1 < nr_slab_pages0) 3639 if (nr_slab_pages1 < nr_slab_pages0)
3633 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; 3640 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3634 } 3641 }
3635 3642
3636 p->reclaim_state = NULL; 3643 p->reclaim_state = NULL;
3637 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 3644 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3638 lockdep_clear_current_reclaim_state(); 3645 lockdep_clear_current_reclaim_state();
3639 return sc.nr_reclaimed >= nr_pages; 3646 return sc.nr_reclaimed >= nr_pages;
3640 } 3647 }
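/*
 * Worked example of the success criterion above, assuming SWAP_CLUSTER_MAX
 * is 32 as defined in include/linux/swap.h: for an order-3 allocation,
 *
 *	nr_pages         = 1 << 3      = 8
 *	sc.nr_to_reclaim = max(8, 32)  = 32
 *
 * so each pass aims for 32 pages, but __zone_reclaim() reports success as
 * soon as at least 8 pages (page cache and/or slab) have been freed.
 */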
3641 3648
3642 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3649 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3643 { 3650 {
3644 int node_id; 3651 int node_id;
3645 int ret; 3652 int ret;
3646 3653
3647 /* 3654 /*
3648 * Zone reclaim reclaims unmapped file backed pages and 3655 * Zone reclaim reclaims unmapped file backed pages and
3649 * slab pages if we are over the defined limits. 3656 * slab pages if we are over the defined limits.
3650 * 3657 *
3651 * A small portion of unmapped file backed pages is needed for 3658 * A small portion of unmapped file backed pages is needed for
3652 * file I/O otherwise pages read by file I/O will be immediately 3659 * file I/O otherwise pages read by file I/O will be immediately
3653 * thrown out if the zone is overallocated. So we do not reclaim 3660 * thrown out if the zone is overallocated. So we do not reclaim
3654 * if less than a specified percentage of the zone is used by 3661 * if less than a specified percentage of the zone is used by
3655 * unmapped file backed pages. 3662 * unmapped file backed pages.
3656 */ 3663 */
3657 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && 3664 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3658 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 3665 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3659 return ZONE_RECLAIM_FULL; 3666 return ZONE_RECLAIM_FULL;
3660 3667
3661 if (!zone_reclaimable(zone)) 3668 if (!zone_reclaimable(zone))
3662 return ZONE_RECLAIM_FULL; 3669 return ZONE_RECLAIM_FULL;
3663 3670
3664 /* 3671 /*
3665 * Do not scan if the allocation should not be delayed. 3672 * Do not scan if the allocation should not be delayed.
3666 */ 3673 */
3667 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) 3674 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3668 return ZONE_RECLAIM_NOSCAN; 3675 return ZONE_RECLAIM_NOSCAN;
3669 3676
3670 /* 3677 /*
3671 * Only run zone reclaim on the local zone or on zones that do not 3678 * Only run zone reclaim on the local zone or on zones that do not
3672 * have associated processors. This will favor the local processor 3679 * have associated processors. This will favor the local processor
3673 * over remote processors and spread off node memory allocations 3680 * over remote processors and spread off node memory allocations
3674 * as wide as possible. 3681 * as wide as possible.
3675 */ 3682 */
3676 node_id = zone_to_nid(zone); 3683 node_id = zone_to_nid(zone);
3677 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 3684 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3678 return ZONE_RECLAIM_NOSCAN; 3685 return ZONE_RECLAIM_NOSCAN;
3679 3686
3680 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 3687 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3681 return ZONE_RECLAIM_NOSCAN; 3688 return ZONE_RECLAIM_NOSCAN;
3682 3689
3683 ret = __zone_reclaim(zone, gfp_mask, order); 3690 ret = __zone_reclaim(zone, gfp_mask, order);
3684 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 3691 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3685 3692
3686 if (!ret) 3693 if (!ret)
3687 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 3694 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3688 3695
3689 return ret; 3696 return ret;
3690 } 3697 }
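/*
 * Condensed, hedged sketch of how the caller reacts to these return codes;
 * the real logic is in get_page_from_freelist() in mm/page_alloc.c and also
 * rechecks the zone watermark before allocating on success.  The helper name
 * is made up for illustration.
 */
static bool try_zone_reclaim_sketch(struct zone *zone, gfp_t gfp_mask,
				    unsigned int order)
{
	switch (zone_reclaim(zone, gfp_mask, order)) {
	case ZONE_RECLAIM_NOSCAN:	/* reclaim was not attempted */
	case ZONE_RECLAIM_FULL:		/* scanned but nothing reclaimable */
		return false;		/* caller moves on to the next zone */
	default:
		return true;		/* some progress; recheck the watermark */
	}
}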
3691 #endif 3698 #endif
3692 3699
3693 /* 3700 /*
3694 * page_evictable - test whether a page is evictable 3701 * page_evictable - test whether a page is evictable
3695 * @page: the page to test 3702 * @page: the page to test
3696 * 3703 *
3697 * Test whether page is evictable--i.e., should be placed on active/inactive 3704 * Test whether page is evictable--i.e., should be placed on active/inactive
3698 * lists vs unevictable list. 3705 * lists vs unevictable list.
3699 * 3706 *
3700 * Reasons page might not be evictable: 3707 * Reasons page might not be evictable:
3701 * (1) page's mapping marked unevictable 3708 * (1) page's mapping marked unevictable
3702 * (2) page is part of an mlocked VMA 3709 * (2) page is part of an mlocked VMA
3703 * 3710 *
3704 */ 3711 */
3705 int page_evictable(struct page *page) 3712 int page_evictable(struct page *page)
3706 { 3713 {
3707 return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); 3714 return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
3708 } 3715 }
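/*
 * Illustration of the two cases listed above:
 *
 *	mlock(addr, len)	  -> the VMA's pages get PG_mlocked, so
 *				     PageMlocked(page) is true;
 *	shmctl(id, SHM_LOCK, NULL) -> shmem marks the file's address_space
 *				     with mapping_set_unevictable().
 *
 * Either condition makes page_evictable() return 0 and keeps the page on
 * the unevictable LRU until the lock goes away.
 */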
3709 3716
3710 #ifdef CONFIG_SHMEM 3717 #ifdef CONFIG_SHMEM
3711 /** 3718 /**
3712 * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list 3719 * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
3713 * @pages: array of pages to check 3720 * @pages: array of pages to check
3714 * @nr_pages: number of pages to check 3721 * @nr_pages: number of pages to check
3715 * 3722 *
3716 * Checks pages for evictability and moves them to the appropriate lru list. 3723 * Checks pages for evictability and moves them to the appropriate lru list.
3717 * 3724 *
3718 * This function is only used for SysV IPC SHM_UNLOCK. 3725 * This function is only used for SysV IPC SHM_UNLOCK.
3719 */ 3726 */
3720 void check_move_unevictable_pages(struct page **pages, int nr_pages) 3727 void check_move_unevictable_pages(struct page **pages, int nr_pages)
3721 { 3728 {
3722 struct lruvec *lruvec; 3729 struct lruvec *lruvec;
3723 struct zone *zone = NULL; 3730 struct zone *zone = NULL;
3724 int pgscanned = 0; 3731 int pgscanned = 0;
3725 int pgrescued = 0; 3732 int pgrescued = 0;
3726 int i; 3733 int i;
3727 3734
3728 for (i = 0; i < nr_pages; i++) { 3735 for (i = 0; i < nr_pages; i++) {
3729 struct page *page = pages[i]; 3736 struct page *page = pages[i];
3730 struct zone *pagezone; 3737 struct zone *pagezone;
3731 3738
3732 pgscanned++; 3739 pgscanned++;
3733 pagezone = page_zone(page); 3740 pagezone = page_zone(page);
3734 if (pagezone != zone) { 3741 if (pagezone != zone) {
3735 if (zone) 3742 if (zone)
3736 spin_unlock_irq(&zone->lru_lock); 3743 spin_unlock_irq(&zone->lru_lock);
3737 zone = pagezone; 3744 zone = pagezone;
3738 spin_lock_irq(&zone->lru_lock); 3745 spin_lock_irq(&zone->lru_lock);
3739 } 3746 }
3740 lruvec = mem_cgroup_page_lruvec(page, zone); 3747 lruvec = mem_cgroup_page_lruvec(page, zone);
3741 3748
3742 if (!PageLRU(page) || !PageUnevictable(page)) 3749 if (!PageLRU(page) || !PageUnevictable(page))
3743 continue; 3750 continue;
3744 3751
3745 if (page_evictable(page)) { 3752 if (page_evictable(page)) {
3746 enum lru_list lru = page_lru_base_type(page); 3753 enum lru_list lru = page_lru_base_type(page);
3747 3754
3748 VM_BUG_ON(PageActive(page)); 3755 VM_BUG_ON(PageActive(page));
3749 ClearPageUnevictable(page); 3756 ClearPageUnevictable(page);
3750 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); 3757 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3751 add_page_to_lru_list(page, lruvec, lru); 3758 add_page_to_lru_list(page, lruvec, lru);
3752 pgrescued++; 3759 pgrescued++;
3753 } 3760 }
3754 } 3761 }
3755 3762
3756 if (zone) { 3763 if (zone) {
3757 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); 3764 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
3758 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); 3765 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
3759 spin_unlock_irq(&zone->lru_lock); 3766 spin_unlock_irq(&zone->lru_lock);
3760 } 3767 }
3761 } 3768 }
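/*
 * Hedged sketch of the only caller pattern: shmem_unlock_mapping() in
 * mm/shmem.c walks the no-longer-locked mapping with a pagevec and feeds
 * each batch through check_move_unevictable_pages().  The helper below is
 * illustrative only.
 */
static void shm_unlock_rescue_sketch(struct page **batch, int nr_in_batch)
{
	if (nr_in_batch)
		check_move_unevictable_pages(batch, nr_in_batch);
	cond_resched();	/* mappings can be large; batching keeps latency down */
}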
3762 #endif /* CONFIG_SHMEM */ 3769 #endif /* CONFIG_SHMEM */
3763 3770
3764 static void warn_scan_unevictable_pages(void) 3771 static void warn_scan_unevictable_pages(void)
3765 { 3772 {
3766 printk_once(KERN_WARNING 3773 printk_once(KERN_WARNING
3767 "%s: The scan_unevictable_pages sysctl/node-interface has been " 3774 "%s: The scan_unevictable_pages sysctl/node-interface has been "
3768 "disabled for lack of a legitimate use case. If you have " 3775 "disabled for lack of a legitimate use case. If you have "
3769 "one, please send an email to linux-mm@kvack.org.\n", 3776 "one, please send an email to linux-mm@kvack.org.\n",
3770 current->comm); 3777 current->comm);
3771 } 3778 }
3772 3779
3773 /* 3780 /*
3774 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of 3781 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
3775 * all nodes' unevictable lists for evictable pages 3782 * all nodes' unevictable lists for evictable pages
3776 */ 3783 */
3777 unsigned long scan_unevictable_pages; 3784 unsigned long scan_unevictable_pages;
3778 3785
3779 int scan_unevictable_handler(struct ctl_table *table, int write, 3786 int scan_unevictable_handler(struct ctl_table *table, int write,
3780 void __user *buffer, 3787 void __user *buffer,
3781 size_t *length, loff_t *ppos) 3788 size_t *length, loff_t *ppos)
3782 { 3789 {
3783 warn_scan_unevictable_pages(); 3790 warn_scan_unevictable_pages();
3784 proc_doulongvec_minmax(table, write, buffer, length, ppos); 3791 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3785 scan_unevictable_pages = 0; 3792 scan_unevictable_pages = 0;
3786 return 0; 3793 return 0;
3787 } 3794 }
3788 3795
3789 #ifdef CONFIG_NUMA 3796 #ifdef CONFIG_NUMA
3790 /* 3797 /*
3791 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3798 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
3792 * a specified node's per zone unevictable lists for evictable pages. 3799 * a specified node's per zone unevictable lists for evictable pages.
3793 */ 3800 */
3794 3801
3795 static ssize_t read_scan_unevictable_node(struct device *dev, 3802 static ssize_t read_scan_unevictable_node(struct device *dev,
3796 struct device_attribute *attr, 3803 struct device_attribute *attr,
3797 char *buf) 3804 char *buf)
3798 { 3805 {
3799 warn_scan_unevictable_pages(); 3806 warn_scan_unevictable_pages();
3800 return sprintf(buf, "0\n"); /* always zero; should fit... */ 3807 return sprintf(buf, "0\n"); /* always zero; should fit... */
3801 } 3808 }
3802 3809
3803 static ssize_t write_scan_unevictable_node(struct device *dev, 3810 static ssize_t write_scan_unevictable_node(struct device *dev,
3804 struct device_attribute *attr, 3811 struct device_attribute *attr,
3805 const char *buf, size_t count) 3812 const char *buf, size_t count)
3806 { 3813 {
3807 warn_scan_unevictable_pages(); 3814 warn_scan_unevictable_pages();
3808 return 1; 3815 return 1;
3809 } 3816 }
3810 3817
3811 3818
3812 static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, 3819 static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3813 read_scan_unevictable_node, 3820 read_scan_unevictable_node,
3814 write_scan_unevictable_node); 3821 write_scan_unevictable_node);
3815 3822
3816 int scan_unevictable_register_node(struct node *node) 3823 int scan_unevictable_register_node(struct node *node)
3817 { 3824 {
3818 return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); 3825 return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3819 } 3826 }
3820 3827
3821 void scan_unevictable_unregister_node(struct node *node) 3828 void scan_unevictable_unregister_node(struct node *node)