Commit 3567b59aa80ac4417002bf58e35dce5c777d4164

Authored by Dave Chinner
Committed by Al Viro
1 parent acf92b485c

vmscan: reduce wind up shrinker->nr when shrinker can't do work

When a shrinker returns -1 to shrink_slab() to indicate it cannot do
any work given the current memory reclaim requirements, it adds the
entire total_scan count to shrinker->nr. The idea behind this is that
when the shrinker is next called and can do work, it will do the work
of the previously aborted shrinker call as well.

However, if a filesystem is doing lots of allocation with GFP_NOFS
set, then we get many, many more aborts from the shrinkers than we
do successful calls. The result is that shrinker->nr winds up to
its maximum permissible value (twice the current cache size) and
then, when the next shrinker call that can do work is issued, it
has enough scan count built up to free the entire cache twice over.

This manifests itself in the cache going from full to empty in a
matter of seconds, even when only a small part of the cache needs
to be emptied to free sufficient memory.

Under metadata-intensive workloads on ext4 and XFS, I'm seeing the
VFS caches increase memory consumption up to 75% of memory (no page
cache pressure) over a period of 30-60s, and then the shrinker
empties them down to zero in the space of 2-3s. This cycle repeats
over and over again, with the shrinker completely trashing the inode
and dentry caches every minute or so for as long as the workload
continues.

This behaviour was made obvious by the shrink_slab tracepoints added
earlier in the series, and made worse by the patch that corrected
the concurrent accounting of shrinker->nr.

To avoid this problem, stop repeated small increments of the total
scan value from winding shrinker->nr up to a value that can cause
the entire cache to be freed. We still need to allow it to wind up,
so use the delta as the "large scan" threshold check: if the delta
is more than a quarter of the entire cache size, then it is a large
scan and is allowed to cause lots of windup because we clearly need
to free lots of memory.

If it isn't a large scan, then limit the total scan to half the size
of the cache so that windup never increases to consume the whole
cache. Reducing the total scan limit further does not allow enough
windup to maintain the current levels of performance, whilst a
higher threshold does not prevent the windup from freeing the entire
cache under sustained workloads.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
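
To make the effect of the new cap concrete, here is a small standalone
userspace sketch of the capping logic the patch adds to shrink_slab().
This is not the kernel code itself; the variable names (nr, delta,
max_pass, total_scan) mirror those used in the patch and the numbers
are purely illustrative.

    #include <stdio.h>

    /* Sketch of the windup cap: nr is the deferred work (shrinker->nr),
     * delta is the newly calculated scan work, max_pass is the number of
     * freeable objects reported by the shrinker. */
    static unsigned long cap_total_scan(unsigned long nr, unsigned long delta,
                                        unsigned long max_pass)
    {
            unsigned long total_scan = nr + delta;

            /* Small delta (under a quarter of the cache): cap the deferred
             * work at half the cache so repeated aborted calls cannot wind
             * the count up far enough to empty the whole cache. */
            if (delta < max_pass / 4 && total_scan > max_pass / 2)
                    total_scan = max_pass / 2;

            /* Pre-existing hard limit: never scan more than twice the cache. */
            if (total_scan > max_pass * 2)
                    total_scan = max_pass * 2;

            return total_scan;
    }

    int main(void)
    {
            unsigned long max_pass = 100000;   /* objects in the cache */
            unsigned long nr = 0;              /* deferred work so far */
            int i;

            /* 1000 aborted GFP_NOFS shrink calls, each deferring a small delta */
            for (i = 0; i < 1000; i++)
                    nr = cap_total_scan(nr, 500, max_pass);

            printf("deferred scan count after 1000 aborts: %lu\n", nr);
            return 0;
    }

Run standalone, this prints 50000: the deferred scan count saturates at
half the cache size, instead of winding up to the 200000 (twice the
cache) that the pre-existing limit alone allowed.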

Showing 1 changed file with 15 additions and 0 deletions

1 /* 1 /*
2 * linux/mm/vmscan.c 2 * linux/mm/vmscan.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * 5 *
6 * Swap reorganised 29.12.95, Stephen Tweedie. 6 * Swap reorganised 29.12.95, Stephen Tweedie.
7 * kswapd added: 7.1.96 sct 7 * kswapd added: 7.1.96 sct
8 * Removed kswapd_ctl limits, and swap out as many pages as needed 8 * Removed kswapd_ctl limits, and swap out as many pages as needed
9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
11 * Multiqueue VM started 5.8.00, Rik van Riel. 11 * Multiqueue VM started 5.8.00, Rik van Riel.
12 */ 12 */
13 13
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/gfp.h> 16 #include <linux/gfp.h>
17 #include <linux/kernel_stat.h> 17 #include <linux/kernel_stat.h>
18 #include <linux/swap.h> 18 #include <linux/swap.h>
19 #include <linux/pagemap.h> 19 #include <linux/pagemap.h>
20 #include <linux/init.h> 20 #include <linux/init.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/vmstat.h> 22 #include <linux/vmstat.h>
23 #include <linux/file.h> 23 #include <linux/file.h>
24 #include <linux/writeback.h> 24 #include <linux/writeback.h>
25 #include <linux/blkdev.h> 25 #include <linux/blkdev.h>
26 #include <linux/buffer_head.h> /* for try_to_release_page(), 26 #include <linux/buffer_head.h> /* for try_to_release_page(),
27 buffer_heads_over_limit */ 27 buffer_heads_over_limit */
28 #include <linux/mm_inline.h> 28 #include <linux/mm_inline.h>
29 #include <linux/pagevec.h> 29 #include <linux/pagevec.h>
30 #include <linux/backing-dev.h> 30 #include <linux/backing-dev.h>
31 #include <linux/rmap.h> 31 #include <linux/rmap.h>
32 #include <linux/topology.h> 32 #include <linux/topology.h>
33 #include <linux/cpu.h> 33 #include <linux/cpu.h>
34 #include <linux/cpuset.h> 34 #include <linux/cpuset.h>
35 #include <linux/compaction.h> 35 #include <linux/compaction.h>
36 #include <linux/notifier.h> 36 #include <linux/notifier.h>
37 #include <linux/rwsem.h> 37 #include <linux/rwsem.h>
38 #include <linux/delay.h> 38 #include <linux/delay.h>
39 #include <linux/kthread.h> 39 #include <linux/kthread.h>
40 #include <linux/freezer.h> 40 #include <linux/freezer.h>
41 #include <linux/memcontrol.h> 41 #include <linux/memcontrol.h>
42 #include <linux/delayacct.h> 42 #include <linux/delayacct.h>
43 #include <linux/sysctl.h> 43 #include <linux/sysctl.h>
44 #include <linux/oom.h> 44 #include <linux/oom.h>
45 #include <linux/prefetch.h> 45 #include <linux/prefetch.h>
46 46
47 #include <asm/tlbflush.h> 47 #include <asm/tlbflush.h>
48 #include <asm/div64.h> 48 #include <asm/div64.h>
49 49
50 #include <linux/swapops.h> 50 #include <linux/swapops.h>
51 51
52 #include "internal.h" 52 #include "internal.h"
53 53
54 #define CREATE_TRACE_POINTS 54 #define CREATE_TRACE_POINTS
55 #include <trace/events/vmscan.h> 55 #include <trace/events/vmscan.h>
56 56
57 /* 57 /*
58 * reclaim_mode determines how the inactive list is shrunk 58 * reclaim_mode determines how the inactive list is shrunk
59 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages 59 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
60 * RECLAIM_MODE_ASYNC: Do not block 60 * RECLAIM_MODE_ASYNC: Do not block
61 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback 61 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
62 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference 62 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
63 * page from the LRU and reclaim all pages within a 63 * page from the LRU and reclaim all pages within a
64 * naturally aligned range 64 * naturally aligned range
65 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of 65 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
66 * order-0 pages and then compact the zone 66 * order-0 pages and then compact the zone
67 */ 67 */
68 typedef unsigned __bitwise__ reclaim_mode_t; 68 typedef unsigned __bitwise__ reclaim_mode_t;
69 #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) 69 #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
70 #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) 70 #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
71 #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) 71 #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
72 #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) 72 #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
73 #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) 73 #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
74 74
75 struct scan_control { 75 struct scan_control {
76 /* Incremented by the number of inactive pages that were scanned */ 76 /* Incremented by the number of inactive pages that were scanned */
77 unsigned long nr_scanned; 77 unsigned long nr_scanned;
78 78
79 /* Number of pages freed so far during a call to shrink_zones() */ 79 /* Number of pages freed so far during a call to shrink_zones() */
80 unsigned long nr_reclaimed; 80 unsigned long nr_reclaimed;
81 81
82 /* How many pages shrink_list() should reclaim */ 82 /* How many pages shrink_list() should reclaim */
83 unsigned long nr_to_reclaim; 83 unsigned long nr_to_reclaim;
84 84
85 unsigned long hibernation_mode; 85 unsigned long hibernation_mode;
86 86
87 /* This context's GFP mask */ 87 /* This context's GFP mask */
88 gfp_t gfp_mask; 88 gfp_t gfp_mask;
89 89
90 int may_writepage; 90 int may_writepage;
91 91
92 /* Can mapped pages be reclaimed? */ 92 /* Can mapped pages be reclaimed? */
93 int may_unmap; 93 int may_unmap;
94 94
95 /* Can pages be swapped as part of reclaim? */ 95 /* Can pages be swapped as part of reclaim? */
96 int may_swap; 96 int may_swap;
97 97
98 int swappiness; 98 int swappiness;
99 99
100 int order; 100 int order;
101 101
102 /* 102 /*
103 * Intend to reclaim enough continuous memory rather than reclaim 103 * Intend to reclaim enough continuous memory rather than reclaim
104 * enough amount of memory. i.e, mode for high order allocation. 104 * enough amount of memory. i.e, mode for high order allocation.
105 */ 105 */
106 reclaim_mode_t reclaim_mode; 106 reclaim_mode_t reclaim_mode;
107 107
108 /* Which cgroup do we reclaim from */ 108 /* Which cgroup do we reclaim from */
109 struct mem_cgroup *mem_cgroup; 109 struct mem_cgroup *mem_cgroup;
110 110
111 /* 111 /*
112 * Nodemask of nodes allowed by the caller. If NULL, all nodes 112 * Nodemask of nodes allowed by the caller. If NULL, all nodes
113 * are scanned. 113 * are scanned.
114 */ 114 */
115 nodemask_t *nodemask; 115 nodemask_t *nodemask;
116 }; 116 };
117 117
118 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 118 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
119 119
120 #ifdef ARCH_HAS_PREFETCH 120 #ifdef ARCH_HAS_PREFETCH
121 #define prefetch_prev_lru_page(_page, _base, _field) \ 121 #define prefetch_prev_lru_page(_page, _base, _field) \
122 do { \ 122 do { \
123 if ((_page)->lru.prev != _base) { \ 123 if ((_page)->lru.prev != _base) { \
124 struct page *prev; \ 124 struct page *prev; \
125 \ 125 \
126 prev = lru_to_page(&(_page->lru)); \ 126 prev = lru_to_page(&(_page->lru)); \
127 prefetch(&prev->_field); \ 127 prefetch(&prev->_field); \
128 } \ 128 } \
129 } while (0) 129 } while (0)
130 #else 130 #else
131 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) 131 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
132 #endif 132 #endif
133 133
134 #ifdef ARCH_HAS_PREFETCHW 134 #ifdef ARCH_HAS_PREFETCHW
135 #define prefetchw_prev_lru_page(_page, _base, _field) \ 135 #define prefetchw_prev_lru_page(_page, _base, _field) \
136 do { \ 136 do { \
137 if ((_page)->lru.prev != _base) { \ 137 if ((_page)->lru.prev != _base) { \
138 struct page *prev; \ 138 struct page *prev; \
139 \ 139 \
140 prev = lru_to_page(&(_page->lru)); \ 140 prev = lru_to_page(&(_page->lru)); \
141 prefetchw(&prev->_field); \ 141 prefetchw(&prev->_field); \
142 } \ 142 } \
143 } while (0) 143 } while (0)
144 #else 144 #else
145 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) 145 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
146 #endif 146 #endif
147 147
148 /* 148 /*
149 * From 0 .. 100. Higher means more swappy. 149 * From 0 .. 100. Higher means more swappy.
150 */ 150 */
151 int vm_swappiness = 60; 151 int vm_swappiness = 60;
152 long vm_total_pages; /* The total number of pages which the VM controls */ 152 long vm_total_pages; /* The total number of pages which the VM controls */
153 153
154 static LIST_HEAD(shrinker_list); 154 static LIST_HEAD(shrinker_list);
155 static DECLARE_RWSEM(shrinker_rwsem); 155 static DECLARE_RWSEM(shrinker_rwsem);
156 156
157 #ifdef CONFIG_CGROUP_MEM_RES_CTLR 157 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
158 #define scanning_global_lru(sc) (!(sc)->mem_cgroup) 158 #define scanning_global_lru(sc) (!(sc)->mem_cgroup)
159 #else 159 #else
160 #define scanning_global_lru(sc) (1) 160 #define scanning_global_lru(sc) (1)
161 #endif 161 #endif
162 162
163 static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, 163 static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
164 struct scan_control *sc) 164 struct scan_control *sc)
165 { 165 {
166 if (!scanning_global_lru(sc)) 166 if (!scanning_global_lru(sc))
167 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); 167 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
168 168
169 return &zone->reclaim_stat; 169 return &zone->reclaim_stat;
170 } 170 }
171 171
172 static unsigned long zone_nr_lru_pages(struct zone *zone, 172 static unsigned long zone_nr_lru_pages(struct zone *zone,
173 struct scan_control *sc, enum lru_list lru) 173 struct scan_control *sc, enum lru_list lru)
174 { 174 {
175 if (!scanning_global_lru(sc)) 175 if (!scanning_global_lru(sc))
176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); 176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
177 177
178 return zone_page_state(zone, NR_LRU_BASE + lru); 178 return zone_page_state(zone, NR_LRU_BASE + lru);
179 } 179 }
180 180
181 181
182 /* 182 /*
183 * Add a shrinker callback to be called from the vm 183 * Add a shrinker callback to be called from the vm
184 */ 184 */
185 void register_shrinker(struct shrinker *shrinker) 185 void register_shrinker(struct shrinker *shrinker)
186 { 186 {
187 shrinker->nr = 0; 187 shrinker->nr = 0;
188 down_write(&shrinker_rwsem); 188 down_write(&shrinker_rwsem);
189 list_add_tail(&shrinker->list, &shrinker_list); 189 list_add_tail(&shrinker->list, &shrinker_list);
190 up_write(&shrinker_rwsem); 190 up_write(&shrinker_rwsem);
191 } 191 }
192 EXPORT_SYMBOL(register_shrinker); 192 EXPORT_SYMBOL(register_shrinker);
193 193
194 /* 194 /*
195 * Remove one 195 * Remove one
196 */ 196 */
197 void unregister_shrinker(struct shrinker *shrinker) 197 void unregister_shrinker(struct shrinker *shrinker)
198 { 198 {
199 down_write(&shrinker_rwsem); 199 down_write(&shrinker_rwsem);
200 list_del(&shrinker->list); 200 list_del(&shrinker->list);
201 up_write(&shrinker_rwsem); 201 up_write(&shrinker_rwsem);
202 } 202 }
203 EXPORT_SYMBOL(unregister_shrinker); 203 EXPORT_SYMBOL(unregister_shrinker);
204 204
205 static inline int do_shrinker_shrink(struct shrinker *shrinker, 205 static inline int do_shrinker_shrink(struct shrinker *shrinker,
206 struct shrink_control *sc, 206 struct shrink_control *sc,
207 unsigned long nr_to_scan) 207 unsigned long nr_to_scan)
208 { 208 {
209 sc->nr_to_scan = nr_to_scan; 209 sc->nr_to_scan = nr_to_scan;
210 return (*shrinker->shrink)(shrinker, sc); 210 return (*shrinker->shrink)(shrinker, sc);
211 } 211 }
212 212
213 #define SHRINK_BATCH 128 213 #define SHRINK_BATCH 128
214 /* 214 /*
215 * Call the shrink functions to age shrinkable caches 215 * Call the shrink functions to age shrinkable caches
216 * 216 *
217 * Here we assume it costs one seek to replace a lru page and that it also 217 * Here we assume it costs one seek to replace a lru page and that it also
218 * takes a seek to recreate a cache object. With this in mind we age equal 218 * takes a seek to recreate a cache object. With this in mind we age equal
219 * percentages of the lru and ageable caches. This should balance the seeks 219 * percentages of the lru and ageable caches. This should balance the seeks
220 * generated by these structures. 220 * generated by these structures.
221 * 221 *
222 * If the vm encountered mapped pages on the LRU it increase the pressure on 222 * If the vm encountered mapped pages on the LRU it increase the pressure on
223 * slab to avoid swapping. 223 * slab to avoid swapping.
224 * 224 *
225 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 225 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
226 * 226 *
227 * `lru_pages' represents the number of on-LRU pages in all the zones which 227 * `lru_pages' represents the number of on-LRU pages in all the zones which
228 * are eligible for the caller's allocation attempt. It is used for balancing 228 * are eligible for the caller's allocation attempt. It is used for balancing
229 * slab reclaim versus page reclaim. 229 * slab reclaim versus page reclaim.
230 * 230 *
231 * Returns the number of slab objects which we shrunk. 231 * Returns the number of slab objects which we shrunk.
232 */ 232 */
233 unsigned long shrink_slab(struct shrink_control *shrink, 233 unsigned long shrink_slab(struct shrink_control *shrink,
234 unsigned long nr_pages_scanned, 234 unsigned long nr_pages_scanned,
235 unsigned long lru_pages) 235 unsigned long lru_pages)
236 { 236 {
237 struct shrinker *shrinker; 237 struct shrinker *shrinker;
238 unsigned long ret = 0; 238 unsigned long ret = 0;
239 239
240 if (nr_pages_scanned == 0) 240 if (nr_pages_scanned == 0)
241 nr_pages_scanned = SWAP_CLUSTER_MAX; 241 nr_pages_scanned = SWAP_CLUSTER_MAX;
242 242
243 if (!down_read_trylock(&shrinker_rwsem)) { 243 if (!down_read_trylock(&shrinker_rwsem)) {
244 /* Assume we'll be able to shrink next time */ 244 /* Assume we'll be able to shrink next time */
245 ret = 1; 245 ret = 1;
246 goto out; 246 goto out;
247 } 247 }
248 248
249 list_for_each_entry(shrinker, &shrinker_list, list) { 249 list_for_each_entry(shrinker, &shrinker_list, list) {
250 unsigned long long delta; 250 unsigned long long delta;
251 unsigned long total_scan; 251 unsigned long total_scan;
252 unsigned long max_pass; 252 unsigned long max_pass;
253 int shrink_ret = 0; 253 int shrink_ret = 0;
254 long nr; 254 long nr;
255 long new_nr; 255 long new_nr;
256 256
257 /* 257 /*
258 * copy the current shrinker scan count into a local variable 258 * copy the current shrinker scan count into a local variable
259 * and zero it so that other concurrent shrinker invocations 259 * and zero it so that other concurrent shrinker invocations
260 * don't also do this scanning work. 260 * don't also do this scanning work.
261 */ 261 */
262 do { 262 do {
263 nr = shrinker->nr; 263 nr = shrinker->nr;
264 } while (cmpxchg(&shrinker->nr, nr, 0) != nr); 264 } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
265 265
266 total_scan = nr; 266 total_scan = nr;
267 max_pass = do_shrinker_shrink(shrinker, shrink, 0); 267 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
268 delta = (4 * nr_pages_scanned) / shrinker->seeks; 268 delta = (4 * nr_pages_scanned) / shrinker->seeks;
269 delta *= max_pass; 269 delta *= max_pass;
270 do_div(delta, lru_pages + 1); 270 do_div(delta, lru_pages + 1);
271 total_scan += delta; 271 total_scan += delta;
272 if (total_scan < 0) { 272 if (total_scan < 0) {
273 printk(KERN_ERR "shrink_slab: %pF negative objects to " 273 printk(KERN_ERR "shrink_slab: %pF negative objects to "
274 "delete nr=%ld\n", 274 "delete nr=%ld\n",
275 shrinker->shrink, total_scan); 275 shrinker->shrink, total_scan);
276 total_scan = max_pass; 276 total_scan = max_pass;
277 } 277 }
278 278
279 /* 279 /*
280 * We need to avoid excessive windup on filesystem shrinkers
281 * due to large numbers of GFP_NOFS allocations causing the
282 * shrinkers to return -1 all the time. This results in a large
283 * nr being built up so when a shrink that can do some work
284 * comes along it empties the entire cache due to nr >>>
285 * max_pass. This is bad for sustaining a working set in
286 * memory.
287 *
288 * Hence only allow the shrinker to scan the entire cache when
289 * a large delta change is calculated directly.
290 */
291 if (delta < max_pass / 4)
292 total_scan = min(total_scan, max_pass / 2);
293
294 /*
280 * Avoid risking looping forever due to too large nr value: 295 * Avoid risking looping forever due to too large nr value:
281 * never try to free more than twice the estimate number of 296 * never try to free more than twice the estimate number of
282 * freeable entries. 297 * freeable entries.
283 */ 298 */
284 if (total_scan > max_pass * 2) 299 if (total_scan > max_pass * 2)
285 total_scan = max_pass * 2; 300 total_scan = max_pass * 2;
286 301
287 trace_mm_shrink_slab_start(shrinker, shrink, nr, 302 trace_mm_shrink_slab_start(shrinker, shrink, nr,
288 nr_pages_scanned, lru_pages, 303 nr_pages_scanned, lru_pages,
289 max_pass, delta, total_scan); 304 max_pass, delta, total_scan);
290 305
291 while (total_scan >= SHRINK_BATCH) { 306 while (total_scan >= SHRINK_BATCH) {
292 long this_scan = SHRINK_BATCH; 307 long this_scan = SHRINK_BATCH;
293 int nr_before; 308 int nr_before;
294 309
295 nr_before = do_shrinker_shrink(shrinker, shrink, 0); 310 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
296 shrink_ret = do_shrinker_shrink(shrinker, shrink, 311 shrink_ret = do_shrinker_shrink(shrinker, shrink,
297 this_scan); 312 this_scan);
298 if (shrink_ret == -1) 313 if (shrink_ret == -1)
299 break; 314 break;
300 if (shrink_ret < nr_before) 315 if (shrink_ret < nr_before)
301 ret += nr_before - shrink_ret; 316 ret += nr_before - shrink_ret;
302 count_vm_events(SLABS_SCANNED, this_scan); 317 count_vm_events(SLABS_SCANNED, this_scan);
303 total_scan -= this_scan; 318 total_scan -= this_scan;
304 319
305 cond_resched(); 320 cond_resched();
306 } 321 }
307 322
308 /* 323 /*
309 * move the unused scan count back into the shrinker in a 324 * move the unused scan count back into the shrinker in a
310 * manner that handles concurrent updates. If we exhausted the 325 * manner that handles concurrent updates. If we exhausted the
311 * scan, there is no need to do an update. 326 * scan, there is no need to do an update.
312 */ 327 */
313 do { 328 do {
314 nr = shrinker->nr; 329 nr = shrinker->nr;
315 new_nr = total_scan + nr; 330 new_nr = total_scan + nr;
316 if (total_scan <= 0) 331 if (total_scan <= 0)
317 break; 332 break;
318 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); 333 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
319 334
320 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); 335 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
321 } 336 }
322 up_read(&shrinker_rwsem); 337 up_read(&shrinker_rwsem);
323 out: 338 out:
324 cond_resched(); 339 cond_resched();
325 return ret; 340 return ret;
326 } 341 }
327 342
328 static void set_reclaim_mode(int priority, struct scan_control *sc, 343 static void set_reclaim_mode(int priority, struct scan_control *sc,
329 bool sync) 344 bool sync)
330 { 345 {
331 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; 346 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
332 347
333 /* 348 /*
334 * Initially assume we are entering either lumpy reclaim or 349 * Initially assume we are entering either lumpy reclaim or
335 * reclaim/compaction.Depending on the order, we will either set the 350 * reclaim/compaction.Depending on the order, we will either set the
336 * sync mode or just reclaim order-0 pages later. 351 * sync mode or just reclaim order-0 pages later.
337 */ 352 */
338 if (COMPACTION_BUILD) 353 if (COMPACTION_BUILD)
339 sc->reclaim_mode = RECLAIM_MODE_COMPACTION; 354 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
340 else 355 else
341 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; 356 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
342 357
343 /* 358 /*
344 * Avoid using lumpy reclaim or reclaim/compaction if possible by 359 * Avoid using lumpy reclaim or reclaim/compaction if possible by
345 * restricting when its set to either costly allocations or when 360 * restricting when its set to either costly allocations or when
346 * under memory pressure 361 * under memory pressure
347 */ 362 */
348 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 363 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
349 sc->reclaim_mode |= syncmode; 364 sc->reclaim_mode |= syncmode;
350 else if (sc->order && priority < DEF_PRIORITY - 2) 365 else if (sc->order && priority < DEF_PRIORITY - 2)
351 sc->reclaim_mode |= syncmode; 366 sc->reclaim_mode |= syncmode;
352 else 367 else
353 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; 368 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
354 } 369 }
355 370
356 static void reset_reclaim_mode(struct scan_control *sc) 371 static void reset_reclaim_mode(struct scan_control *sc)
357 { 372 {
358 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; 373 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
359 } 374 }
360 375
361 static inline int is_page_cache_freeable(struct page *page) 376 static inline int is_page_cache_freeable(struct page *page)
362 { 377 {
363 /* 378 /*
364 * A freeable page cache page is referenced only by the caller 379 * A freeable page cache page is referenced only by the caller
365 * that isolated the page, the page cache radix tree and 380 * that isolated the page, the page cache radix tree and
366 * optional buffer heads at page->private. 381 * optional buffer heads at page->private.
367 */ 382 */
368 return page_count(page) - page_has_private(page) == 2; 383 return page_count(page) - page_has_private(page) == 2;
369 } 384 }
370 385
371 static int may_write_to_queue(struct backing_dev_info *bdi, 386 static int may_write_to_queue(struct backing_dev_info *bdi,
372 struct scan_control *sc) 387 struct scan_control *sc)
373 { 388 {
374 if (current->flags & PF_SWAPWRITE) 389 if (current->flags & PF_SWAPWRITE)
375 return 1; 390 return 1;
376 if (!bdi_write_congested(bdi)) 391 if (!bdi_write_congested(bdi))
377 return 1; 392 return 1;
378 if (bdi == current->backing_dev_info) 393 if (bdi == current->backing_dev_info)
379 return 1; 394 return 1;
380 395
381 /* lumpy reclaim for hugepage often need a lot of write */ 396 /* lumpy reclaim for hugepage often need a lot of write */
382 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 397 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
383 return 1; 398 return 1;
384 return 0; 399 return 0;
385 } 400 }
386 401
387 /* 402 /*
388 * We detected a synchronous write error writing a page out. Probably 403 * We detected a synchronous write error writing a page out. Probably
389 * -ENOSPC. We need to propagate that into the address_space for a subsequent 404 * -ENOSPC. We need to propagate that into the address_space for a subsequent
390 * fsync(), msync() or close(). 405 * fsync(), msync() or close().
391 * 406 *
392 * The tricky part is that after writepage we cannot touch the mapping: nothing 407 * The tricky part is that after writepage we cannot touch the mapping: nothing
393 * prevents it from being freed up. But we have a ref on the page and once 408 * prevents it from being freed up. But we have a ref on the page and once
394 * that page is locked, the mapping is pinned. 409 * that page is locked, the mapping is pinned.
395 * 410 *
396 * We're allowed to run sleeping lock_page() here because we know the caller has 411 * We're allowed to run sleeping lock_page() here because we know the caller has
397 * __GFP_FS. 412 * __GFP_FS.
398 */ 413 */
399 static void handle_write_error(struct address_space *mapping, 414 static void handle_write_error(struct address_space *mapping,
400 struct page *page, int error) 415 struct page *page, int error)
401 { 416 {
402 lock_page(page); 417 lock_page(page);
403 if (page_mapping(page) == mapping) 418 if (page_mapping(page) == mapping)
404 mapping_set_error(mapping, error); 419 mapping_set_error(mapping, error);
405 unlock_page(page); 420 unlock_page(page);
406 } 421 }
407 422
408 /* possible outcome of pageout() */ 423 /* possible outcome of pageout() */
409 typedef enum { 424 typedef enum {
410 /* failed to write page out, page is locked */ 425 /* failed to write page out, page is locked */
411 PAGE_KEEP, 426 PAGE_KEEP,
412 /* move page to the active list, page is locked */ 427 /* move page to the active list, page is locked */
413 PAGE_ACTIVATE, 428 PAGE_ACTIVATE,
414 /* page has been sent to the disk successfully, page is unlocked */ 429 /* page has been sent to the disk successfully, page is unlocked */
415 PAGE_SUCCESS, 430 PAGE_SUCCESS,
416 /* page is clean and locked */ 431 /* page is clean and locked */
417 PAGE_CLEAN, 432 PAGE_CLEAN,
418 } pageout_t; 433 } pageout_t;
419 434
420 /* 435 /*
421 * pageout is called by shrink_page_list() for each dirty page. 436 * pageout is called by shrink_page_list() for each dirty page.
422 * Calls ->writepage(). 437 * Calls ->writepage().
423 */ 438 */
424 static pageout_t pageout(struct page *page, struct address_space *mapping, 439 static pageout_t pageout(struct page *page, struct address_space *mapping,
425 struct scan_control *sc) 440 struct scan_control *sc)
426 { 441 {
427 /* 442 /*
428 * If the page is dirty, only perform writeback if that write 443 * If the page is dirty, only perform writeback if that write
429 * will be non-blocking. To prevent this allocation from being 444 * will be non-blocking. To prevent this allocation from being
430 * stalled by pagecache activity. But note that there may be 445 * stalled by pagecache activity. But note that there may be
431 * stalls if we need to run get_block(). We could test 446 * stalls if we need to run get_block(). We could test
432 * PagePrivate for that. 447 * PagePrivate for that.
433 * 448 *
434 * If this process is currently in __generic_file_aio_write() against 449 * If this process is currently in __generic_file_aio_write() against
435 * this page's queue, we can perform writeback even if that 450 * this page's queue, we can perform writeback even if that
436 * will block. 451 * will block.
437 * 452 *
438 * If the page is swapcache, write it back even if that would 453 * If the page is swapcache, write it back even if that would
439 * block, for some throttling. This happens by accident, because 454 * block, for some throttling. This happens by accident, because
440 * swap_backing_dev_info is bust: it doesn't reflect the 455 * swap_backing_dev_info is bust: it doesn't reflect the
441 * congestion state of the swapdevs. Easy to fix, if needed. 456 * congestion state of the swapdevs. Easy to fix, if needed.
442 */ 457 */
443 if (!is_page_cache_freeable(page)) 458 if (!is_page_cache_freeable(page))
444 return PAGE_KEEP; 459 return PAGE_KEEP;
445 if (!mapping) { 460 if (!mapping) {
446 /* 461 /*
447 * Some data journaling orphaned pages can have 462 * Some data journaling orphaned pages can have
448 * page->mapping == NULL while being dirty with clean buffers. 463 * page->mapping == NULL while being dirty with clean buffers.
449 */ 464 */
450 if (page_has_private(page)) { 465 if (page_has_private(page)) {
451 if (try_to_free_buffers(page)) { 466 if (try_to_free_buffers(page)) {
452 ClearPageDirty(page); 467 ClearPageDirty(page);
453 printk("%s: orphaned page\n", __func__); 468 printk("%s: orphaned page\n", __func__);
454 return PAGE_CLEAN; 469 return PAGE_CLEAN;
455 } 470 }
456 } 471 }
457 return PAGE_KEEP; 472 return PAGE_KEEP;
458 } 473 }
459 if (mapping->a_ops->writepage == NULL) 474 if (mapping->a_ops->writepage == NULL)
460 return PAGE_ACTIVATE; 475 return PAGE_ACTIVATE;
461 if (!may_write_to_queue(mapping->backing_dev_info, sc)) 476 if (!may_write_to_queue(mapping->backing_dev_info, sc))
462 return PAGE_KEEP; 477 return PAGE_KEEP;
463 478
464 if (clear_page_dirty_for_io(page)) { 479 if (clear_page_dirty_for_io(page)) {
465 int res; 480 int res;
466 struct writeback_control wbc = { 481 struct writeback_control wbc = {
467 .sync_mode = WB_SYNC_NONE, 482 .sync_mode = WB_SYNC_NONE,
468 .nr_to_write = SWAP_CLUSTER_MAX, 483 .nr_to_write = SWAP_CLUSTER_MAX,
469 .range_start = 0, 484 .range_start = 0,
470 .range_end = LLONG_MAX, 485 .range_end = LLONG_MAX,
471 .for_reclaim = 1, 486 .for_reclaim = 1,
472 }; 487 };
473 488
474 SetPageReclaim(page); 489 SetPageReclaim(page);
475 res = mapping->a_ops->writepage(page, &wbc); 490 res = mapping->a_ops->writepage(page, &wbc);
476 if (res < 0) 491 if (res < 0)
477 handle_write_error(mapping, page, res); 492 handle_write_error(mapping, page, res);
478 if (res == AOP_WRITEPAGE_ACTIVATE) { 493 if (res == AOP_WRITEPAGE_ACTIVATE) {
479 ClearPageReclaim(page); 494 ClearPageReclaim(page);
480 return PAGE_ACTIVATE; 495 return PAGE_ACTIVATE;
481 } 496 }
482 497
483 /* 498 /*
484 * Wait on writeback if requested to. This happens when 499 * Wait on writeback if requested to. This happens when
485 * direct reclaiming a large contiguous area and the 500 * direct reclaiming a large contiguous area and the
486 * first attempt to free a range of pages fails. 501 * first attempt to free a range of pages fails.
487 */ 502 */
488 if (PageWriteback(page) && 503 if (PageWriteback(page) &&
489 (sc->reclaim_mode & RECLAIM_MODE_SYNC)) 504 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
490 wait_on_page_writeback(page); 505 wait_on_page_writeback(page);
491 506
492 if (!PageWriteback(page)) { 507 if (!PageWriteback(page)) {
493 /* synchronous write or broken a_ops? */ 508 /* synchronous write or broken a_ops? */
494 ClearPageReclaim(page); 509 ClearPageReclaim(page);
495 } 510 }
496 trace_mm_vmscan_writepage(page, 511 trace_mm_vmscan_writepage(page,
497 trace_reclaim_flags(page, sc->reclaim_mode)); 512 trace_reclaim_flags(page, sc->reclaim_mode));
498 inc_zone_page_state(page, NR_VMSCAN_WRITE); 513 inc_zone_page_state(page, NR_VMSCAN_WRITE);
499 return PAGE_SUCCESS; 514 return PAGE_SUCCESS;
500 } 515 }
501 516
502 return PAGE_CLEAN; 517 return PAGE_CLEAN;
503 } 518 }
504 519
505 /* 520 /*
506 * Same as remove_mapping, but if the page is removed from the mapping, it 521 * Same as remove_mapping, but if the page is removed from the mapping, it
507 * gets returned with a refcount of 0. 522 * gets returned with a refcount of 0.
508 */ 523 */
509 static int __remove_mapping(struct address_space *mapping, struct page *page) 524 static int __remove_mapping(struct address_space *mapping, struct page *page)
510 { 525 {
511 BUG_ON(!PageLocked(page)); 526 BUG_ON(!PageLocked(page));
512 BUG_ON(mapping != page_mapping(page)); 527 BUG_ON(mapping != page_mapping(page));
513 528
514 spin_lock_irq(&mapping->tree_lock); 529 spin_lock_irq(&mapping->tree_lock);
515 /* 530 /*
516 * The non racy check for a busy page. 531 * The non racy check for a busy page.
517 * 532 *
518 * Must be careful with the order of the tests. When someone has 533 * Must be careful with the order of the tests. When someone has
519 * a ref to the page, it may be possible that they dirty it then 534 * a ref to the page, it may be possible that they dirty it then
520 * drop the reference. So if PageDirty is tested before page_count 535 * drop the reference. So if PageDirty is tested before page_count
521 * here, then the following race may occur: 536 * here, then the following race may occur:
522 * 537 *
523 * get_user_pages(&page); 538 * get_user_pages(&page);
524 * [user mapping goes away] 539 * [user mapping goes away]
525 * write_to(page); 540 * write_to(page);
526 * !PageDirty(page) [good] 541 * !PageDirty(page) [good]
527 * SetPageDirty(page); 542 * SetPageDirty(page);
528 * put_page(page); 543 * put_page(page);
529 * !page_count(page) [good, discard it] 544 * !page_count(page) [good, discard it]
530 * 545 *
531 * [oops, our write_to data is lost] 546 * [oops, our write_to data is lost]
532 * 547 *
533 * Reversing the order of the tests ensures such a situation cannot 548 * Reversing the order of the tests ensures such a situation cannot
534 * escape unnoticed. The smp_rmb is needed to ensure the page->flags 549 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
535 * load is not satisfied before that of page->_count. 550 * load is not satisfied before that of page->_count.
536 * 551 *
537 * Note that if SetPageDirty is always performed via set_page_dirty, 552 * Note that if SetPageDirty is always performed via set_page_dirty,
538 * and thus under tree_lock, then this ordering is not required. 553 * and thus under tree_lock, then this ordering is not required.
539 */ 554 */
540 if (!page_freeze_refs(page, 2)) 555 if (!page_freeze_refs(page, 2))
541 goto cannot_free; 556 goto cannot_free;
542 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ 557 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
543 if (unlikely(PageDirty(page))) { 558 if (unlikely(PageDirty(page))) {
544 page_unfreeze_refs(page, 2); 559 page_unfreeze_refs(page, 2);
545 goto cannot_free; 560 goto cannot_free;
546 } 561 }
547 562
548 if (PageSwapCache(page)) { 563 if (PageSwapCache(page)) {
549 swp_entry_t swap = { .val = page_private(page) }; 564 swp_entry_t swap = { .val = page_private(page) };
550 __delete_from_swap_cache(page); 565 __delete_from_swap_cache(page);
551 spin_unlock_irq(&mapping->tree_lock); 566 spin_unlock_irq(&mapping->tree_lock);
552 swapcache_free(swap, page); 567 swapcache_free(swap, page);
553 } else { 568 } else {
554 void (*freepage)(struct page *); 569 void (*freepage)(struct page *);
555 570
556 freepage = mapping->a_ops->freepage; 571 freepage = mapping->a_ops->freepage;
557 572
558 __delete_from_page_cache(page); 573 __delete_from_page_cache(page);
559 spin_unlock_irq(&mapping->tree_lock); 574 spin_unlock_irq(&mapping->tree_lock);
560 mem_cgroup_uncharge_cache_page(page); 575 mem_cgroup_uncharge_cache_page(page);
561 576
562 if (freepage != NULL) 577 if (freepage != NULL)
563 freepage(page); 578 freepage(page);
564 } 579 }
565 580
566 return 1; 581 return 1;
567 582
568 cannot_free: 583 cannot_free:
569 spin_unlock_irq(&mapping->tree_lock); 584 spin_unlock_irq(&mapping->tree_lock);
570 return 0; 585 return 0;
571 } 586 }
572 587
573 /* 588 /*
574 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 589 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
575 * someone else has a ref on the page, abort and return 0. If it was 590 * someone else has a ref on the page, abort and return 0. If it was
576 * successfully detached, return 1. Assumes the caller has a single ref on 591 * successfully detached, return 1. Assumes the caller has a single ref on
577 * this page. 592 * this page.
578 */ 593 */
579 int remove_mapping(struct address_space *mapping, struct page *page) 594 int remove_mapping(struct address_space *mapping, struct page *page)
580 { 595 {
581 if (__remove_mapping(mapping, page)) { 596 if (__remove_mapping(mapping, page)) {
582 /* 597 /*
583 * Unfreezing the refcount with 1 rather than 2 effectively 598 * Unfreezing the refcount with 1 rather than 2 effectively
584 * drops the pagecache ref for us without requiring another 599 * drops the pagecache ref for us without requiring another
585 * atomic operation. 600 * atomic operation.
586 */ 601 */
587 page_unfreeze_refs(page, 1); 602 page_unfreeze_refs(page, 1);
588 return 1; 603 return 1;
589 } 604 }
590 return 0; 605 return 0;
591 } 606 }
592 607
593 /** 608 /**
594 * putback_lru_page - put previously isolated page onto appropriate LRU list 609 * putback_lru_page - put previously isolated page onto appropriate LRU list
595 * @page: page to be put back to appropriate lru list 610 * @page: page to be put back to appropriate lru list
596 * 611 *
597 * Add previously isolated @page to appropriate LRU list. 612 * Add previously isolated @page to appropriate LRU list.
598 * Page may still be unevictable for other reasons. 613 * Page may still be unevictable for other reasons.
599 * 614 *
600 * lru_lock must not be held, interrupts must be enabled. 615 * lru_lock must not be held, interrupts must be enabled.
601 */ 616 */
602 void putback_lru_page(struct page *page) 617 void putback_lru_page(struct page *page)
603 { 618 {
604 int lru; 619 int lru;
605 int active = !!TestClearPageActive(page); 620 int active = !!TestClearPageActive(page);
606 int was_unevictable = PageUnevictable(page); 621 int was_unevictable = PageUnevictable(page);
607 622
608 VM_BUG_ON(PageLRU(page)); 623 VM_BUG_ON(PageLRU(page));
609 624
610 redo: 625 redo:
611 ClearPageUnevictable(page); 626 ClearPageUnevictable(page);
612 627
613 if (page_evictable(page, NULL)) { 628 if (page_evictable(page, NULL)) {
614 /* 629 /*
615 * For evictable pages, we can use the cache. 630 * For evictable pages, we can use the cache.
616 * In event of a race, worst case is we end up with an 631 * In event of a race, worst case is we end up with an
617 * unevictable page on [in]active list. 632 * unevictable page on [in]active list.
618 * We know how to handle that. 633 * We know how to handle that.
619 */ 634 */
620 lru = active + page_lru_base_type(page); 635 lru = active + page_lru_base_type(page);
621 lru_cache_add_lru(page, lru); 636 lru_cache_add_lru(page, lru);
622 } else { 637 } else {
623 /* 638 /*
624 * Put unevictable pages directly on zone's unevictable 639 * Put unevictable pages directly on zone's unevictable
625 * list. 640 * list.
626 */ 641 */
627 lru = LRU_UNEVICTABLE; 642 lru = LRU_UNEVICTABLE;
628 add_page_to_unevictable_list(page); 643 add_page_to_unevictable_list(page);
629 /* 644 /*
630 * When racing with an mlock clearing (page is 645 * When racing with an mlock clearing (page is
631 * unlocked), make sure that if the other thread does 646 * unlocked), make sure that if the other thread does
632 * not observe our setting of PG_lru and fails 647 * not observe our setting of PG_lru and fails
633 * isolation, we see PG_mlocked cleared below and move 648 * isolation, we see PG_mlocked cleared below and move
634 * the page back to the evictable list. 649 * the page back to the evictable list.
635 * 650 *
636 * The other side is TestClearPageMlocked(). 651 * The other side is TestClearPageMlocked().
637 */ 652 */
638 smp_mb(); 653 smp_mb();
639 } 654 }
640 655
641 /* 656 /*
642 * page's status can change while we move it among lru. If an evictable 657 * page's status can change while we move it among lru. If an evictable
643 * page is on unevictable list, it never be freed. To avoid that, 658 * page is on unevictable list, it never be freed. To avoid that,
644 * check after we added it to the list, again. 659 * check after we added it to the list, again.
645 */ 660 */
646 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { 661 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
647 if (!isolate_lru_page(page)) { 662 if (!isolate_lru_page(page)) {
648 put_page(page); 663 put_page(page);
649 goto redo; 664 goto redo;
650 } 665 }
651 /* This means someone else dropped this page from LRU 666 /* This means someone else dropped this page from LRU
652 * So, it will be freed or putback to LRU again. There is 667 * So, it will be freed or putback to LRU again. There is
653 * nothing to do here. 668 * nothing to do here.
654 */ 669 */
655 } 670 }
656 671
657 if (was_unevictable && lru != LRU_UNEVICTABLE) 672 if (was_unevictable && lru != LRU_UNEVICTABLE)
658 count_vm_event(UNEVICTABLE_PGRESCUED); 673 count_vm_event(UNEVICTABLE_PGRESCUED);
659 else if (!was_unevictable && lru == LRU_UNEVICTABLE) 674 else if (!was_unevictable && lru == LRU_UNEVICTABLE)
660 count_vm_event(UNEVICTABLE_PGCULLED); 675 count_vm_event(UNEVICTABLE_PGCULLED);
661 676
662 put_page(page); /* drop ref from isolate */ 677 put_page(page); /* drop ref from isolate */
663 } 678 }
664 679
665 enum page_references { 680 enum page_references {
666 PAGEREF_RECLAIM, 681 PAGEREF_RECLAIM,
667 PAGEREF_RECLAIM_CLEAN, 682 PAGEREF_RECLAIM_CLEAN,
668 PAGEREF_KEEP, 683 PAGEREF_KEEP,
669 PAGEREF_ACTIVATE, 684 PAGEREF_ACTIVATE,
670 }; 685 };
671 686
672 static enum page_references page_check_references(struct page *page, 687 static enum page_references page_check_references(struct page *page,
673 struct scan_control *sc) 688 struct scan_control *sc)
674 { 689 {
675 int referenced_ptes, referenced_page; 690 int referenced_ptes, referenced_page;
676 unsigned long vm_flags; 691 unsigned long vm_flags;
677 692
678 referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); 693 referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
679 referenced_page = TestClearPageReferenced(page); 694 referenced_page = TestClearPageReferenced(page);
680 695
681 /* Lumpy reclaim - ignore references */ 696 /* Lumpy reclaim - ignore references */
682 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) 697 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
683 return PAGEREF_RECLAIM; 698 return PAGEREF_RECLAIM;
684 699
685 /* 700 /*
686 * Mlock lost the isolation race with us. Let try_to_unmap() 701 * Mlock lost the isolation race with us. Let try_to_unmap()
687 * move the page to the unevictable list. 702 * move the page to the unevictable list.
688 */ 703 */
689 if (vm_flags & VM_LOCKED) 704 if (vm_flags & VM_LOCKED)
690 return PAGEREF_RECLAIM; 705 return PAGEREF_RECLAIM;
691 706
692 if (referenced_ptes) { 707 if (referenced_ptes) {
693 if (PageAnon(page)) 708 if (PageAnon(page))
694 return PAGEREF_ACTIVATE; 709 return PAGEREF_ACTIVATE;
695 /* 710 /*
696 * All mapped pages start out with page table 711 * All mapped pages start out with page table
697 * references from the instantiating fault, so we need 712 * references from the instantiating fault, so we need
698 * to look twice if a mapped file page is used more 713 * to look twice if a mapped file page is used more
699 * than once. 714 * than once.
700 * 715 *
701 * Mark it and spare it for another trip around the 716 * Mark it and spare it for another trip around the
702 * inactive list. Another page table reference will 717 * inactive list. Another page table reference will
703 * lead to its activation. 718 * lead to its activation.
704 * 719 *
705 * Note: the mark is set for activated pages as well 720 * Note: the mark is set for activated pages as well
706 * so that recently deactivated but used pages are 721 * so that recently deactivated but used pages are
707 * quickly recovered. 722 * quickly recovered.
708 */ 723 */
709 SetPageReferenced(page); 724 SetPageReferenced(page);
710 725
711 if (referenced_page) 726 if (referenced_page)
712 return PAGEREF_ACTIVATE; 727 return PAGEREF_ACTIVATE;
713 728
714 return PAGEREF_KEEP; 729 return PAGEREF_KEEP;
715 } 730 }
716 731
717 /* Reclaim if clean, defer dirty pages to writeback */ 732 /* Reclaim if clean, defer dirty pages to writeback */
718 if (referenced_page && !PageSwapBacked(page)) 733 if (referenced_page && !PageSwapBacked(page))
719 return PAGEREF_RECLAIM_CLEAN; 734 return PAGEREF_RECLAIM_CLEAN;
720 735
721 return PAGEREF_RECLAIM; 736 return PAGEREF_RECLAIM;
722 } 737 }
723 738
724 static noinline_for_stack void free_page_list(struct list_head *free_pages) 739 static noinline_for_stack void free_page_list(struct list_head *free_pages)
725 { 740 {
726 struct pagevec freed_pvec; 741 struct pagevec freed_pvec;
727 struct page *page, *tmp; 742 struct page *page, *tmp;
728 743
729 pagevec_init(&freed_pvec, 1); 744 pagevec_init(&freed_pvec, 1);
730 745
731 list_for_each_entry_safe(page, tmp, free_pages, lru) { 746 list_for_each_entry_safe(page, tmp, free_pages, lru) {
732 list_del(&page->lru); 747 list_del(&page->lru);
733 if (!pagevec_add(&freed_pvec, page)) { 748 if (!pagevec_add(&freed_pvec, page)) {
734 __pagevec_free(&freed_pvec); 749 __pagevec_free(&freed_pvec);
735 pagevec_reinit(&freed_pvec); 750 pagevec_reinit(&freed_pvec);
736 } 751 }
737 } 752 }
738 753
739 pagevec_free(&freed_pvec); 754 pagevec_free(&freed_pvec);
740 } 755 }
741 756
742 /* 757 /*
743 * shrink_page_list() returns the number of reclaimed pages 758 * shrink_page_list() returns the number of reclaimed pages
744 */ 759 */
745 static unsigned long shrink_page_list(struct list_head *page_list, 760 static unsigned long shrink_page_list(struct list_head *page_list,
746 struct zone *zone, 761 struct zone *zone,
747 struct scan_control *sc) 762 struct scan_control *sc)
748 { 763 {
749 LIST_HEAD(ret_pages); 764 LIST_HEAD(ret_pages);
750 LIST_HEAD(free_pages); 765 LIST_HEAD(free_pages);
751 int pgactivate = 0; 766 int pgactivate = 0;
752 unsigned long nr_dirty = 0; 767 unsigned long nr_dirty = 0;
753 unsigned long nr_congested = 0; 768 unsigned long nr_congested = 0;
754 unsigned long nr_reclaimed = 0; 769 unsigned long nr_reclaimed = 0;
755 770
756 cond_resched(); 771 cond_resched();
757 772
758 while (!list_empty(page_list)) { 773 while (!list_empty(page_list)) {
759 enum page_references references; 774 enum page_references references;
760 struct address_space *mapping; 775 struct address_space *mapping;
761 struct page *page; 776 struct page *page;
762 int may_enter_fs; 777 int may_enter_fs;
763 778
764 cond_resched(); 779 cond_resched();
765 780
766 page = lru_to_page(page_list); 781 page = lru_to_page(page_list);
767 list_del(&page->lru); 782 list_del(&page->lru);
768 783
769 if (!trylock_page(page)) 784 if (!trylock_page(page))
770 goto keep; 785 goto keep;
771 786
772 VM_BUG_ON(PageActive(page)); 787 VM_BUG_ON(PageActive(page));
773 VM_BUG_ON(page_zone(page) != zone); 788 VM_BUG_ON(page_zone(page) != zone);
774 789
775 sc->nr_scanned++; 790 sc->nr_scanned++;
776 791
777 if (unlikely(!page_evictable(page, NULL))) 792 if (unlikely(!page_evictable(page, NULL)))
778 goto cull_mlocked; 793 goto cull_mlocked;
779 794
780 if (!sc->may_unmap && page_mapped(page)) 795 if (!sc->may_unmap && page_mapped(page))
781 goto keep_locked; 796 goto keep_locked;
782 797
783 /* Double the slab pressure for mapped and swapcache pages */ 798 /* Double the slab pressure for mapped and swapcache pages */
784 if (page_mapped(page) || PageSwapCache(page)) 799 if (page_mapped(page) || PageSwapCache(page))
785 sc->nr_scanned++; 800 sc->nr_scanned++;
786 801
787 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 802 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
788 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 803 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
789 804
790 if (PageWriteback(page)) { 805 if (PageWriteback(page)) {
791 /* 806 /*
792 * Synchronous reclaim is performed in two passes, 807 * Synchronous reclaim is performed in two passes,
793 * first an asynchronous pass over the list to 808 * first an asynchronous pass over the list to
794 * start parallel writeback, and a second synchronous 809 * start parallel writeback, and a second synchronous
795 * pass to wait for the IO to complete. Wait here 810 * pass to wait for the IO to complete. Wait here
796 * for any page for which writeback has already 811 * for any page for which writeback has already
797 * started. 812 * started.
798 */ 813 */
799 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && 814 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
800 may_enter_fs) 815 may_enter_fs)
801 wait_on_page_writeback(page); 816 wait_on_page_writeback(page);
802 else { 817 else {
803 unlock_page(page); 818 unlock_page(page);
804 goto keep_lumpy; 819 goto keep_lumpy;
805 } 820 }
806 } 821 }
807 822
808 references = page_check_references(page, sc); 823 references = page_check_references(page, sc);
809 switch (references) { 824 switch (references) {
810 case PAGEREF_ACTIVATE: 825 case PAGEREF_ACTIVATE:
811 goto activate_locked; 826 goto activate_locked;
812 case PAGEREF_KEEP: 827 case PAGEREF_KEEP:
813 goto keep_locked; 828 goto keep_locked;
814 case PAGEREF_RECLAIM: 829 case PAGEREF_RECLAIM:
815 case PAGEREF_RECLAIM_CLEAN: 830 case PAGEREF_RECLAIM_CLEAN:
816 ; /* try to reclaim the page below */ 831 ; /* try to reclaim the page below */
817 } 832 }
818 833
819 /* 834 /*
820 * Anonymous process memory has backing store? 835 * Anonymous process memory has backing store?
821 * Try to allocate it some swap space here. 836 * Try to allocate it some swap space here.
822 */ 837 */
823 if (PageAnon(page) && !PageSwapCache(page)) { 838 if (PageAnon(page) && !PageSwapCache(page)) {
824 if (!(sc->gfp_mask & __GFP_IO)) 839 if (!(sc->gfp_mask & __GFP_IO))
825 goto keep_locked; 840 goto keep_locked;
826 if (!add_to_swap(page)) 841 if (!add_to_swap(page))
827 goto activate_locked; 842 goto activate_locked;
828 may_enter_fs = 1; 843 may_enter_fs = 1;
829 } 844 }
830 845
831 mapping = page_mapping(page); 846 mapping = page_mapping(page);
832 847
833 /* 848 /*
834 * The page is mapped into the page tables of one or more 849 * The page is mapped into the page tables of one or more
835 * processes. Try to unmap it here. 850 * processes. Try to unmap it here.
836 */ 851 */
837 if (page_mapped(page) && mapping) { 852 if (page_mapped(page) && mapping) {
838 switch (try_to_unmap(page, TTU_UNMAP)) { 853 switch (try_to_unmap(page, TTU_UNMAP)) {
839 case SWAP_FAIL: 854 case SWAP_FAIL:
840 goto activate_locked; 855 goto activate_locked;
841 case SWAP_AGAIN: 856 case SWAP_AGAIN:
842 goto keep_locked; 857 goto keep_locked;
843 case SWAP_MLOCK: 858 case SWAP_MLOCK:
844 goto cull_mlocked; 859 goto cull_mlocked;
845 case SWAP_SUCCESS: 860 case SWAP_SUCCESS:
846 ; /* try to free the page below */ 861 ; /* try to free the page below */
847 } 862 }
848 } 863 }
849 864
850 if (PageDirty(page)) { 865 if (PageDirty(page)) {
851 nr_dirty++; 866 nr_dirty++;
852 867
853 if (references == PAGEREF_RECLAIM_CLEAN) 868 if (references == PAGEREF_RECLAIM_CLEAN)
854 goto keep_locked; 869 goto keep_locked;
855 if (!may_enter_fs) 870 if (!may_enter_fs)
856 goto keep_locked; 871 goto keep_locked;
857 if (!sc->may_writepage) 872 if (!sc->may_writepage)
858 goto keep_locked; 873 goto keep_locked;
859 874
860 /* Page is dirty, try to write it out here */ 875 /* Page is dirty, try to write it out here */
861 switch (pageout(page, mapping, sc)) { 876 switch (pageout(page, mapping, sc)) {
862 case PAGE_KEEP: 877 case PAGE_KEEP:
863 nr_congested++; 878 nr_congested++;
864 goto keep_locked; 879 goto keep_locked;
865 case PAGE_ACTIVATE: 880 case PAGE_ACTIVATE:
866 goto activate_locked; 881 goto activate_locked;
867 case PAGE_SUCCESS: 882 case PAGE_SUCCESS:
868 if (PageWriteback(page)) 883 if (PageWriteback(page))
869 goto keep_lumpy; 884 goto keep_lumpy;
870 if (PageDirty(page)) 885 if (PageDirty(page))
871 goto keep; 886 goto keep;
872 887
873 /* 888 /*
874 * A synchronous write - probably a ramdisk. Go 889 * A synchronous write - probably a ramdisk. Go
875 * ahead and try to reclaim the page. 890 * ahead and try to reclaim the page.
876 */ 891 */
877 if (!trylock_page(page)) 892 if (!trylock_page(page))
878 goto keep; 893 goto keep;
879 if (PageDirty(page) || PageWriteback(page)) 894 if (PageDirty(page) || PageWriteback(page))
880 goto keep_locked; 895 goto keep_locked;
881 mapping = page_mapping(page); 896 mapping = page_mapping(page);
882 case PAGE_CLEAN: 897 case PAGE_CLEAN:
883 ; /* try to free the page below */ 898 ; /* try to free the page below */
884 } 899 }
885 } 900 }
886 901
887 /* 902 /*
888 * If the page has buffers, try to free the buffer mappings 903 * If the page has buffers, try to free the buffer mappings
889 * associated with this page. If we succeed we try to free 904 * associated with this page. If we succeed we try to free
890 * the page as well. 905 * the page as well.
891 * 906 *
892 * We do this even if the page is PageDirty(). 907 * We do this even if the page is PageDirty().
893 * try_to_release_page() does not perform I/O, but it is 908 * try_to_release_page() does not perform I/O, but it is
894 * possible for a page to have PageDirty set, but it is actually 909 * possible for a page to have PageDirty set, but it is actually
895 * clean (all its buffers are clean). This happens if the 910 * clean (all its buffers are clean). This happens if the
896 * buffers were written out directly, with submit_bh(). ext3 911 * buffers were written out directly, with submit_bh(). ext3
897 * will do this, as well as the blockdev mapping. 912 * will do this, as well as the blockdev mapping.
898 * try_to_release_page() will discover that cleanness and will 913 * try_to_release_page() will discover that cleanness and will
899 * drop the buffers and mark the page clean - it can be freed. 914 * drop the buffers and mark the page clean - it can be freed.
900 * 915 *
901 * Rarely, pages can have buffers and no ->mapping. These are 916 * Rarely, pages can have buffers and no ->mapping. These are
902 * the pages which were not successfully invalidated in 917 * the pages which were not successfully invalidated in
903 * truncate_complete_page(). We try to drop those buffers here 918 * truncate_complete_page(). We try to drop those buffers here
904 * and if that worked, and the page is no longer mapped into 919 * and if that worked, and the page is no longer mapped into
905 * process address space (page_count == 1) it can be freed. 920 * process address space (page_count == 1) it can be freed.
906 * Otherwise, leave the page on the LRU so it is swappable. 921 * Otherwise, leave the page on the LRU so it is swappable.
907 */ 922 */
908 if (page_has_private(page)) { 923 if (page_has_private(page)) {
909 if (!try_to_release_page(page, sc->gfp_mask)) 924 if (!try_to_release_page(page, sc->gfp_mask))
910 goto activate_locked; 925 goto activate_locked;
911 if (!mapping && page_count(page) == 1) { 926 if (!mapping && page_count(page) == 1) {
912 unlock_page(page); 927 unlock_page(page);
913 if (put_page_testzero(page)) 928 if (put_page_testzero(page))
914 goto free_it; 929 goto free_it;
915 else { 930 else {
916 /* 931 /*
917 * rare race with speculative reference. 932 * rare race with speculative reference.
918 * the speculative reference will free 933 * the speculative reference will free
919 * this page shortly, so we may 934 * this page shortly, so we may
920 * increment nr_reclaimed here (and 935 * increment nr_reclaimed here (and
921 * leave it off the LRU). 936 * leave it off the LRU).
922 */ 937 */
923 nr_reclaimed++; 938 nr_reclaimed++;
924 continue; 939 continue;
925 } 940 }
926 } 941 }
927 } 942 }
928 943
929 if (!mapping || !__remove_mapping(mapping, page)) 944 if (!mapping || !__remove_mapping(mapping, page))
930 goto keep_locked; 945 goto keep_locked;
931 946
932 /* 947 /*
933 * At this point, we have no other references and there is 948 * At this point, we have no other references and there is
934 * no way to pick any more up (removed from LRU, removed 949 * no way to pick any more up (removed from LRU, removed
935 * from pagecache). Can use non-atomic bitops now (and 950 * from pagecache). Can use non-atomic bitops now (and
936 * we obviously don't have to worry about waking up a process 951 * we obviously don't have to worry about waking up a process
937 * waiting on the page lock, because there are no references. 952 * waiting on the page lock, because there are no references.
938 */ 953 */
939 __clear_page_locked(page); 954 __clear_page_locked(page);
940 free_it: 955 free_it:
941 nr_reclaimed++; 956 nr_reclaimed++;
942 957
943 /* 958 /*
944 * Is there a need to periodically call free_page_list()? It would 959 * Is there a need to periodically call free_page_list()? It would
945 * appear not, as the counts should be low 960 * appear not, as the counts should be low
946 */ 961 */
947 list_add(&page->lru, &free_pages); 962 list_add(&page->lru, &free_pages);
948 continue; 963 continue;
949 964
950 cull_mlocked: 965 cull_mlocked:
951 if (PageSwapCache(page)) 966 if (PageSwapCache(page))
952 try_to_free_swap(page); 967 try_to_free_swap(page);
953 unlock_page(page); 968 unlock_page(page);
954 putback_lru_page(page); 969 putback_lru_page(page);
955 reset_reclaim_mode(sc); 970 reset_reclaim_mode(sc);
956 continue; 971 continue;
957 972
958 activate_locked: 973 activate_locked:
959 /* Not a candidate for swapping, so reclaim swap space. */ 974 /* Not a candidate for swapping, so reclaim swap space. */
960 if (PageSwapCache(page) && vm_swap_full()) 975 if (PageSwapCache(page) && vm_swap_full())
961 try_to_free_swap(page); 976 try_to_free_swap(page);
962 VM_BUG_ON(PageActive(page)); 977 VM_BUG_ON(PageActive(page));
963 SetPageActive(page); 978 SetPageActive(page);
964 pgactivate++; 979 pgactivate++;
965 keep_locked: 980 keep_locked:
966 unlock_page(page); 981 unlock_page(page);
967 keep: 982 keep:
968 reset_reclaim_mode(sc); 983 reset_reclaim_mode(sc);
969 keep_lumpy: 984 keep_lumpy:
970 list_add(&page->lru, &ret_pages); 985 list_add(&page->lru, &ret_pages);
971 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 986 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
972 } 987 }
973 988
974 /* 989 /*
975 * Tag a zone as congested if all the dirty pages encountered were 990 * Tag a zone as congested if all the dirty pages encountered were
976 * backed by a congested BDI. In this case, reclaimers should just 991 * backed by a congested BDI. In this case, reclaimers should just
977 * back off and wait for congestion to clear because further reclaim 992 * back off and wait for congestion to clear because further reclaim
978 * will encounter the same problem 993 * will encounter the same problem
979 */ 994 */
980 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) 995 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
981 zone_set_flag(zone, ZONE_CONGESTED); 996 zone_set_flag(zone, ZONE_CONGESTED);
982 997
983 free_page_list(&free_pages); 998 free_page_list(&free_pages);
984 999
985 list_splice(&ret_pages, page_list); 1000 list_splice(&ret_pages, page_list);
986 count_vm_events(PGACTIVATE, pgactivate); 1001 count_vm_events(PGACTIVATE, pgactivate);
987 return nr_reclaimed; 1002 return nr_reclaimed;
988 } 1003 }
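The ZONE_CONGESTED tagging above only fires when at least one dirty page was encountered and every one of them sat on a congested BDI. A minimal user-space sketch of just that condition (the scanning_global_lru() part of the test is left out):

        #include <stdio.h>

        /* Reduced model of the congestion test at the end of shrink_page_list():
         * the zone is only tagged when at least one dirty page was seen and
         * every one of them was backed by a congested BDI. */
        static int should_tag_congested(unsigned long nr_dirty, unsigned long nr_congested)
        {
                return nr_dirty && nr_dirty == nr_congested;
        }

        int main(void)
        {
                printf("%d\n", should_tag_congested(0, 0));   /* 0: no dirty pages encountered */
                printf("%d\n", should_tag_congested(40, 25)); /* 0: some backing devices still writable */
                printf("%d\n", should_tag_congested(40, 40)); /* 1: all dirty pages congested - back off */
                return 0;
        }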
989 1004
990 /* 1005 /*
991 * Attempt to remove the specified page from its LRU. Only take this page 1006 * Attempt to remove the specified page from its LRU. Only take this page
992 * if it is of the appropriate PageActive status. Pages which are being 1007 * if it is of the appropriate PageActive status. Pages which are being
993 * freed elsewhere are also ignored. 1008 * freed elsewhere are also ignored.
994 * 1009 *
995 * page: page to consider 1010 * page: page to consider
996 * mode: one of the LRU isolation modes defined above 1011 * mode: one of the LRU isolation modes defined above
997 * 1012 *
998 * returns 0 on success, -ve errno on failure. 1013 * returns 0 on success, -ve errno on failure.
999 */ 1014 */
1000 int __isolate_lru_page(struct page *page, int mode, int file) 1015 int __isolate_lru_page(struct page *page, int mode, int file)
1001 { 1016 {
1002 int ret = -EINVAL; 1017 int ret = -EINVAL;
1003 1018
1004 /* Only take pages on the LRU. */ 1019 /* Only take pages on the LRU. */
1005 if (!PageLRU(page)) 1020 if (!PageLRU(page))
1006 return ret; 1021 return ret;
1007 1022
1008 /* 1023 /*
1009 * When checking the active state, we need to be sure we are 1024 * When checking the active state, we need to be sure we are
1010 * dealing with comparable boolean values. Take the logical not 1025 * dealing with comparable boolean values. Take the logical not
1011 * of each. 1026 * of each.
1012 */ 1027 */
1013 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 1028 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
1014 return ret; 1029 return ret;
1015 1030
1016 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) 1031 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
1017 return ret; 1032 return ret;
1018 1033
1019 /* 1034 /*
1020 * When this function is being called for lumpy reclaim, we 1035 * When this function is being called for lumpy reclaim, we
1021 * initially look into all LRU pages, active, inactive and 1036 * initially look into all LRU pages, active, inactive and
1022 * unevictable; only give shrink_page_list evictable pages. 1037 * unevictable; only give shrink_page_list evictable pages.
1023 */ 1038 */
1024 if (PageUnevictable(page)) 1039 if (PageUnevictable(page))
1025 return ret; 1040 return ret;
1026 1041
1027 ret = -EBUSY; 1042 ret = -EBUSY;
1028 1043
1029 if (likely(get_page_unless_zero(page))) { 1044 if (likely(get_page_unless_zero(page))) {
1030 /* 1045 /*
1031 * Be careful not to clear PageLRU until after we're 1046 * Be careful not to clear PageLRU until after we're
1032 * sure the page is not being freed elsewhere -- the 1047 * sure the page is not being freed elsewhere -- the
1033 * page release code relies on it. 1048 * page release code relies on it.
1034 */ 1049 */
1035 ClearPageLRU(page); 1050 ClearPageLRU(page);
1036 ret = 0; 1051 ret = 0;
1037 } 1052 }
1038 1053
1039 return ret; 1054 return ret;
1040 } 1055 }
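The "!PageActive(page) != !mode" test above collapses both operands to 0/1 before comparing, so an ISOLATE_ACTIVE or ISOLATE_INACTIVE request is matched against the page's active bit as a plain boolean. A stand-alone sketch of that normalization; the ISOLATE_* values are assumed here for illustration:

        #include <stdio.h>

        /* Illustrative stand-ins for the ISOLATE_* modes used above. */
        enum { ISOLATE_INACTIVE = 0, ISOLATE_ACTIVE = 1, ISOLATE_BOTH = 2 };

        /* Mirrors the active-state check in __isolate_lru_page(): both sides
         * are collapsed to 0/1 with '!' so the comparison is purely boolean. */
        static int active_state_matches(int page_active, int mode)
        {
                if (mode == ISOLATE_BOTH)
                        return 1;
                return !page_active == !mode;
        }

        int main(void)
        {
                printf("%d\n", active_state_matches(1, ISOLATE_ACTIVE));   /* 1: wanted active, page is active */
                printf("%d\n", active_state_matches(0, ISOLATE_ACTIVE));   /* 0: wanted active, page is inactive */
                printf("%d\n", active_state_matches(1, ISOLATE_INACTIVE)); /* 0: wanted inactive, page is active */
                printf("%d\n", active_state_matches(1, ISOLATE_BOTH));     /* 1: lumpy reclaim takes either */
                return 0;
        }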
1041 1056
1042 /* 1057 /*
1043 * zone->lru_lock is heavily contended. Some of the functions that 1058 * zone->lru_lock is heavily contended. Some of the functions that
1044 * shrink the lists perform better by taking out a batch of pages 1059 * shrink the lists perform better by taking out a batch of pages
1045 * and working on them outside the LRU lock. 1060 * and working on them outside the LRU lock.
1046 * 1061 *
1047 * For pagecache intensive workloads, this function is the hottest 1062 * For pagecache intensive workloads, this function is the hottest
1048 * spot in the kernel (apart from copy_*_user functions). 1063 * spot in the kernel (apart from copy_*_user functions).
1049 * 1064 *
1050 * Appropriate locks must be held before calling this function. 1065 * Appropriate locks must be held before calling this function.
1051 * 1066 *
1052 * @nr_to_scan: The number of pages to look through on the list. 1067 * @nr_to_scan: The number of pages to look through on the list.
1053 * @src: The LRU list to pull pages off. 1068 * @src: The LRU list to pull pages off.
1054 * @dst: The temp list to put pages on to. 1069 * @dst: The temp list to put pages on to.
1055 * @scanned: The number of pages that were scanned. 1070 * @scanned: The number of pages that were scanned.
1056 * @order: The caller's attempted allocation order 1071 * @order: The caller's attempted allocation order
1057 * @mode: One of the LRU isolation modes 1072 * @mode: One of the LRU isolation modes
1058 * @file: True [1] if isolating file [!anon] pages 1073 * @file: True [1] if isolating file [!anon] pages
1059 * 1074 *
1060 * returns how many pages were moved onto *@dst. 1075 * returns how many pages were moved onto *@dst.
1061 */ 1076 */
1062 static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1077 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1063 struct list_head *src, struct list_head *dst, 1078 struct list_head *src, struct list_head *dst,
1064 unsigned long *scanned, int order, int mode, int file) 1079 unsigned long *scanned, int order, int mode, int file)
1065 { 1080 {
1066 unsigned long nr_taken = 0; 1081 unsigned long nr_taken = 0;
1067 unsigned long nr_lumpy_taken = 0; 1082 unsigned long nr_lumpy_taken = 0;
1068 unsigned long nr_lumpy_dirty = 0; 1083 unsigned long nr_lumpy_dirty = 0;
1069 unsigned long nr_lumpy_failed = 0; 1084 unsigned long nr_lumpy_failed = 0;
1070 unsigned long scan; 1085 unsigned long scan;
1071 1086
1072 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1087 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1073 struct page *page; 1088 struct page *page;
1074 unsigned long pfn; 1089 unsigned long pfn;
1075 unsigned long end_pfn; 1090 unsigned long end_pfn;
1076 unsigned long page_pfn; 1091 unsigned long page_pfn;
1077 int zone_id; 1092 int zone_id;
1078 1093
1079 page = lru_to_page(src); 1094 page = lru_to_page(src);
1080 prefetchw_prev_lru_page(page, src, flags); 1095 prefetchw_prev_lru_page(page, src, flags);
1081 1096
1082 VM_BUG_ON(!PageLRU(page)); 1097 VM_BUG_ON(!PageLRU(page));
1083 1098
1084 switch (__isolate_lru_page(page, mode, file)) { 1099 switch (__isolate_lru_page(page, mode, file)) {
1085 case 0: 1100 case 0:
1086 list_move(&page->lru, dst); 1101 list_move(&page->lru, dst);
1087 mem_cgroup_del_lru(page); 1102 mem_cgroup_del_lru(page);
1088 nr_taken += hpage_nr_pages(page); 1103 nr_taken += hpage_nr_pages(page);
1089 break; 1104 break;
1090 1105
1091 case -EBUSY: 1106 case -EBUSY:
1092 /* else it is being freed elsewhere */ 1107 /* else it is being freed elsewhere */
1093 list_move(&page->lru, src); 1108 list_move(&page->lru, src);
1094 mem_cgroup_rotate_lru_list(page, page_lru(page)); 1109 mem_cgroup_rotate_lru_list(page, page_lru(page));
1095 continue; 1110 continue;
1096 1111
1097 default: 1112 default:
1098 BUG(); 1113 BUG();
1099 } 1114 }
1100 1115
1101 if (!order) 1116 if (!order)
1102 continue; 1117 continue;
1103 1118
1104 /* 1119 /*
1105 * Attempt to take all pages in the order aligned region 1120 * Attempt to take all pages in the order aligned region
1106 * surrounding the tag page. Only take those pages of 1121 * surrounding the tag page. Only take those pages of
1107 * the same active state as that tag page. We may safely 1122 * the same active state as that tag page. We may safely
1108 * round the target page pfn down to the requested order 1123 * round the target page pfn down to the requested order
1109 * as the mem_map is guaranteed valid out to MAX_ORDER, 1124 * as the mem_map is guaranteed valid out to MAX_ORDER,
1110 * and if a page there is in a different zone we will detect 1125 * and if a page there is in a different zone we will detect
1111 * it from its zone id and abort this block scan. 1126 * it from its zone id and abort this block scan.
1112 */ 1127 */
1113 zone_id = page_zone_id(page); 1128 zone_id = page_zone_id(page);
1114 page_pfn = page_to_pfn(page); 1129 page_pfn = page_to_pfn(page);
1115 pfn = page_pfn & ~((1 << order) - 1); 1130 pfn = page_pfn & ~((1 << order) - 1);
1116 end_pfn = pfn + (1 << order); 1131 end_pfn = pfn + (1 << order);
1117 for (; pfn < end_pfn; pfn++) { 1132 for (; pfn < end_pfn; pfn++) {
1118 struct page *cursor_page; 1133 struct page *cursor_page;
1119 1134
1120 /* The target page is in the block, ignore it. */ 1135 /* The target page is in the block, ignore it. */
1121 if (unlikely(pfn == page_pfn)) 1136 if (unlikely(pfn == page_pfn))
1122 continue; 1137 continue;
1123 1138
1124 /* Avoid holes within the zone. */ 1139 /* Avoid holes within the zone. */
1125 if (unlikely(!pfn_valid_within(pfn))) 1140 if (unlikely(!pfn_valid_within(pfn)))
1126 break; 1141 break;
1127 1142
1128 cursor_page = pfn_to_page(pfn); 1143 cursor_page = pfn_to_page(pfn);
1129 1144
1130 /* Check that we have not crossed a zone boundary. */ 1145 /* Check that we have not crossed a zone boundary. */
1131 if (unlikely(page_zone_id(cursor_page) != zone_id)) 1146 if (unlikely(page_zone_id(cursor_page) != zone_id))
1132 break; 1147 break;
1133 1148
1134 /* 1149 /*
1135 * If we don't have enough swap space, reclaiming of 1150 * If we don't have enough swap space, reclaiming of
1136 * anon pages which don't already have a swap slot is 1151 * anon pages which don't already have a swap slot is
1137 * pointless. 1152 * pointless.
1138 */ 1153 */
1139 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1154 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
1140 !PageSwapCache(cursor_page)) 1155 !PageSwapCache(cursor_page))
1141 break; 1156 break;
1142 1157
1143 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1158 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1144 list_move(&cursor_page->lru, dst); 1159 list_move(&cursor_page->lru, dst);
1145 mem_cgroup_del_lru(cursor_page); 1160 mem_cgroup_del_lru(cursor_page);
1146 nr_taken += hpage_nr_pages(page); 1161 nr_taken += hpage_nr_pages(page);
1147 nr_lumpy_taken++; 1162 nr_lumpy_taken++;
1148 if (PageDirty(cursor_page)) 1163 if (PageDirty(cursor_page))
1149 nr_lumpy_dirty++; 1164 nr_lumpy_dirty++;
1150 scan++; 1165 scan++;
1151 } else { 1166 } else {
1152 /* 1167 /*
1153 * Check if the page is freed already. 1168 * Check if the page is freed already.
1154 * 1169 *
1155 * We can't use page_count() as that 1170 * We can't use page_count() as that
1156 * requires compound_head and we don't 1171 * requires compound_head and we don't
1157 * have a pin on the page here. If a 1172 * have a pin on the page here. If a
1158 * page is tail, we may or may not 1173 * page is tail, we may or may not
1159 * have isolated the head, so assume 1174 * have isolated the head, so assume
1160 * it's not free, it'd be tricky to 1175 * it's not free, it'd be tricky to
1161 * track the head status without a 1176 * track the head status without a
1162 * page pin. 1177 * page pin.
1163 */ 1178 */
1164 if (!PageTail(cursor_page) && 1179 if (!PageTail(cursor_page) &&
1165 !atomic_read(&cursor_page->_count)) 1180 !atomic_read(&cursor_page->_count))
1166 continue; 1181 continue;
1167 break; 1182 break;
1168 } 1183 }
1169 } 1184 }
1170 1185
1171 /* If we break out of the loop above, lumpy reclaim failed */ 1186 /* If we break out of the loop above, lumpy reclaim failed */
1172 if (pfn < end_pfn) 1187 if (pfn < end_pfn)
1173 nr_lumpy_failed++; 1188 nr_lumpy_failed++;
1174 } 1189 }
1175 1190
1176 *scanned = scan; 1191 *scanned = scan;
1177 1192
1178 trace_mm_vmscan_lru_isolate(order, 1193 trace_mm_vmscan_lru_isolate(order,
1179 nr_to_scan, scan, 1194 nr_to_scan, scan,
1180 nr_taken, 1195 nr_taken,
1181 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, 1196 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1182 mode); 1197 mode);
1183 return nr_taken; 1198 return nr_taken;
1184 } 1199 }
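For the lumpy path, the block that gets scanned is simply the target pfn rounded down to an order-aligned boundary. The masking arithmetic from above, pulled out into a runnable sketch with arbitrary example values:

        #include <stdio.h>

        int main(void)
        {
                unsigned long page_pfn = 0x12345;       /* arbitrary target page frame */
                int order = 4;                          /* caller wants a 2^4 = 16 page block */

                /* Same rounding as isolate_lru_pages(): mask off the low 'order'
                 * bits to find the aligned block, then scan one block's worth. */
                unsigned long pfn = page_pfn & ~((1UL << order) - 1);
                unsigned long end_pfn = pfn + (1UL << order);

                /* Prints: scan pfns [0x12340, 0x12350) around 0x12345 */
                printf("scan pfns [0x%lx, 0x%lx) around 0x%lx\n", pfn, end_pfn, page_pfn);
                return 0;
        }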
1185 1200
1186 static unsigned long isolate_pages_global(unsigned long nr, 1201 static unsigned long isolate_pages_global(unsigned long nr,
1187 struct list_head *dst, 1202 struct list_head *dst,
1188 unsigned long *scanned, int order, 1203 unsigned long *scanned, int order,
1189 int mode, struct zone *z, 1204 int mode, struct zone *z,
1190 int active, int file) 1205 int active, int file)
1191 { 1206 {
1192 int lru = LRU_BASE; 1207 int lru = LRU_BASE;
1193 if (active) 1208 if (active)
1194 lru += LRU_ACTIVE; 1209 lru += LRU_ACTIVE;
1195 if (file) 1210 if (file)
1196 lru += LRU_FILE; 1211 lru += LRU_FILE;
1197 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, 1212 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
1198 mode, file); 1213 mode, file);
1199 } 1214 }
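isolate_pages_global() picks the source list by adding LRU_ACTIVE and/or LRU_FILE onto LRU_BASE, which indexes the four evictable LRU lists. A small sketch of that arithmetic; the enum values mirror what this era of mmzone.h defines but are assumptions here:

        #include <stdio.h>

        /* Assumed copies of the lru_list building blocks (see include/linux/mmzone.h). */
        enum { LRU_BASE = 0, LRU_ACTIVE = 1, LRU_FILE = 2 };

        static const char *lru_name[] = {
                "inactive anon", "active anon", "inactive file", "active file",
        };

        int main(void)
        {
                int active, file;

                for (file = 0; file <= 1; file++) {
                        for (active = 0; active <= 1; active++) {
                                int lru = LRU_BASE;
                                if (active)     /* same two additions as isolate_pages_global() */
                                        lru += LRU_ACTIVE;
                                if (file)
                                        lru += LRU_FILE;
                                printf("active=%d file=%d -> lru %d (%s)\n",
                                       active, file, lru, lru_name[lru]);
                        }
                }
                return 0;
        }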
1200 1215
1201 /* 1216 /*
1202 * clear_active_flags() is a helper for shrink_active_list(), clearing 1217 * clear_active_flags() is a helper for shrink_active_list(), clearing
1203 * any active bits from the pages in the list. 1218 * any active bits from the pages in the list.
1204 */ 1219 */
1205 static unsigned long clear_active_flags(struct list_head *page_list, 1220 static unsigned long clear_active_flags(struct list_head *page_list,
1206 unsigned int *count) 1221 unsigned int *count)
1207 { 1222 {
1208 int nr_active = 0; 1223 int nr_active = 0;
1209 int lru; 1224 int lru;
1210 struct page *page; 1225 struct page *page;
1211 1226
1212 list_for_each_entry(page, page_list, lru) { 1227 list_for_each_entry(page, page_list, lru) {
1213 int numpages = hpage_nr_pages(page); 1228 int numpages = hpage_nr_pages(page);
1214 lru = page_lru_base_type(page); 1229 lru = page_lru_base_type(page);
1215 if (PageActive(page)) { 1230 if (PageActive(page)) {
1216 lru += LRU_ACTIVE; 1231 lru += LRU_ACTIVE;
1217 ClearPageActive(page); 1232 ClearPageActive(page);
1218 nr_active += numpages; 1233 nr_active += numpages;
1219 } 1234 }
1220 if (count) 1235 if (count)
1221 count[lru] += numpages; 1236 count[lru] += numpages;
1222 } 1237 }
1223 1238
1224 return nr_active; 1239 return nr_active;
1225 } 1240 }
1226 1241
1227 /** 1242 /**
1228 * isolate_lru_page - tries to isolate a page from its LRU list 1243 * isolate_lru_page - tries to isolate a page from its LRU list
1229 * @page: page to isolate from its LRU list 1244 * @page: page to isolate from its LRU list
1230 * 1245 *
1231 * Isolates a @page from an LRU list, clears PageLRU and adjusts the 1246 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
1232 * vmstat statistic corresponding to whatever LRU list the page was on. 1247 * vmstat statistic corresponding to whatever LRU list the page was on.
1233 * 1248 *
1234 * Returns 0 if the page was removed from an LRU list. 1249 * Returns 0 if the page was removed from an LRU list.
1235 * Returns -EBUSY if the page was not on an LRU list. 1250 * Returns -EBUSY if the page was not on an LRU list.
1236 * 1251 *
1237 * The returned page will have PageLRU() cleared. If it was found on 1252 * The returned page will have PageLRU() cleared. If it was found on
1238 * the active list, it will have PageActive set. If it was found on 1253 * the active list, it will have PageActive set. If it was found on
1239 * the unevictable list, it will have the PageUnevictable bit set. That flag 1254 * the unevictable list, it will have the PageUnevictable bit set. That flag
1240 * may need to be cleared by the caller before letting the page go. 1255 * may need to be cleared by the caller before letting the page go.
1241 * 1256 *
1242 * The vmstat statistic corresponding to the list on which the page was 1257 * The vmstat statistic corresponding to the list on which the page was
1243 * found will be decremented. 1258 * found will be decremented.
1244 * 1259 *
1245 * Restrictions: 1260 * Restrictions:
1246 * (1) Must be called with an elevated refcount on the page. This is a 1261 * (1) Must be called with an elevated refcount on the page. This is a
1247 * fundamental difference from isolate_lru_pages (which is called 1262 * fundamental difference from isolate_lru_pages (which is called
1248 * without a stable reference). 1263 * without a stable reference).
1249 * (2) the lru_lock must not be held. 1264 * (2) the lru_lock must not be held.
1250 * (3) interrupts must be enabled. 1265 * (3) interrupts must be enabled.
1251 */ 1266 */
1252 int isolate_lru_page(struct page *page) 1267 int isolate_lru_page(struct page *page)
1253 { 1268 {
1254 int ret = -EBUSY; 1269 int ret = -EBUSY;
1255 1270
1256 VM_BUG_ON(!page_count(page)); 1271 VM_BUG_ON(!page_count(page));
1257 1272
1258 if (PageLRU(page)) { 1273 if (PageLRU(page)) {
1259 struct zone *zone = page_zone(page); 1274 struct zone *zone = page_zone(page);
1260 1275
1261 spin_lock_irq(&zone->lru_lock); 1276 spin_lock_irq(&zone->lru_lock);
1262 if (PageLRU(page)) { 1277 if (PageLRU(page)) {
1263 int lru = page_lru(page); 1278 int lru = page_lru(page);
1264 ret = 0; 1279 ret = 0;
1265 get_page(page); 1280 get_page(page);
1266 ClearPageLRU(page); 1281 ClearPageLRU(page);
1267 1282
1268 del_page_from_lru_list(zone, page, lru); 1283 del_page_from_lru_list(zone, page, lru);
1269 } 1284 }
1270 spin_unlock_irq(&zone->lru_lock); 1285 spin_unlock_irq(&zone->lru_lock);
1271 } 1286 }
1272 return ret; 1287 return ret;
1273 } 1288 }
1274 1289
1275 /* 1290 /*
1276 * Are there way too many processes in the direct reclaim path already? 1291 * Are there way too many processes in the direct reclaim path already?
1277 */ 1292 */
1278 static int too_many_isolated(struct zone *zone, int file, 1293 static int too_many_isolated(struct zone *zone, int file,
1279 struct scan_control *sc) 1294 struct scan_control *sc)
1280 { 1295 {
1281 unsigned long inactive, isolated; 1296 unsigned long inactive, isolated;
1282 1297
1283 if (current_is_kswapd()) 1298 if (current_is_kswapd())
1284 return 0; 1299 return 0;
1285 1300
1286 if (!scanning_global_lru(sc)) 1301 if (!scanning_global_lru(sc))
1287 return 0; 1302 return 0;
1288 1303
1289 if (file) { 1304 if (file) {
1290 inactive = zone_page_state(zone, NR_INACTIVE_FILE); 1305 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1291 isolated = zone_page_state(zone, NR_ISOLATED_FILE); 1306 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1292 } else { 1307 } else {
1293 inactive = zone_page_state(zone, NR_INACTIVE_ANON); 1308 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1294 isolated = zone_page_state(zone, NR_ISOLATED_ANON); 1309 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1295 } 1310 }
1296 1311
1297 return isolated > inactive; 1312 return isolated > inactive;
1298 } 1313 }
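too_many_isolated() throttles direct reclaimers once more pages sit isolated off an LRU list than remain on the corresponding inactive list; kswapd and memcg reclaim are exempt. The core comparison as a stand-alone sketch with made-up numbers:

        #include <stdio.h>

        /* Core of too_many_isolated(): back off once the pages other reclaimers
         * have temporarily pulled off the list outnumber what is left on it. */
        static int too_many_isolated(unsigned long inactive, unsigned long isolated)
        {
                return isolated > inactive;
        }

        int main(void)
        {
                printf("%d\n", too_many_isolated(10000, 200)); /* 0: keep reclaiming */
                printf("%d\n", too_many_isolated(150, 200));   /* 1: congestion_wait() and retry */
                return 0;
        }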
1299 1314
1300 /* 1315 /*
1301 * TODO: Try merging with migrations version of putback_lru_pages 1316 * TODO: Try merging with migrations version of putback_lru_pages
1302 */ 1317 */
1303 static noinline_for_stack void 1318 static noinline_for_stack void
1304 putback_lru_pages(struct zone *zone, struct scan_control *sc, 1319 putback_lru_pages(struct zone *zone, struct scan_control *sc,
1305 unsigned long nr_anon, unsigned long nr_file, 1320 unsigned long nr_anon, unsigned long nr_file,
1306 struct list_head *page_list) 1321 struct list_head *page_list)
1307 { 1322 {
1308 struct page *page; 1323 struct page *page;
1309 struct pagevec pvec; 1324 struct pagevec pvec;
1310 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1325 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1311 1326
1312 pagevec_init(&pvec, 1); 1327 pagevec_init(&pvec, 1);
1313 1328
1314 /* 1329 /*
1315 * Put back any unfreeable pages. 1330 * Put back any unfreeable pages.
1316 */ 1331 */
1317 spin_lock(&zone->lru_lock); 1332 spin_lock(&zone->lru_lock);
1318 while (!list_empty(page_list)) { 1333 while (!list_empty(page_list)) {
1319 int lru; 1334 int lru;
1320 page = lru_to_page(page_list); 1335 page = lru_to_page(page_list);
1321 VM_BUG_ON(PageLRU(page)); 1336 VM_BUG_ON(PageLRU(page));
1322 list_del(&page->lru); 1337 list_del(&page->lru);
1323 if (unlikely(!page_evictable(page, NULL))) { 1338 if (unlikely(!page_evictable(page, NULL))) {
1324 spin_unlock_irq(&zone->lru_lock); 1339 spin_unlock_irq(&zone->lru_lock);
1325 putback_lru_page(page); 1340 putback_lru_page(page);
1326 spin_lock_irq(&zone->lru_lock); 1341 spin_lock_irq(&zone->lru_lock);
1327 continue; 1342 continue;
1328 } 1343 }
1329 SetPageLRU(page); 1344 SetPageLRU(page);
1330 lru = page_lru(page); 1345 lru = page_lru(page);
1331 add_page_to_lru_list(zone, page, lru); 1346 add_page_to_lru_list(zone, page, lru);
1332 if (is_active_lru(lru)) { 1347 if (is_active_lru(lru)) {
1333 int file = is_file_lru(lru); 1348 int file = is_file_lru(lru);
1334 int numpages = hpage_nr_pages(page); 1349 int numpages = hpage_nr_pages(page);
1335 reclaim_stat->recent_rotated[file] += numpages; 1350 reclaim_stat->recent_rotated[file] += numpages;
1336 } 1351 }
1337 if (!pagevec_add(&pvec, page)) { 1352 if (!pagevec_add(&pvec, page)) {
1338 spin_unlock_irq(&zone->lru_lock); 1353 spin_unlock_irq(&zone->lru_lock);
1339 __pagevec_release(&pvec); 1354 __pagevec_release(&pvec);
1340 spin_lock_irq(&zone->lru_lock); 1355 spin_lock_irq(&zone->lru_lock);
1341 } 1356 }
1342 } 1357 }
1343 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); 1358 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1344 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); 1359 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1345 1360
1346 spin_unlock_irq(&zone->lru_lock); 1361 spin_unlock_irq(&zone->lru_lock);
1347 pagevec_release(&pvec); 1362 pagevec_release(&pvec);
1348 } 1363 }
1349 1364
1350 static noinline_for_stack void update_isolated_counts(struct zone *zone, 1365 static noinline_for_stack void update_isolated_counts(struct zone *zone,
1351 struct scan_control *sc, 1366 struct scan_control *sc,
1352 unsigned long *nr_anon, 1367 unsigned long *nr_anon,
1353 unsigned long *nr_file, 1368 unsigned long *nr_file,
1354 struct list_head *isolated_list) 1369 struct list_head *isolated_list)
1355 { 1370 {
1356 unsigned long nr_active; 1371 unsigned long nr_active;
1357 unsigned int count[NR_LRU_LISTS] = { 0, }; 1372 unsigned int count[NR_LRU_LISTS] = { 0, };
1358 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1373 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1359 1374
1360 nr_active = clear_active_flags(isolated_list, count); 1375 nr_active = clear_active_flags(isolated_list, count);
1361 __count_vm_events(PGDEACTIVATE, nr_active); 1376 __count_vm_events(PGDEACTIVATE, nr_active);
1362 1377
1363 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1378 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1364 -count[LRU_ACTIVE_FILE]); 1379 -count[LRU_ACTIVE_FILE]);
1365 __mod_zone_page_state(zone, NR_INACTIVE_FILE, 1380 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1366 -count[LRU_INACTIVE_FILE]); 1381 -count[LRU_INACTIVE_FILE]);
1367 __mod_zone_page_state(zone, NR_ACTIVE_ANON, 1382 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1368 -count[LRU_ACTIVE_ANON]); 1383 -count[LRU_ACTIVE_ANON]);
1369 __mod_zone_page_state(zone, NR_INACTIVE_ANON, 1384 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1370 -count[LRU_INACTIVE_ANON]); 1385 -count[LRU_INACTIVE_ANON]);
1371 1386
1372 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1387 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1373 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1388 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1374 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); 1389 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1375 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); 1390 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1376 1391
1377 reclaim_stat->recent_scanned[0] += *nr_anon; 1392 reclaim_stat->recent_scanned[0] += *nr_anon;
1378 reclaim_stat->recent_scanned[1] += *nr_file; 1393 reclaim_stat->recent_scanned[1] += *nr_file;
1379 } 1394 }
1380 1395
1381 /* 1396 /*
1382 * Returns true if the caller should wait to clean dirty/writeback pages. 1397 * Returns true if the caller should wait to clean dirty/writeback pages.
1383 * 1398 *
1384 * If we are direct reclaiming for contiguous pages and we do not reclaim 1399 * If we are direct reclaiming for contiguous pages and we do not reclaim
1385 * everything in the list, try again and wait for writeback IO to complete. 1400 * everything in the list, try again and wait for writeback IO to complete.
1386 * This will stall high-order allocations noticeably. Only do that when we really 1401 * This will stall high-order allocations noticeably. Only do that when we really
1387 * need to free the pages under high memory pressure. 1402 * need to free the pages under high memory pressure.
1388 */ 1403 */
1389 static inline bool should_reclaim_stall(unsigned long nr_taken, 1404 static inline bool should_reclaim_stall(unsigned long nr_taken,
1390 unsigned long nr_freed, 1405 unsigned long nr_freed,
1391 int priority, 1406 int priority,
1392 struct scan_control *sc) 1407 struct scan_control *sc)
1393 { 1408 {
1394 int lumpy_stall_priority; 1409 int lumpy_stall_priority;
1395 1410
1396 /* kswapd should not stall on sync IO */ 1411 /* kswapd should not stall on sync IO */
1397 if (current_is_kswapd()) 1412 if (current_is_kswapd())
1398 return false; 1413 return false;
1399 1414
1400 /* Only stall on lumpy reclaim */ 1415 /* Only stall on lumpy reclaim */
1401 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) 1416 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1402 return false; 1417 return false;
1403 1418
1404 /* If we have reclaimed everything on the isolated list, no stall */ 1419 /* If we have reclaimed everything on the isolated list, no stall */
1405 if (nr_freed == nr_taken) 1420 if (nr_freed == nr_taken)
1406 return false; 1421 return false;
1407 1422
1408 /* 1423 /*
1409 * For high-order allocations, there are two stall thresholds. 1424 * For high-order allocations, there are two stall thresholds.
1410 * High-cost allocations stall immediately whereas lower 1425 * High-cost allocations stall immediately whereas lower
1411 * order allocations such as stacks require the scanning 1426 * order allocations such as stacks require the scanning
1412 * priority to be much higher before stalling. 1427 * priority to be much higher before stalling.
1413 */ 1428 */
1414 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 1429 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1415 lumpy_stall_priority = DEF_PRIORITY; 1430 lumpy_stall_priority = DEF_PRIORITY;
1416 else 1431 else
1417 lumpy_stall_priority = DEF_PRIORITY / 3; 1432 lumpy_stall_priority = DEF_PRIORITY / 3;
1418 1433
1419 return priority <= lumpy_stall_priority; 1434 return priority <= lumpy_stall_priority;
1420 } 1435 }
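With the usual DEF_PRIORITY of 12 and PAGE_ALLOC_COSTLY_ORDER of 3 (values assumed here for illustration), the threshold above means an order-4 or larger allocation stalls for writeback on the very first pass, while a cheaper high-order allocation such as a stack only stalls once the scan priority has fallen to 4 or below. A reduced sketch of just that arithmetic:

        #include <stdbool.h>
        #include <stdio.h>

        /* Assumed to match the kernel's values at this point in time. */
        #define DEF_PRIORITY            12
        #define PAGE_ALLOC_COSTLY_ORDER 3

        /* Only models the priority threshold from should_reclaim_stall();
         * the kswapd, reclaim-mode and "freed everything" early exits are
         * deliberately left out. */
        static bool past_stall_threshold(int order, int priority)
        {
                int lumpy_stall_priority;

                if (order > PAGE_ALLOC_COSTLY_ORDER)
                        lumpy_stall_priority = DEF_PRIORITY;
                else
                        lumpy_stall_priority = DEF_PRIORITY / 3;

                return priority <= lumpy_stall_priority;
        }

        int main(void)
        {
                printf("%d\n", past_stall_threshold(5, DEF_PRIORITY)); /* 1: costly order stalls immediately */
                printf("%d\n", past_stall_threshold(2, 10));           /* 0: cheap order, not desperate yet */
                printf("%d\n", past_stall_threshold(2, 4));            /* 1: cheap order, priority now low */
                return 0;
        }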
1421 1436
1422 /* 1437 /*
1423 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1438 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1424 * of reclaimed pages 1439 * of reclaimed pages
1425 */ 1440 */
1426 static noinline_for_stack unsigned long 1441 static noinline_for_stack unsigned long
1427 shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, 1442 shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1428 struct scan_control *sc, int priority, int file) 1443 struct scan_control *sc, int priority, int file)
1429 { 1444 {
1430 LIST_HEAD(page_list); 1445 LIST_HEAD(page_list);
1431 unsigned long nr_scanned; 1446 unsigned long nr_scanned;
1432 unsigned long nr_reclaimed = 0; 1447 unsigned long nr_reclaimed = 0;
1433 unsigned long nr_taken; 1448 unsigned long nr_taken;
1434 unsigned long nr_anon; 1449 unsigned long nr_anon;
1435 unsigned long nr_file; 1450 unsigned long nr_file;
1436 1451
1437 while (unlikely(too_many_isolated(zone, file, sc))) { 1452 while (unlikely(too_many_isolated(zone, file, sc))) {
1438 congestion_wait(BLK_RW_ASYNC, HZ/10); 1453 congestion_wait(BLK_RW_ASYNC, HZ/10);
1439 1454
1440 /* We are about to die and free our memory. Return now. */ 1455 /* We are about to die and free our memory. Return now. */
1441 if (fatal_signal_pending(current)) 1456 if (fatal_signal_pending(current))
1442 return SWAP_CLUSTER_MAX; 1457 return SWAP_CLUSTER_MAX;
1443 } 1458 }
1444 1459
1445 set_reclaim_mode(priority, sc, false); 1460 set_reclaim_mode(priority, sc, false);
1446 lru_add_drain(); 1461 lru_add_drain();
1447 spin_lock_irq(&zone->lru_lock); 1462 spin_lock_irq(&zone->lru_lock);
1448 1463
1449 if (scanning_global_lru(sc)) { 1464 if (scanning_global_lru(sc)) {
1450 nr_taken = isolate_pages_global(nr_to_scan, 1465 nr_taken = isolate_pages_global(nr_to_scan,
1451 &page_list, &nr_scanned, sc->order, 1466 &page_list, &nr_scanned, sc->order,
1452 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 1467 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1453 ISOLATE_BOTH : ISOLATE_INACTIVE, 1468 ISOLATE_BOTH : ISOLATE_INACTIVE,
1454 zone, 0, file); 1469 zone, 0, file);
1455 zone->pages_scanned += nr_scanned; 1470 zone->pages_scanned += nr_scanned;
1456 if (current_is_kswapd()) 1471 if (current_is_kswapd())
1457 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1472 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1458 nr_scanned); 1473 nr_scanned);
1459 else 1474 else
1460 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1475 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1461 nr_scanned); 1476 nr_scanned);
1462 } else { 1477 } else {
1463 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1478 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1464 &page_list, &nr_scanned, sc->order, 1479 &page_list, &nr_scanned, sc->order,
1465 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 1480 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1466 ISOLATE_BOTH : ISOLATE_INACTIVE, 1481 ISOLATE_BOTH : ISOLATE_INACTIVE,
1467 zone, sc->mem_cgroup, 1482 zone, sc->mem_cgroup,
1468 0, file); 1483 0, file);
1469 /* 1484 /*
1470 * mem_cgroup_isolate_pages() keeps track of 1485 * mem_cgroup_isolate_pages() keeps track of
1471 * scanned pages on its own. 1486 * scanned pages on its own.
1472 */ 1487 */
1473 } 1488 }
1474 1489
1475 if (nr_taken == 0) { 1490 if (nr_taken == 0) {
1476 spin_unlock_irq(&zone->lru_lock); 1491 spin_unlock_irq(&zone->lru_lock);
1477 return 0; 1492 return 0;
1478 } 1493 }
1479 1494
1480 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); 1495 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
1481 1496
1482 spin_unlock_irq(&zone->lru_lock); 1497 spin_unlock_irq(&zone->lru_lock);
1483 1498
1484 nr_reclaimed = shrink_page_list(&page_list, zone, sc); 1499 nr_reclaimed = shrink_page_list(&page_list, zone, sc);
1485 1500
1486 /* Check if we should synchronously wait for writeback */ 1501 /* Check if we should synchronously wait for writeback */
1487 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1502 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1488 set_reclaim_mode(priority, sc, true); 1503 set_reclaim_mode(priority, sc, true);
1489 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1504 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1490 } 1505 }
1491 1506
1492 local_irq_disable(); 1507 local_irq_disable();
1493 if (current_is_kswapd()) 1508 if (current_is_kswapd())
1494 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1509 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1495 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1510 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1496 1511
1497 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1512 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1498 1513
1499 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1514 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1500 zone_idx(zone), 1515 zone_idx(zone),
1501 nr_scanned, nr_reclaimed, 1516 nr_scanned, nr_reclaimed,
1502 priority, 1517 priority,
1503 trace_shrink_flags(file, sc->reclaim_mode)); 1518 trace_shrink_flags(file, sc->reclaim_mode));
1504 return nr_reclaimed; 1519 return nr_reclaimed;
1505 } 1520 }
1506 1521
1507 /* 1522 /*
1508 * This moves pages from the active list to the inactive list. 1523 * This moves pages from the active list to the inactive list.
1509 * 1524 *
1510 * We move them the other way if the page is referenced by one or more 1525 * We move them the other way if the page is referenced by one or more
1511 * processes, from rmap. 1526 * processes, from rmap.
1512 * 1527 *
1513 * If the pages are mostly unmapped, the processing is fast and it is 1528 * If the pages are mostly unmapped, the processing is fast and it is
1514 * appropriate to hold zone->lru_lock across the whole operation. But if 1529 * appropriate to hold zone->lru_lock across the whole operation. But if
1515 * the pages are mapped, the processing is slow (page_referenced()) so we 1530 * the pages are mapped, the processing is slow (page_referenced()) so we
1516 * should drop zone->lru_lock around each page. It's impossible to balance 1531 * should drop zone->lru_lock around each page. It's impossible to balance
1517 * this, so instead we remove the pages from the LRU while processing them. 1532 * this, so instead we remove the pages from the LRU while processing them.
1518 * It is safe to rely on PG_active against the non-LRU pages in here because 1533 * It is safe to rely on PG_active against the non-LRU pages in here because
1519 * nobody will play with that bit on a non-LRU page. 1534 * nobody will play with that bit on a non-LRU page.
1520 * 1535 *
1521 * The downside is that we have to touch page->_count against each page. 1536 * The downside is that we have to touch page->_count against each page.
1522 * But we had to alter page->flags anyway. 1537 * But we had to alter page->flags anyway.
1523 */ 1538 */
1524 1539
1525 static void move_active_pages_to_lru(struct zone *zone, 1540 static void move_active_pages_to_lru(struct zone *zone,
1526 struct list_head *list, 1541 struct list_head *list,
1527 enum lru_list lru) 1542 enum lru_list lru)
1528 { 1543 {
1529 unsigned long pgmoved = 0; 1544 unsigned long pgmoved = 0;
1530 struct pagevec pvec; 1545 struct pagevec pvec;
1531 struct page *page; 1546 struct page *page;
1532 1547
1533 pagevec_init(&pvec, 1); 1548 pagevec_init(&pvec, 1);
1534 1549
1535 while (!list_empty(list)) { 1550 while (!list_empty(list)) {
1536 page = lru_to_page(list); 1551 page = lru_to_page(list);
1537 1552
1538 VM_BUG_ON(PageLRU(page)); 1553 VM_BUG_ON(PageLRU(page));
1539 SetPageLRU(page); 1554 SetPageLRU(page);
1540 1555
1541 list_move(&page->lru, &zone->lru[lru].list); 1556 list_move(&page->lru, &zone->lru[lru].list);
1542 mem_cgroup_add_lru_list(page, lru); 1557 mem_cgroup_add_lru_list(page, lru);
1543 pgmoved += hpage_nr_pages(page); 1558 pgmoved += hpage_nr_pages(page);
1544 1559
1545 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1560 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1546 spin_unlock_irq(&zone->lru_lock); 1561 spin_unlock_irq(&zone->lru_lock);
1547 if (buffer_heads_over_limit) 1562 if (buffer_heads_over_limit)
1548 pagevec_strip(&pvec); 1563 pagevec_strip(&pvec);
1549 __pagevec_release(&pvec); 1564 __pagevec_release(&pvec);
1550 spin_lock_irq(&zone->lru_lock); 1565 spin_lock_irq(&zone->lru_lock);
1551 } 1566 }
1552 } 1567 }
1553 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1568 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1554 if (!is_active_lru(lru)) 1569 if (!is_active_lru(lru))
1555 __count_vm_events(PGDEACTIVATE, pgmoved); 1570 __count_vm_events(PGDEACTIVATE, pgmoved);
1556 } 1571 }
1557 1572
1558 static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1573 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1559 struct scan_control *sc, int priority, int file) 1574 struct scan_control *sc, int priority, int file)
1560 { 1575 {
1561 unsigned long nr_taken; 1576 unsigned long nr_taken;
1562 unsigned long pgscanned; 1577 unsigned long pgscanned;
1563 unsigned long vm_flags; 1578 unsigned long vm_flags;
1564 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1579 LIST_HEAD(l_hold); /* The pages which were snipped off */
1565 LIST_HEAD(l_active); 1580 LIST_HEAD(l_active);
1566 LIST_HEAD(l_inactive); 1581 LIST_HEAD(l_inactive);
1567 struct page *page; 1582 struct page *page;
1568 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1583 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1569 unsigned long nr_rotated = 0; 1584 unsigned long nr_rotated = 0;
1570 1585
1571 lru_add_drain(); 1586 lru_add_drain();
1572 spin_lock_irq(&zone->lru_lock); 1587 spin_lock_irq(&zone->lru_lock);
1573 if (scanning_global_lru(sc)) { 1588 if (scanning_global_lru(sc)) {
1574 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1589 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1575 &pgscanned, sc->order, 1590 &pgscanned, sc->order,
1576 ISOLATE_ACTIVE, zone, 1591 ISOLATE_ACTIVE, zone,
1577 1, file); 1592 1, file);
1578 zone->pages_scanned += pgscanned; 1593 zone->pages_scanned += pgscanned;
1579 } else { 1594 } else {
1580 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, 1595 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1581 &pgscanned, sc->order, 1596 &pgscanned, sc->order,
1582 ISOLATE_ACTIVE, zone, 1597 ISOLATE_ACTIVE, zone,
1583 sc->mem_cgroup, 1, file); 1598 sc->mem_cgroup, 1, file);
1584 /* 1599 /*
1585 * mem_cgroup_isolate_pages() keeps track of 1600 * mem_cgroup_isolate_pages() keeps track of
1586 * scanned pages on its own. 1601 * scanned pages on its own.
1587 */ 1602 */
1588 } 1603 }
1589 1604
1590 reclaim_stat->recent_scanned[file] += nr_taken; 1605 reclaim_stat->recent_scanned[file] += nr_taken;
1591 1606
1592 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1607 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1593 if (file) 1608 if (file)
1594 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); 1609 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1595 else 1610 else
1596 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); 1611 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1597 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1612 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1598 spin_unlock_irq(&zone->lru_lock); 1613 spin_unlock_irq(&zone->lru_lock);
1599 1614
1600 while (!list_empty(&l_hold)) { 1615 while (!list_empty(&l_hold)) {
1601 cond_resched(); 1616 cond_resched();
1602 page = lru_to_page(&l_hold); 1617 page = lru_to_page(&l_hold);
1603 list_del(&page->lru); 1618 list_del(&page->lru);
1604 1619
1605 if (unlikely(!page_evictable(page, NULL))) { 1620 if (unlikely(!page_evictable(page, NULL))) {
1606 putback_lru_page(page); 1621 putback_lru_page(page);
1607 continue; 1622 continue;
1608 } 1623 }
1609 1624
1610 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1625 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1611 nr_rotated += hpage_nr_pages(page); 1626 nr_rotated += hpage_nr_pages(page);
1612 /* 1627 /*
1613 * Identify referenced, file-backed active pages and 1628 * Identify referenced, file-backed active pages and
1614 * give them one more trip around the active list. So 1629 * give them one more trip around the active list. So
1615 * that executable code gets a better chance to stay in 1630 * that executable code gets a better chance to stay in
1616 * memory under moderate memory pressure. Anon pages 1631 * memory under moderate memory pressure. Anon pages
1617 * are not likely to be evicted by use-once streaming 1632 * are not likely to be evicted by use-once streaming
1618 * IO, plus JVM can create lots of anon VM_EXEC pages, 1633 * IO, plus JVM can create lots of anon VM_EXEC pages,
1619 * so we ignore them here. 1634 * so we ignore them here.
1620 */ 1635 */
1621 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { 1636 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1622 list_add(&page->lru, &l_active); 1637 list_add(&page->lru, &l_active);
1623 continue; 1638 continue;
1624 } 1639 }
1625 } 1640 }
1626 1641
1627 ClearPageActive(page); /* we are de-activating */ 1642 ClearPageActive(page); /* we are de-activating */
1628 list_add(&page->lru, &l_inactive); 1643 list_add(&page->lru, &l_inactive);
1629 } 1644 }
1630 1645
1631 /* 1646 /*
1632 * Move pages back to the lru list. 1647 * Move pages back to the lru list.
1633 */ 1648 */
1634 spin_lock_irq(&zone->lru_lock); 1649 spin_lock_irq(&zone->lru_lock);
1635 /* 1650 /*
1636 * Count referenced pages from currently used mappings as rotated, 1651 * Count referenced pages from currently used mappings as rotated,
1637 * even though only some of them are actually re-activated. This 1652 * even though only some of them are actually re-activated. This
1638 * helps balance scan pressure between file and anonymous pages in 1653 * helps balance scan pressure between file and anonymous pages in
1639 * get_scan_ratio. 1654 * get_scan_ratio.
1640 */ 1655 */
1641 reclaim_stat->recent_rotated[file] += nr_rotated; 1656 reclaim_stat->recent_rotated[file] += nr_rotated;
1642 1657
1643 move_active_pages_to_lru(zone, &l_active, 1658 move_active_pages_to_lru(zone, &l_active,
1644 LRU_ACTIVE + file * LRU_FILE); 1659 LRU_ACTIVE + file * LRU_FILE);
1645 move_active_pages_to_lru(zone, &l_inactive, 1660 move_active_pages_to_lru(zone, &l_inactive,
1646 LRU_BASE + file * LRU_FILE); 1661 LRU_BASE + file * LRU_FILE);
1647 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1662 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1648 spin_unlock_irq(&zone->lru_lock); 1663 spin_unlock_irq(&zone->lru_lock);
1649 } 1664 }
1650 1665
1651 #ifdef CONFIG_SWAP 1666 #ifdef CONFIG_SWAP
1652 static int inactive_anon_is_low_global(struct zone *zone) 1667 static int inactive_anon_is_low_global(struct zone *zone)
1653 { 1668 {
1654 unsigned long active, inactive; 1669 unsigned long active, inactive;
1655 1670
1656 active = zone_page_state(zone, NR_ACTIVE_ANON); 1671 active = zone_page_state(zone, NR_ACTIVE_ANON);
1657 inactive = zone_page_state(zone, NR_INACTIVE_ANON); 1672 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1658 1673
1659 if (inactive * zone->inactive_ratio < active) 1674 if (inactive * zone->inactive_ratio < active)
1660 return 1; 1675 return 1;
1661 1676
1662 return 0; 1677 return 0;
1663 } 1678 }
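inactive_anon_is_low_global() weighs the two anon lists through zone->inactive_ratio, so a ratio of 3 (roughly what a 1 GB zone ends up with) starts deactivating active anon pages once less than about a quarter of the anon pages are inactive. The comparison as a stand-alone sketch:

        #include <stdio.h>

        /* Same comparison as inactive_anon_is_low_global(), with the ratio
         * passed in instead of read from the zone. */
        static int inactive_anon_is_low(unsigned long active, unsigned long inactive,
                                        unsigned int inactive_ratio)
        {
                return inactive * inactive_ratio < active;
        }

        int main(void)
        {
                printf("%d\n", inactive_anon_is_low(300000, 120000, 3)); /* 0: 360000 >= 300000 */
                printf("%d\n", inactive_anon_is_low(300000,  90000, 3)); /* 1: 270000 <  300000 */
                return 0;
        }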
1664 1679
1665 /** 1680 /**
1666 * inactive_anon_is_low - check if anonymous pages need to be deactivated 1681 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1667 * @zone: zone to check 1682 * @zone: zone to check
1668 * @sc: scan control of this context 1683 * @sc: scan control of this context
1669 * 1684 *
1670 * Returns true if the zone does not have enough inactive anon pages, 1685 * Returns true if the zone does not have enough inactive anon pages,
1671 * meaning some active anon pages need to be deactivated. 1686 * meaning some active anon pages need to be deactivated.
1672 */ 1687 */
1673 static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) 1688 static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1674 { 1689 {
1675 int low; 1690 int low;
1676 1691
1677 /* 1692 /*
1678 * If we don't have swap space, anonymous page deactivation 1693 * If we don't have swap space, anonymous page deactivation
1679 * is pointless. 1694 * is pointless.
1680 */ 1695 */
1681 if (!total_swap_pages) 1696 if (!total_swap_pages)
1682 return 0; 1697 return 0;
1683 1698
1684 if (scanning_global_lru(sc)) 1699 if (scanning_global_lru(sc))
1685 low = inactive_anon_is_low_global(zone); 1700 low = inactive_anon_is_low_global(zone);
1686 else 1701 else
1687 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1702 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1688 return low; 1703 return low;
1689 } 1704 }
1690 #else 1705 #else
1691 static inline int inactive_anon_is_low(struct zone *zone, 1706 static inline int inactive_anon_is_low(struct zone *zone,
1692 struct scan_control *sc) 1707 struct scan_control *sc)
1693 { 1708 {
1694 return 0; 1709 return 0;
1695 } 1710 }
1696 #endif 1711 #endif
1697 1712
1698 static int inactive_file_is_low_global(struct zone *zone) 1713 static int inactive_file_is_low_global(struct zone *zone)
1699 { 1714 {
1700 unsigned long active, inactive; 1715 unsigned long active, inactive;
1701 1716
1702 active = zone_page_state(zone, NR_ACTIVE_FILE); 1717 active = zone_page_state(zone, NR_ACTIVE_FILE);
1703 inactive = zone_page_state(zone, NR_INACTIVE_FILE); 1718 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1704 1719
1705 return (active > inactive); 1720 return (active > inactive);
1706 } 1721 }
1707 1722
1708 /** 1723 /**
1709 * inactive_file_is_low - check if file pages need to be deactivated 1724 * inactive_file_is_low - check if file pages need to be deactivated
1710 * @zone: zone to check 1725 * @zone: zone to check
1711 * @sc: scan control of this context 1726 * @sc: scan control of this context
1712 * 1727 *
1713 * When the system is doing streaming IO, memory pressure here 1728 * When the system is doing streaming IO, memory pressure here
1714 * ensures that active file pages get deactivated, until more 1729 * ensures that active file pages get deactivated, until more
1715 * than half of the file pages are on the inactive list. 1730 * than half of the file pages are on the inactive list.
1716 * 1731 *
1717 * Once we get to that situation, protect the system's working 1732 * Once we get to that situation, protect the system's working
1718 * set from being evicted by disabling active file page aging. 1733 * set from being evicted by disabling active file page aging.
1719 * 1734 *
1720 * This uses a different ratio than the anonymous pages, because 1735 * This uses a different ratio than the anonymous pages, because
1721 * the page cache uses a use-once replacement algorithm. 1736 * the page cache uses a use-once replacement algorithm.
1722 */ 1737 */
1723 static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) 1738 static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1724 { 1739 {
1725 int low; 1740 int low;
1726 1741
1727 if (scanning_global_lru(sc)) 1742 if (scanning_global_lru(sc))
1728 low = inactive_file_is_low_global(zone); 1743 low = inactive_file_is_low_global(zone);
1729 else 1744 else
1730 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); 1745 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
1731 return low; 1746 return low;
1732 } 1747 }
1733 1748
1734 static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, 1749 static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
1735 int file) 1750 int file)
1736 { 1751 {
1737 if (file) 1752 if (file)
1738 return inactive_file_is_low(zone, sc); 1753 return inactive_file_is_low(zone, sc);
1739 else 1754 else
1740 return inactive_anon_is_low(zone, sc); 1755 return inactive_anon_is_low(zone, sc);
1741 } 1756 }
1742 1757
1743 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1758 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1744 struct zone *zone, struct scan_control *sc, int priority) 1759 struct zone *zone, struct scan_control *sc, int priority)
1745 { 1760 {
1746 int file = is_file_lru(lru); 1761 int file = is_file_lru(lru);
1747 1762
1748 if (is_active_lru(lru)) { 1763 if (is_active_lru(lru)) {
1749 if (inactive_list_is_low(zone, sc, file)) 1764 if (inactive_list_is_low(zone, sc, file))
1750 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1765 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1751 return 0; 1766 return 0;
1752 } 1767 }
1753 1768
1754 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1769 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1755 } 1770 }
1756 1771
1757 /* 1772 /*
1758 * Determine how aggressively the anon and file LRU lists should be 1773 * Determine how aggressively the anon and file LRU lists should be
1759 * scanned. The relative value of each set of LRU lists is determined 1774 * scanned. The relative value of each set of LRU lists is determined
1760 * by looking at the fraction of the pages scanned we did rotate back 1775 * by looking at the fraction of the pages scanned we did rotate back
1761 * onto the active list instead of evict. 1776 * onto the active list instead of evict.
1762 * 1777 *
1763 * nr[0] = anon pages to scan; nr[1] = file pages to scan 1778 * nr[0] = anon pages to scan; nr[1] = file pages to scan
1764 */ 1779 */
1765 static void get_scan_count(struct zone *zone, struct scan_control *sc, 1780 static void get_scan_count(struct zone *zone, struct scan_control *sc,
1766 unsigned long *nr, int priority) 1781 unsigned long *nr, int priority)
1767 { 1782 {
1768 unsigned long anon, file, free; 1783 unsigned long anon, file, free;
1769 unsigned long anon_prio, file_prio; 1784 unsigned long anon_prio, file_prio;
1770 unsigned long ap, fp; 1785 unsigned long ap, fp;
1771 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1786 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1772 u64 fraction[2], denominator; 1787 u64 fraction[2], denominator;
1773 enum lru_list l; 1788 enum lru_list l;
1774 int noswap = 0; 1789 int noswap = 0;
1775 int force_scan = 0; 1790 int force_scan = 0;
1776 1791
1777 1792
1778 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1793 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1779 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1794 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1780 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1795 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1781 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1796 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1782 1797
1783 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { 1798 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
1784 /* kswapd does zone balancing and needs to scan this zone */ 1799 /* kswapd does zone balancing and needs to scan this zone */
1785 if (scanning_global_lru(sc) && current_is_kswapd()) 1800 if (scanning_global_lru(sc) && current_is_kswapd())
1786 force_scan = 1; 1801 force_scan = 1;
1787 /* memcg may have a small limit and needs to avoid priority drop */ 1802 /* memcg may have a small limit and needs to avoid priority drop */
1788 if (!scanning_global_lru(sc)) 1803 if (!scanning_global_lru(sc))
1789 force_scan = 1; 1804 force_scan = 1;
1790 } 1805 }
1791 1806
1792 /* If we have no swap space, do not bother scanning anon pages. */ 1807 /* If we have no swap space, do not bother scanning anon pages. */
1793 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1808 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1794 noswap = 1; 1809 noswap = 1;
1795 fraction[0] = 0; 1810 fraction[0] = 0;
1796 fraction[1] = 1; 1811 fraction[1] = 1;
1797 denominator = 1; 1812 denominator = 1;
1798 goto out; 1813 goto out;
1799 } 1814 }
1800 1815
1801 if (scanning_global_lru(sc)) { 1816 if (scanning_global_lru(sc)) {
1802 free = zone_page_state(zone, NR_FREE_PAGES); 1817 free = zone_page_state(zone, NR_FREE_PAGES);
1803 /* If we have very few page cache pages, 1818 /* If we have very few page cache pages,
1804 force-scan anon pages. */ 1819 force-scan anon pages. */
1805 if (unlikely(file + free <= high_wmark_pages(zone))) { 1820 if (unlikely(file + free <= high_wmark_pages(zone))) {
1806 fraction[0] = 1; 1821 fraction[0] = 1;
1807 fraction[1] = 0; 1822 fraction[1] = 0;
1808 denominator = 1; 1823 denominator = 1;
1809 goto out; 1824 goto out;
1810 } 1825 }
1811 } 1826 }
1812 1827
1813 /* 1828 /*
1814 * With swappiness at 100, anonymous and file have the same priority. 1829 * With swappiness at 100, anonymous and file have the same priority.
1815 * This scanning priority is essentially the inverse of IO cost. 1830 * This scanning priority is essentially the inverse of IO cost.
1816 */ 1831 */
1817 anon_prio = sc->swappiness; 1832 anon_prio = sc->swappiness;
1818 file_prio = 200 - sc->swappiness; 1833 file_prio = 200 - sc->swappiness;
1819 1834
1820 /* 1835 /*
1821 * OK, so we have swap space and a fair amount of page cache 1836 * OK, so we have swap space and a fair amount of page cache
1822 * pages. We use the recently rotated / recently scanned 1837 * pages. We use the recently rotated / recently scanned
1823 * ratios to determine how valuable each cache is. 1838 * ratios to determine how valuable each cache is.
1824 * 1839 *
1825 * Because workloads change over time (and to avoid overflow) 1840 * Because workloads change over time (and to avoid overflow)
1826 * we keep these statistics as a floating average, which ends 1841 * we keep these statistics as a floating average, which ends
1827 * up weighing recent references more than old ones. 1842 * up weighing recent references more than old ones.
1828 * 1843 *
1829 * anon in [0], file in [1] 1844 * anon in [0], file in [1]
1830 */ 1845 */
1831 spin_lock_irq(&zone->lru_lock); 1846 spin_lock_irq(&zone->lru_lock);
1832 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1847 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1833 reclaim_stat->recent_scanned[0] /= 2; 1848 reclaim_stat->recent_scanned[0] /= 2;
1834 reclaim_stat->recent_rotated[0] /= 2; 1849 reclaim_stat->recent_rotated[0] /= 2;
1835 } 1850 }
1836 1851
1837 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { 1852 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1838 reclaim_stat->recent_scanned[1] /= 2; 1853 reclaim_stat->recent_scanned[1] /= 2;
1839 reclaim_stat->recent_rotated[1] /= 2; 1854 reclaim_stat->recent_rotated[1] /= 2;
1840 } 1855 }
1841 1856
1842 /* 1857 /*
1843 * The amount of pressure on anon vs file pages is inversely 1858 * The amount of pressure on anon vs file pages is inversely
1844 * proportional to the fraction of recently scanned pages on 1859 * proportional to the fraction of recently scanned pages on
1845 * each list that were recently referenced and in active use. 1860 * each list that were recently referenced and in active use.
1846 */ 1861 */
1847 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); 1862 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
1848 ap /= reclaim_stat->recent_rotated[0] + 1; 1863 ap /= reclaim_stat->recent_rotated[0] + 1;
1849 1864
1850 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1865 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1851 fp /= reclaim_stat->recent_rotated[1] + 1; 1866 fp /= reclaim_stat->recent_rotated[1] + 1;
1852 spin_unlock_irq(&zone->lru_lock); 1867 spin_unlock_irq(&zone->lru_lock);
1853 1868
1854 fraction[0] = ap; 1869 fraction[0] = ap;
1855 fraction[1] = fp; 1870 fraction[1] = fp;
1856 denominator = ap + fp + 1; 1871 denominator = ap + fp + 1;
1857 out: 1872 out:
1858 for_each_evictable_lru(l) { 1873 for_each_evictable_lru(l) {
1859 int file = is_file_lru(l); 1874 int file = is_file_lru(l);
1860 unsigned long scan; 1875 unsigned long scan;
1861 1876
1862 scan = zone_nr_lru_pages(zone, sc, l); 1877 scan = zone_nr_lru_pages(zone, sc, l);
1863 if (priority || noswap) { 1878 if (priority || noswap) {
1864 scan >>= priority; 1879 scan >>= priority;
1865 scan = div64_u64(scan * fraction[file], denominator); 1880 scan = div64_u64(scan * fraction[file], denominator);
1866 } 1881 }
1867 1882
                /*
                 * If the zone or the memcg is small, nr[l] can be 0. This
                 * results in no scanning at this priority and a priority drop.
                 * Global direct reclaim can visit the next zone and tends not
                 * to have problems. Global kswapd is doing zone balancing and
                 * needs to scan a small amount. When using memcg, a priority
                 * drop can cause big latency, so it's better to scan a small
                 * amount here too. See may_noscan above.
                 */
                if (!scan && force_scan) {
                        if (file)
                                scan = SWAP_CLUSTER_MAX;
                        else if (!noswap)
                                scan = SWAP_CLUSTER_MAX;
                }
                nr[l] = scan;
        }
}
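
The ratio arithmetic in get_scan_count() above is easier to follow with concrete numbers. The following standalone sketch is not part of vmscan.c; the swappiness value, the reclaim_stat counters and the LRU size are all made up. It only reproduces the ap/fp calculation and the resulting per-list scan targets for one priority level:

/*
 * Standalone illustration of the get_scan_count() ratio math above.
 * All numbers are hypothetical; this is not kernel code.
 */
#include <stdio.h>

int main(void)
{
        unsigned long swappiness = 60;                  /* sc->swappiness */
        unsigned long anon_prio = swappiness;           /* 60 */
        unsigned long file_prio = 200 - swappiness;     /* 140 */

        /* pretend reclaim_stat values for this zone: [0]=anon, [1]=file */
        unsigned long recent_scanned[2] = { 1000, 4000 };
        unsigned long recent_rotated[2] = {  800,  400 };

        unsigned long ap = (anon_prio + 1) * (recent_scanned[0] + 1);
        ap /= recent_rotated[0] + 1;
        unsigned long fp = (file_prio + 1) * (recent_scanned[1] + 1);
        fp /= recent_rotated[1] + 1;

        unsigned long denominator = ap + fp + 1;

        /* scan target for an LRU list of 100000 pages at priority 12 */
        unsigned long lru_pages = 100000, priority = 12;
        unsigned long scan = lru_pages >> priority;
        unsigned long anon_scan = scan * ap / denominator;
        unsigned long file_scan = scan * fp / denominator;

        printf("ap=%lu fp=%lu -> anon scan %lu, file scan %lu of %lu\n",
               ap, fp, anon_scan, file_scan, scan);
        return 0;
}

With file pages rotating far less than anon pages, almost all of the (already priority-shifted) scan budget goes to the file lists, which is the intent of the floating average described in the comments above.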

/*
 * Reclaim/compaction depends on a number of pages being freed. To avoid
 * disruption to the system, a small number of order-0 pages continue to be
 * rotated and reclaimed in the normal fashion. However, by the time we get
 * back to the allocator and call try_to_compact_zone(), we ensure that
 * there are enough free pages for it to be likely to succeed.
 */
static inline bool should_continue_reclaim(struct zone *zone,
                                        unsigned long nr_reclaimed,
                                        unsigned long nr_scanned,
                                        struct scan_control *sc)
{
        unsigned long pages_for_compaction;
        unsigned long inactive_lru_pages;

        /* If not in reclaim/compaction mode, stop */
        if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
                return false;

        /* Consider stopping depending on scan and reclaim activity */
        if (sc->gfp_mask & __GFP_REPEAT) {
                /*
                 * For __GFP_REPEAT allocations, stop reclaiming if the
                 * full LRU list has been scanned and we are still failing
                 * to reclaim pages. This full LRU scan is potentially
                 * expensive but a __GFP_REPEAT caller really wants to succeed.
                 */
                if (!nr_reclaimed && !nr_scanned)
                        return false;
        } else {
                /*
                 * For non-__GFP_REPEAT allocations which can presumably
                 * fail without consequence, stop if we failed to reclaim
                 * any pages from the last SWAP_CLUSTER_MAX number of
                 * pages that were scanned. This returns to the caller
                 * faster, at the risk that reclaim/compaction and the
                 * resulting allocation attempt fail.
                 */
                if (!nr_reclaimed)
                        return false;
        }

        /*
         * If we have not reclaimed enough pages for compaction and the
         * inactive lists are large enough, continue reclaiming
         */
        pages_for_compaction = (2UL << sc->order);
        inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
                                zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
        if (sc->nr_reclaimed < pages_for_compaction &&
                        inactive_lru_pages > pages_for_compaction)
                return true;

        /* If compaction would go ahead or the allocation would succeed, stop */
        switch (compaction_suitable(zone, sc->order)) {
        case COMPACT_PARTIAL:
        case COMPACT_CONTINUE:
                return false;
        default:
                return true;
        }
}
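
A quick illustration of the pages_for_compaction threshold used above: reclaim/compaction keeps going until at least 2 << order pages have been reclaimed, provided the inactive lists are still larger than that. The snippet below is a standalone sketch, not kernel code, and simply prints the threshold for a few allocation orders:

/* Hypothetical illustration of the pages_for_compaction threshold above. */
#include <stdio.h>

int main(void)
{
        for (int order = 0; order <= 4; order++)
                printf("order %d: keep reclaiming until %lu pages are freed\n",
                       order, 2UL << order);
        return 0;
}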

/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
static void shrink_zone(int priority, struct zone *zone,
                        struct scan_control *sc)
{
        unsigned long nr[NR_LRU_LISTS];
        unsigned long nr_to_scan;
        enum lru_list l;
        unsigned long nr_reclaimed, nr_scanned;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;

restart:
        nr_reclaimed = 0;
        nr_scanned = sc->nr_scanned;
        get_scan_count(zone, sc, nr, priority);

        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
                for_each_evictable_lru(l) {
                        if (nr[l]) {
                                nr_to_scan = min_t(unsigned long,
                                                   nr[l], SWAP_CLUSTER_MAX);
                                nr[l] -= nr_to_scan;

                                nr_reclaimed += shrink_list(l, nr_to_scan,
                                                            zone, sc, priority);
                        }
                }
                /*
                 * On large memory systems, scan >> priority can become
                 * really large. This is fine for the starting priority;
                 * we want to put equal scanning pressure on each zone.
                 * However, if the VM has a harder time of freeing pages,
                 * with multiple processes reclaiming pages, the total
                 * freeing target can get unreasonably large.
                 */
                if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                        break;
        }
        sc->nr_reclaimed += nr_reclaimed;

        /*
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
        if (inactive_anon_is_low(zone, sc))
                shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);

        /* reclaim/compaction might need reclaim to continue */
        if (should_continue_reclaim(zone, nr_reclaimed,
                                        sc->nr_scanned - nr_scanned, sc))
                goto restart;

        throttle_vm_writeout(sc->gfp_mask);
}

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
static void shrink_zones(int priority, struct zonelist *zonelist,
                         struct scan_control *sc)
{
        struct zoneref *z;
        struct zone *zone;
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;

        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask), sc->nodemask) {
                if (!populated_zone(zone))
                        continue;
                /*
                 * Take care that memory controller reclaiming has only a
                 * small influence on the global LRU.
                 */
                if (scanning_global_lru(sc)) {
                        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                continue;
                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;       /* Let kswapd poll it */
                        /*
                         * This steals pages from memory cgroups over softlimit
                         * and returns the number of reclaimed pages and
                         * scanned pages. This works for global memory pressure
                         * and balancing, not for a memcg's limit.
                         */
                        nr_soft_scanned = 0;
                        nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
                                                sc->order, sc->gfp_mask,
                                                &nr_soft_scanned);
                        sc->nr_reclaimed += nr_soft_reclaimed;
                        sc->nr_scanned += nr_soft_scanned;
                        /* need some check to avoid more shrink_zone() calls */
                }

                shrink_zone(priority, zone, sc);
        }
}

static bool zone_reclaimable(struct zone *zone)
{
        return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}

/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
                              struct scan_control *sc)
{
        struct zoneref *z;
        struct zone *zone;

        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask), sc->nodemask) {
                if (!populated_zone(zone))
                        continue;
                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                        continue;
                if (!zone->all_unreclaimable)
                        return false;
        }

        return true;
}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:     0, if no pages reclaimed
 *              else, the number of pages reclaimed
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                                          struct scan_control *sc,
                                          struct shrink_control *shrink)
{
        int priority;
        unsigned long total_scanned = 0;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        struct zoneref *z;
        struct zone *zone;
        unsigned long writeback_threshold;

        get_mems_allowed();
        delayacct_freepages_start();

        if (scanning_global_lru(sc))
                count_vm_event(ALLOCSTALL);

        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                sc->nr_scanned = 0;
                if (!priority)
                        disable_swap_token(sc->mem_cgroup);
                shrink_zones(priority, zonelist, sc);
                /*
                 * Don't shrink slabs when reclaiming memory from
                 * over limit cgroups
                 */
                if (scanning_global_lru(sc)) {
                        unsigned long lru_pages = 0;
                        for_each_zone_zonelist(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask)) {
                                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                        continue;

                                lru_pages += zone_reclaimable_pages(zone);
                        }

                        shrink_slab(shrink, sc->nr_scanned, lru_pages);
                        if (reclaim_state) {
                                sc->nr_reclaimed += reclaim_state->reclaimed_slab;
                                reclaim_state->reclaimed_slab = 0;
                        }
                }
                total_scanned += sc->nr_scanned;
                if (sc->nr_reclaimed >= sc->nr_to_reclaim)
                        goto out;

                /*
                 * Try to write back as many pages as we just scanned. This
                 * tends to cause slow streaming writers to write data to the
                 * disk smoothly, at the dirtying rate, which is nice. But
                 * that's undesirable in laptop mode, where we *want* lumpy
                 * writeout.  So in laptop mode, write out the whole world.
                 */
                writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
                if (total_scanned > writeback_threshold) {
                        wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
                        sc->may_writepage = 1;
                }

                /* Take a nap, wait for some writeback to complete */
                if (!sc->hibernation_mode && sc->nr_scanned &&
                    priority < DEF_PRIORITY - 2) {
                        struct zone *preferred_zone;

                        first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
                                                &cpuset_current_mems_allowed,
                                                &preferred_zone);
                        wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
                }
        }

out:
        delayacct_freepages_end();
        put_mems_allowed();

        if (sc->nr_reclaimed)
                return sc->nr_reclaimed;

        /*
         * While hibernation is going on, kswapd is frozen so that it can't
         * mark zones as all_unreclaimable. Thus we bypass the
         * all_unreclaimable check.
         */
        if (oom_killer_disabled)
                return 0;

        /* top priority shrink_zones still had more to do? don't OOM, then */
        if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
                return 1;

        return 0;
}

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                gfp_t gfp_mask, nodemask_t *nodemask)
{
        unsigned long nr_reclaimed;
        struct scan_control sc = {
                .gfp_mask = gfp_mask,
                .may_writepage = !laptop_mode,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .may_unmap = 1,
                .may_swap = 1,
                .swappiness = vm_swappiness,
                .order = order,
                .mem_cgroup = NULL,
                .nodemask = nodemask,
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,
        };

        trace_mm_vmscan_direct_reclaim_begin(order,
                                sc.may_writepage,
                                gfp_mask);

        nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

        trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);

        return nr_reclaimed;
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR

unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
                                          gfp_t gfp_mask, bool noswap,
                                          unsigned int swappiness,
                                          struct zone *zone,
                                          unsigned long *nr_scanned)
{
        struct scan_control sc = {
                .nr_scanned = 0,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = !noswap,
                .swappiness = swappiness,
                .order = 0,
                .mem_cgroup = mem,
        };

        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);

        trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
                                                      sc.may_writepage,
                                                      sc.gfp_mask);

        /*
         * NOTE: Although we can get the priority field, using it
         * here is not a good idea, since it limits the pages we can scan.
         * If we don't reclaim here, the shrink_zone from balance_pgdat
         * will pick up pages from other mem cgroups as well.  We hack
         * the priority and make it zero.
         */
        shrink_zone(0, zone, &sc);

        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

        *nr_scanned = sc.nr_scanned;
        return sc.nr_reclaimed;
}

unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                                           gfp_t gfp_mask,
                                           bool noswap,
                                           unsigned int swappiness)
{
        struct zonelist *zonelist;
        unsigned long nr_reclaimed;
        int nid;
        struct scan_control sc = {
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = !noswap,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .swappiness = swappiness,
                .order = 0,
                .mem_cgroup = mem_cont,
                .nodemask = NULL, /* we don't care about placement */
                .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                                (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,
        };

        /*
         * Unlike direct reclaim via alloc_pages(), memcg reclaim doesn't
         * care which node its pages come from, so the node where we start
         * the scan does not need to be the current node.
         */
        nid = mem_cgroup_select_victim_node(mem_cont);

        zonelist = NODE_DATA(nid)->node_zonelists;

        trace_mm_vmscan_memcg_reclaim_begin(0,
                                            sc.may_writepage,
                                            sc.gfp_mask);

        nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

        trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);

        return nr_reclaimed;
}
#endif

/*
 * pgdat_balanced is used when checking if a node is balanced for high-order
 * allocations. Only zones that meet watermarks and are in a zone allowed
 * by the caller's classzone_idx are added to balanced_pages. The total of
 * balanced pages must be at least 25% of the zones allowed by classzone_idx
 * for the node to be considered balanced. Forcing all zones to be balanced
 * for high orders can cause excessive reclaim when there are imbalanced zones.
 * The choice of 25% is due to
 *   o a 16M DMA zone that is balanced will not balance a zone on any
 *     reasonable sized machine
 *   o On all other machines, the top zone must be at least a reasonable
 *     percentage of the middle zones. For example, on 32-bit x86, highmem
 *     would need to be at least 256M for it to balance a whole node.
 *     Similarly, on x86-64 the Normal zone would need to be at least 1G
 *     to balance a node on its own. These seemed like reasonable ratios.
 */
static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
                           int classzone_idx)
{
        unsigned long present_pages = 0;
        int i;

        for (i = 0; i <= classzone_idx; i++)
                present_pages += pgdat->node_zones[i].present_pages;

        /* A special case here: if the zone has no pages, consider it balanced */
        return balanced_pages >= (present_pages >> 2);
}
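
A worked example of the 25% rule implemented by pgdat_balanced(). The zone sizes below are invented (a small DMA zone plus larger Normal and HighMem zones); the only point is to show how balanced_pages is compared against a quarter of the node's pages:

/*
 * Worked example of the 25% balance check above, using made-up zone sizes.
 * Not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        /* hypothetical present_pages for DMA, Normal, HighMem */
        unsigned long present[] = { 4096, 200000, 58000 };
        int classzone_idx = 2;

        unsigned long present_pages = 0;
        for (int i = 0; i <= classzone_idx; i++)
                present_pages += present[i];

        /* suppose only the Normal zone currently meets its high watermark */
        unsigned long balanced_pages = present[1];

        bool balanced = balanced_pages >= (present_pages >> 2);
        printf("balanced=%lu pages, quarter of node=%lu -> %s\n",
               balanced_pages, present_pages >> 2,
               balanced ? "node is balanced" : "keep reclaiming");
        return 0;
}

Here one large balanced zone is enough to call the node balanced for a high-order request, even though the other zones are not, which is exactly the excessive-reclaim case the comment is guarding against.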

/* is kswapd sleeping prematurely? */
static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
                                 int classzone_idx)
{
        int i;
        unsigned long balanced = 0;
        bool all_zones_ok = true;

        /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
        if (remaining)
                return true;

        /* Check the watermark levels */
        for (i = 0; i <= classzone_idx; i++) {
                struct zone *zone = pgdat->node_zones + i;

                if (!populated_zone(zone))
                        continue;

                /*
                 * balance_pgdat() skips over all_unreclaimable after
                 * DEF_PRIORITY. Effectively, it considers them balanced so
                 * they must be considered balanced here as well if kswapd
                 * is to sleep
                 */
                if (zone->all_unreclaimable) {
                        balanced += zone->present_pages;
                        continue;
                }

                if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
                                            i, 0))
                        all_zones_ok = false;
                else
                        balanced += zone->present_pages;
        }

        /*
         * For high-order requests, the balanced zones must contain at least
         * 25% of the node's pages for kswapd to sleep. For order-0, all zones
         * must be balanced
         */
        if (order)
                return !pgdat_balanced(pgdat, balanced, classzone_idx);
        else
                return !all_zones_ok;
}

/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * Returns the final order kswapd was reclaiming at
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone as
 * dead and from now on, only perform a short scan.  Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
 * lower zones regardless of the number of free pages in the lower zones. This
 * interoperates with the page allocator fallback scheme to ensure that aging
 * of pages is balanced across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                                   int *classzone_idx)
{
        int all_zones_ok;
        unsigned long balanced;
        int priority;
        int i;
        int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
        unsigned long total_scanned;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .may_unmap = 1,
                .may_swap = 1,
                /*
                 * kswapd doesn't want to be bailed out while reclaiming,
                 * because we want to put equal scanning pressure on each zone.
                 */
                .nr_to_reclaim = ULONG_MAX,
                .swappiness = vm_swappiness,
                .order = order,
                .mem_cgroup = NULL,
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,
        };
loop_again:
        total_scanned = 0;
        sc.nr_reclaimed = 0;
        sc.may_writepage = !laptop_mode;
        count_vm_event(PAGEOUTRUN);

        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                unsigned long lru_pages = 0;
                int has_under_min_watermark_zone = 0;

                /* The swap token gets in the way of swapout... */
                if (!priority)
                        disable_swap_token(NULL);

                all_zones_ok = 1;
                balanced = 0;

                /*
                 * Scan in the highmem->dma direction for the highest
                 * zone which needs scanning
                 */
                for (i = pgdat->nr_zones - 1; i >= 0; i--) {
                        struct zone *zone = pgdat->node_zones + i;

                        if (!populated_zone(zone))
                                continue;

                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;

                        /*
                         * Do some background aging of the anon list, to give
                         * pages a chance to be referenced before reclaiming.
                         */
                        if (inactive_anon_is_low(zone, &sc))
                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
                                                   &sc, priority, 0);

                        if (!zone_watermark_ok_safe(zone, order,
                                        high_wmark_pages(zone), 0, 0)) {
                                end_zone = i;
                                break;
                        }
                }
                if (i < 0)
                        goto out;

                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;

                        lru_pages += zone_reclaimable_pages(zone);
                }

                /*
                 * Now scan the zone in the dma->highmem direction, stopping
                 * at the last zone which needs scanning.
                 *
                 * We do this because the page allocator works in the opposite
                 * direction.  This prevents the page allocator from allocating
                 * pages behind kswapd's direction of progress, which would
                 * cause too much scanning of the lower zones.
                 */
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
                        int nr_slab;
                        unsigned long balance_gap;

                        if (!populated_zone(zone))
                                continue;

                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;

                        sc.nr_scanned = 0;

                        nr_soft_scanned = 0;
                        /*
                         * Call soft limit reclaim before calling shrink_zone.
                         */
                        nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
                                                        order, sc.gfp_mask,
                                                        &nr_soft_scanned);
                        sc.nr_reclaimed += nr_soft_reclaimed;
                        total_scanned += nr_soft_scanned;

                        /*
                         * We put equal pressure on every zone, unless
                         * one zone has way too many pages free
                         * already. The "too many pages" is defined
                         * as the high wmark plus a "gap" where the
                         * gap is either the low watermark or 1%
                         * of the zone, whichever is smaller.
                         */
                        balance_gap = min(low_wmark_pages(zone),
                                          (zone->present_pages +
                                           KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
                                          KSWAPD_ZONE_BALANCE_GAP_RATIO);
                        if (!zone_watermark_ok_safe(zone, order,
                                        high_wmark_pages(zone) + balance_gap,
                                        end_zone, 0)) {
                                shrink_zone(priority, zone, &sc);

                                reclaim_state->reclaimed_slab = 0;
                                nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
                                sc.nr_reclaimed += reclaim_state->reclaimed_slab;
                                total_scanned += sc.nr_scanned;

                                if (nr_slab == 0 && !zone_reclaimable(zone))
                                        zone->all_unreclaimable = 1;
                        }

                        /*
                         * If we've done a decent amount of scanning and
                         * the reclaim ratio is low, start doing writepage
                         * even in laptop mode
                         */
                        if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
                            total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                                sc.may_writepage = 1;

                        if (zone->all_unreclaimable) {
                                if (end_zone && end_zone == i)
                                        end_zone--;
                                continue;
                        }

                        if (!zone_watermark_ok_safe(zone, order,
                                        high_wmark_pages(zone), end_zone, 0)) {
                                all_zones_ok = 0;
                                /*
                                 * We are still under the min watermark. This
                                 * means that we have a GFP_ATOMIC allocation
                                 * failure risk. Hurry up!
                                 */
                                if (!zone_watermark_ok_safe(zone, order,
                                            min_wmark_pages(zone), end_zone, 0))
                                        has_under_min_watermark_zone = 1;
                        } else {
                                /*
                                 * If a zone reaches its high watermark,
                                 * consider it to be no longer congested. It's
                                 * possible there are dirty pages backed by
                                 * congested BDIs but as pressure is relieved,
                                 * speculatively avoid congestion waits
                                 */
                                zone_clear_flag(zone, ZONE_CONGESTED);
                                if (i <= *classzone_idx)
                                        balanced += zone->present_pages;
                        }

                }
                if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
                        break;          /* kswapd: all done */
                /*
                 * OK, kswapd is getting into trouble.  Take a nap, then take
                 * another pass across the zones.
                 */
                if (total_scanned && (priority < DEF_PRIORITY - 2)) {
                        if (has_under_min_watermark_zone)
                                count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
                        else
                                congestion_wait(BLK_RW_ASYNC, HZ/10);
                }

                /*
                 * We do this so kswapd doesn't build up large priorities for
                 * example when it is freeing in parallel with allocators. It
                 * matches the direct reclaim path behaviour in terms of impact
                 * on zone->*_priority.
                 */
                if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
                        break;
        }
out:

        /*
         * order-0: All zones must meet high watermark for a balanced node
         * high-order: Balanced zones must make up at least 25% of the node
         *             for the node to be balanced
         */
        if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
                cond_resched();

                try_to_freeze();

                /*
                 * Fragmentation may mean that the system cannot be
                 * rebalanced for high-order allocations in all zones.
                 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
                 * it means the zones have been fully scanned and are still
                 * not balanced. For high-order allocations, there is
                 * little point trying all over again as kswapd may
                 * infinite loop.
                 *
                 * Instead, recheck all watermarks at order-0 as they
                 * are the most important. If watermarks are ok, kswapd will go
                 * back to sleep. High-order users can still perform direct
                 * reclaim if they wish.
                 */
                if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
                        order = sc.order = 0;

                goto loop_again;
        }

        /*
         * If kswapd was reclaiming at a higher order, it has the option of
         * sleeping without all zones being balanced. Before it does, it must
         * ensure that the watermarks for order-0 on *all* zones are met and
         * that the congestion flags are cleared. The congestion flag must
         * be cleared as kswapd is the only mechanism that clears the flag
         * and it is potentially going to sleep here.
         */
        if (order) {
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;

                        if (!populated_zone(zone))
                                continue;

                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;

                        /* Confirm the zone is balanced for order-0 */
                        if (!zone_watermark_ok(zone, 0,
                                        high_wmark_pages(zone), 0, 0)) {
                                order = sc.order = 0;
                                goto loop_again;
                        }

                        /* If balanced, clear the congested flag */
                        zone_clear_flag(zone, ZONE_CONGESTED);
                }
        }

        /*
         * Return the order we were reclaiming at so sleeping_prematurely()
         * makes a decision on the order we were last reclaiming at. However,
         * if another caller entered the allocator slow path while kswapd
         * was awake, order will remain at the higher level
         */
        *classzone_idx = end_zone;
        return order;
}
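
The balance_gap used inside balance_pgdat() is the smaller of the low watermark and roughly 1% of the zone (the "1% of the zone" in the comment corresponds to KSWAPD_ZONE_BALANCE_GAP_RATIO of 100). The standalone sketch below, with invented zone size and watermarks, shows the free-page target kswapd ends up aiming for before it stops shrinking a zone:

/*
 * Illustration of the balance_gap calculation in balance_pgdat() above.
 * Zone size and watermarks are invented; not kernel code.
 */
#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO   100

static unsigned long min2(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long present_pages = 250000;   /* ~1G zone, hypothetical */
        unsigned long low_wmark = 1500;
        unsigned long high_wmark = 1800;

        unsigned long balance_gap = min2(low_wmark,
                        (present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
                        KSWAPD_ZONE_BALANCE_GAP_RATIO);

        printf("gap=%lu pages; kswapd keeps reclaiming until free > %lu pages\n",
               balance_gap, high_wmark + balance_gap);
        return 0;
}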

static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
        long remaining = 0;
        DEFINE_WAIT(wait);

        if (freezing(current) || kthread_should_stop())
                return;

        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

        /* Try to sleep for a short interval */
        if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
                remaining = schedule_timeout(HZ/10);
                finish_wait(&pgdat->kswapd_wait, &wait);
                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
        }

        /*
         * After a short sleep, check if it was a premature sleep. If not, then
         * go fully to sleep until explicitly woken up.
         */
        if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

                /*
                 * vmstat counters are not perfectly accurate and the estimated
                 * value for counters such as NR_FREE_PAGES can deviate from the
                 * true value by nr_online_cpus * threshold. To avoid the zone
                 * watermarks being breached while under pressure, we reduce the
                 * per-cpu vmstat threshold while kswapd is awake and restore
                 * them before going back to sleep.
                 */
                set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
                schedule();
                set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
        } else {
                if (remaining)
                        count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
                else
                        count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
        }
        finish_wait(&pgdat->kswapd_wait, &wait);
}
2727 2742
2728 /* 2743 /*
2729 * The background pageout daemon, started as a kernel thread 2744 * The background pageout daemon, started as a kernel thread
2730 * from the init process. 2745 * from the init process.
2731 * 2746 *
2732 * This basically trickles out pages so that we have _some_ 2747 * This basically trickles out pages so that we have _some_
2733 * free memory available even if there is no other activity 2748 * free memory available even if there is no other activity
2734 * that frees anything up. This is needed for things like routing 2749 * that frees anything up. This is needed for things like routing
2735 * etc, where we otherwise might have all activity going on in 2750 * etc, where we otherwise might have all activity going on in
2736 * asynchronous contexts that cannot page things out. 2751 * asynchronous contexts that cannot page things out.
2737 * 2752 *
2738 * If there are applications that are active memory-allocators 2753 * If there are applications that are active memory-allocators
2739 * (most normal use), this basically shouldn't matter. 2754 * (most normal use), this basically shouldn't matter.
2740 */ 2755 */
2741 static int kswapd(void *p) 2756 static int kswapd(void *p)
2742 { 2757 {
2743 unsigned long order, new_order; 2758 unsigned long order, new_order;
2744 int classzone_idx, new_classzone_idx; 2759 int classzone_idx, new_classzone_idx;
2745 pg_data_t *pgdat = (pg_data_t*)p; 2760 pg_data_t *pgdat = (pg_data_t*)p;
2746 struct task_struct *tsk = current; 2761 struct task_struct *tsk = current;
2747 2762
2748 struct reclaim_state reclaim_state = { 2763 struct reclaim_state reclaim_state = {
2749 .reclaimed_slab = 0, 2764 .reclaimed_slab = 0,
2750 }; 2765 };
2751 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 2766 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2752 2767
2753 lockdep_set_current_reclaim_state(GFP_KERNEL); 2768 lockdep_set_current_reclaim_state(GFP_KERNEL);
2754 2769
2755 if (!cpumask_empty(cpumask)) 2770 if (!cpumask_empty(cpumask))
2756 set_cpus_allowed_ptr(tsk, cpumask); 2771 set_cpus_allowed_ptr(tsk, cpumask);
2757 current->reclaim_state = &reclaim_state; 2772 current->reclaim_state = &reclaim_state;
2758 2773
2759 /* 2774 /*
2760 * Tell the memory management that we're a "memory allocator", 2775 * Tell the memory management that we're a "memory allocator",
2761 * and that if we need more memory we should get access to it 2776 * and that if we need more memory we should get access to it
2762 * regardless (see "__alloc_pages()"). "kswapd" should 2777 * regardless (see "__alloc_pages()"). "kswapd" should
2763 * never get caught in the normal page freeing logic. 2778 * never get caught in the normal page freeing logic.
2764 * 2779 *
2765 * (Kswapd normally doesn't need memory anyway, but sometimes 2780 * (Kswapd normally doesn't need memory anyway, but sometimes
2766 * you need a small amount of memory in order to be able to 2781 * you need a small amount of memory in order to be able to
2767 * page out something else, and this flag essentially protects 2782 * page out something else, and this flag essentially protects
2768 * us from recursively trying to free more memory as we're 2783 * us from recursively trying to free more memory as we're
2769 * trying to free the first piece of memory in the first place). 2784 * trying to free the first piece of memory in the first place).
2770 */ 2785 */
2771 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 2786 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2772 set_freezable(); 2787 set_freezable();
2773 2788
2774 order = new_order = 0; 2789 order = new_order = 0;
2775 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2790 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2776 for ( ; ; ) { 2791 for ( ; ; ) {
2777 int ret; 2792 int ret;
2778 2793
2779 /* 2794 /*
2780 * If the last balance_pgdat was unsuccessful it's unlikely a 2795 * If the last balance_pgdat was unsuccessful it's unlikely a
2781 * new request of a similar or harder type will succeed soon, 2796 * new request of a similar or harder type will succeed soon,
2782 * so consider going to sleep on the basis of the order we last reclaimed at. 2797 * so consider going to sleep on the basis of the order we last reclaimed at.
2783 */ 2798 */
2784 if (classzone_idx >= new_classzone_idx && order == new_order) { 2799 if (classzone_idx >= new_classzone_idx && order == new_order) {
2785 new_order = pgdat->kswapd_max_order; 2800 new_order = pgdat->kswapd_max_order;
2786 new_classzone_idx = pgdat->classzone_idx; 2801 new_classzone_idx = pgdat->classzone_idx;
2787 pgdat->kswapd_max_order = 0; 2802 pgdat->kswapd_max_order = 0;
2788 pgdat->classzone_idx = pgdat->nr_zones - 1; 2803 pgdat->classzone_idx = pgdat->nr_zones - 1;
2789 } 2804 }
2790 2805
2791 if (order < new_order || classzone_idx > new_classzone_idx) { 2806 if (order < new_order || classzone_idx > new_classzone_idx) {
2792 /* 2807 /*
2793 * Don't sleep if someone wants a larger 'order' 2808 * Don't sleep if someone wants a larger 'order'
2794 * allocation or has tighter zone constraints 2809 * allocation or has tighter zone constraints
2795 */ 2810 */
2796 order = new_order; 2811 order = new_order;
2797 classzone_idx = new_classzone_idx; 2812 classzone_idx = new_classzone_idx;
2798 } else { 2813 } else {
2799 kswapd_try_to_sleep(pgdat, order, classzone_idx); 2814 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2800 order = pgdat->kswapd_max_order; 2815 order = pgdat->kswapd_max_order;
2801 classzone_idx = pgdat->classzone_idx; 2816 classzone_idx = pgdat->classzone_idx;
2802 pgdat->kswapd_max_order = 0; 2817 pgdat->kswapd_max_order = 0;
2803 pgdat->classzone_idx = pgdat->nr_zones - 1; 2818 pgdat->classzone_idx = pgdat->nr_zones - 1;
2804 } 2819 }
2805 2820
2806 ret = try_to_freeze(); 2821 ret = try_to_freeze();
2807 if (kthread_should_stop()) 2822 if (kthread_should_stop())
2808 break; 2823 break;
2809 2824
2810 /* 2825 /*
2811 * We can speed up thawing tasks if we don't call balance_pgdat 2826 * We can speed up thawing tasks if we don't call balance_pgdat
2812 * after returning from the refrigerator 2827 * after returning from the refrigerator
2813 */ 2828 */
2814 if (!ret) { 2829 if (!ret) {
2815 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2830 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2816 order = balance_pgdat(pgdat, order, &classzone_idx); 2831 order = balance_pgdat(pgdat, order, &classzone_idx);
2817 } 2832 }
2818 } 2833 }
2819 return 0; 2834 return 0;
2820 } 2835 }
2821 2836
2822 /* 2837 /*
2823 * A zone is low on free memory, so wake its kswapd task to service it. 2838 * A zone is low on free memory, so wake its kswapd task to service it.
2824 */ 2839 */
2825 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) 2840 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2826 { 2841 {
2827 pg_data_t *pgdat; 2842 pg_data_t *pgdat;
2828 2843
2829 if (!populated_zone(zone)) 2844 if (!populated_zone(zone))
2830 return; 2845 return;
2831 2846
2832 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2847 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2833 return; 2848 return;
2834 pgdat = zone->zone_pgdat; 2849 pgdat = zone->zone_pgdat;
2835 if (pgdat->kswapd_max_order < order) { 2850 if (pgdat->kswapd_max_order < order) {
2836 pgdat->kswapd_max_order = order; 2851 pgdat->kswapd_max_order = order;
2837 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); 2852 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2838 } 2853 }
2839 if (!waitqueue_active(&pgdat->kswapd_wait)) 2854 if (!waitqueue_active(&pgdat->kswapd_wait))
2840 return; 2855 return;
2841 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) 2856 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2842 return; 2857 return;
2843 2858
2844 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); 2859 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2845 wake_up_interruptible(&pgdat->kswapd_wait); 2860 wake_up_interruptible(&pgdat->kswapd_wait);
2846 } 2861 }
2847 2862
2848 /* 2863 /*
2849 * The reclaimable count should be mostly accurate. 2864 * The reclaimable count should be mostly accurate.
2850 * The less reclaimable pages may be: 2865 * The less reclaimable pages may be:
2851 * - mlocked pages, which will be moved to the unevictable list when encountered 2866 * - mlocked pages, which will be moved to the unevictable list when encountered
2852 * - mapped pages, which may require several passes to be reclaimed 2867 * - mapped pages, which may require several passes to be reclaimed
2853 * - dirty pages, which are not "instantly" reclaimable 2868 * - dirty pages, which are not "instantly" reclaimable
2854 */ 2869 */
2855 unsigned long global_reclaimable_pages(void) 2870 unsigned long global_reclaimable_pages(void)
2856 { 2871 {
2857 int nr; 2872 int nr;
2858 2873
2859 nr = global_page_state(NR_ACTIVE_FILE) + 2874 nr = global_page_state(NR_ACTIVE_FILE) +
2860 global_page_state(NR_INACTIVE_FILE); 2875 global_page_state(NR_INACTIVE_FILE);
2861 2876
2862 if (nr_swap_pages > 0) 2877 if (nr_swap_pages > 0)
2863 nr += global_page_state(NR_ACTIVE_ANON) + 2878 nr += global_page_state(NR_ACTIVE_ANON) +
2864 global_page_state(NR_INACTIVE_ANON); 2879 global_page_state(NR_INACTIVE_ANON);
2865 2880
2866 return nr; 2881 return nr;
2867 } 2882 }
2868 2883
2869 unsigned long zone_reclaimable_pages(struct zone *zone) 2884 unsigned long zone_reclaimable_pages(struct zone *zone)
2870 { 2885 {
2871 int nr; 2886 int nr;
2872 2887
2873 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 2888 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
2874 zone_page_state(zone, NR_INACTIVE_FILE); 2889 zone_page_state(zone, NR_INACTIVE_FILE);
2875 2890
2876 if (nr_swap_pages > 0) 2891 if (nr_swap_pages > 0)
2877 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 2892 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
2878 zone_page_state(zone, NR_INACTIVE_ANON); 2893 zone_page_state(zone, NR_INACTIVE_ANON);
2879 2894
2880 return nr; 2895 return nr;
2881 } 2896 }
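Both of the helpers above apply the same rule: file-backed LRU pages always count as reclaimable, while anonymous LRU pages only count when there is swap space to put them in. The following stand-alone sketch models just that rule with invented counter values (the struct and numbers are illustrative, not kernel state):

#include <stdio.h>

/* Toy model of global/zone_reclaimable_pages(): anon pages are only
 * reclaimable if there is somewhere (swap) to write them to. */
struct lru_counters {
	unsigned long active_file, inactive_file;
	unsigned long active_anon, inactive_anon;
	long nr_swap_pages;
};

static unsigned long reclaimable_pages(const struct lru_counters *c)
{
	unsigned long nr = c->active_file + c->inactive_file;

	if (c->nr_swap_pages > 0)
		nr += c->active_anon + c->inactive_anon;
	return nr;
}

int main(void)
{
	struct lru_counters no_swap   = { 1000, 3000, 500, 200, 0 };
	struct lru_counters with_swap = { 1000, 3000, 500, 200, 1 << 18 };

	printf("no swap:   %lu reclaimable\n", reclaimable_pages(&no_swap));   /* 4000 */
	printf("with swap: %lu reclaimable\n", reclaimable_pages(&with_swap)); /* 4700 */
	return 0;
}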
2882 2897
2883 #ifdef CONFIG_HIBERNATION 2898 #ifdef CONFIG_HIBERNATION
2884 /* 2899 /*
2885 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 2900 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
2886 * freed pages. 2901 * freed pages.
2887 * 2902 *
2888 * Rather than trying to age LRUs the aim is to preserve the overall 2903 * Rather than trying to age LRUs the aim is to preserve the overall
2889 * LRU order by reclaiming preferentially 2904 * LRU order by reclaiming preferentially
2890 * inactive > active > active referenced > active mapped 2905 * inactive > active > active referenced > active mapped
2891 */ 2906 */
2892 unsigned long shrink_all_memory(unsigned long nr_to_reclaim) 2907 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2893 { 2908 {
2894 struct reclaim_state reclaim_state; 2909 struct reclaim_state reclaim_state;
2895 struct scan_control sc = { 2910 struct scan_control sc = {
2896 .gfp_mask = GFP_HIGHUSER_MOVABLE, 2911 .gfp_mask = GFP_HIGHUSER_MOVABLE,
2897 .may_swap = 1, 2912 .may_swap = 1,
2898 .may_unmap = 1, 2913 .may_unmap = 1,
2899 .may_writepage = 1, 2914 .may_writepage = 1,
2900 .nr_to_reclaim = nr_to_reclaim, 2915 .nr_to_reclaim = nr_to_reclaim,
2901 .hibernation_mode = 1, 2916 .hibernation_mode = 1,
2902 .swappiness = vm_swappiness, 2917 .swappiness = vm_swappiness,
2903 .order = 0, 2918 .order = 0,
2904 }; 2919 };
2905 struct shrink_control shrink = { 2920 struct shrink_control shrink = {
2906 .gfp_mask = sc.gfp_mask, 2921 .gfp_mask = sc.gfp_mask,
2907 }; 2922 };
2908 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 2923 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2909 struct task_struct *p = current; 2924 struct task_struct *p = current;
2910 unsigned long nr_reclaimed; 2925 unsigned long nr_reclaimed;
2911 2926
2912 p->flags |= PF_MEMALLOC; 2927 p->flags |= PF_MEMALLOC;
2913 lockdep_set_current_reclaim_state(sc.gfp_mask); 2928 lockdep_set_current_reclaim_state(sc.gfp_mask);
2914 reclaim_state.reclaimed_slab = 0; 2929 reclaim_state.reclaimed_slab = 0;
2915 p->reclaim_state = &reclaim_state; 2930 p->reclaim_state = &reclaim_state;
2916 2931
2917 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2932 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2918 2933
2919 p->reclaim_state = NULL; 2934 p->reclaim_state = NULL;
2920 lockdep_clear_current_reclaim_state(); 2935 lockdep_clear_current_reclaim_state();
2921 p->flags &= ~PF_MEMALLOC; 2936 p->flags &= ~PF_MEMALLOC;
2922 2937
2923 return nr_reclaimed; 2938 return nr_reclaimed;
2924 } 2939 }
2925 #endif /* CONFIG_HIBERNATION */ 2940 #endif /* CONFIG_HIBERNATION */
2926 2941
2927 /* It's optimal to keep kswapds on the same CPUs as their memory, but 2942 /* It's optimal to keep kswapds on the same CPUs as their memory, but
2928 not required for correctness. So if the last cpu in a node goes 2943 not required for correctness. So if the last cpu in a node goes
2929 away, kswapd is allowed to run anywhere; as the first one comes back, 2944 away, kswapd is allowed to run anywhere; as the first one comes back,
2930 its cpu binding is restored. */ 2945 its cpu binding is restored. */
2931 static int __devinit cpu_callback(struct notifier_block *nfb, 2946 static int __devinit cpu_callback(struct notifier_block *nfb,
2932 unsigned long action, void *hcpu) 2947 unsigned long action, void *hcpu)
2933 { 2948 {
2934 int nid; 2949 int nid;
2935 2950
2936 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 2951 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
2937 for_each_node_state(nid, N_HIGH_MEMORY) { 2952 for_each_node_state(nid, N_HIGH_MEMORY) {
2938 pg_data_t *pgdat = NODE_DATA(nid); 2953 pg_data_t *pgdat = NODE_DATA(nid);
2939 const struct cpumask *mask; 2954 const struct cpumask *mask;
2940 2955
2941 mask = cpumask_of_node(pgdat->node_id); 2956 mask = cpumask_of_node(pgdat->node_id);
2942 2957
2943 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) 2958 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2944 /* One of our CPUs online: restore mask */ 2959 /* One of our CPUs online: restore mask */
2945 set_cpus_allowed_ptr(pgdat->kswapd, mask); 2960 set_cpus_allowed_ptr(pgdat->kswapd, mask);
2946 } 2961 }
2947 } 2962 }
2948 return NOTIFY_OK; 2963 return NOTIFY_OK;
2949 } 2964 }
2950 2965
2951 /* 2966 /*
2952 * This kswapd start function will be called by init and node-hot-add. 2967 * This kswapd start function will be called by init and node-hot-add.
2953 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added. 2968 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
2954 */ 2969 */
2955 int kswapd_run(int nid) 2970 int kswapd_run(int nid)
2956 { 2971 {
2957 pg_data_t *pgdat = NODE_DATA(nid); 2972 pg_data_t *pgdat = NODE_DATA(nid);
2958 int ret = 0; 2973 int ret = 0;
2959 2974
2960 if (pgdat->kswapd) 2975 if (pgdat->kswapd)
2961 return 0; 2976 return 0;
2962 2977
2963 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); 2978 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
2964 if (IS_ERR(pgdat->kswapd)) { 2979 if (IS_ERR(pgdat->kswapd)) {
2965 /* failure at boot is fatal */ 2980 /* failure at boot is fatal */
2966 BUG_ON(system_state == SYSTEM_BOOTING); 2981 BUG_ON(system_state == SYSTEM_BOOTING);
2967 printk("Failed to start kswapd on node %d\n",nid); 2982 printk("Failed to start kswapd on node %d\n",nid);
2968 ret = -1; 2983 ret = -1;
2969 } 2984 }
2970 return ret; 2985 return ret;
2971 } 2986 }
2972 2987
2973 /* 2988 /*
2974 * Called by memory hotplug when all memory in a node is offlined. 2989 * Called by memory hotplug when all memory in a node is offlined.
2975 */ 2990 */
2976 void kswapd_stop(int nid) 2991 void kswapd_stop(int nid)
2977 { 2992 {
2978 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 2993 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2979 2994
2980 if (kswapd) 2995 if (kswapd)
2981 kthread_stop(kswapd); 2996 kthread_stop(kswapd);
2982 } 2997 }
2983 2998
2984 static int __init kswapd_init(void) 2999 static int __init kswapd_init(void)
2985 { 3000 {
2986 int nid; 3001 int nid;
2987 3002
2988 swap_setup(); 3003 swap_setup();
2989 for_each_node_state(nid, N_HIGH_MEMORY) 3004 for_each_node_state(nid, N_HIGH_MEMORY)
2990 kswapd_run(nid); 3005 kswapd_run(nid);
2991 hotcpu_notifier(cpu_callback, 0); 3006 hotcpu_notifier(cpu_callback, 0);
2992 return 0; 3007 return 0;
2993 } 3008 }
2994 3009
2995 module_init(kswapd_init) 3010 module_init(kswapd_init)
2996 3011
2997 #ifdef CONFIG_NUMA 3012 #ifdef CONFIG_NUMA
2998 /* 3013 /*
2999 * Zone reclaim mode 3014 * Zone reclaim mode
3000 * 3015 *
3001 * If non-zero call zone_reclaim when the number of free pages falls below 3016 * If non-zero call zone_reclaim when the number of free pages falls below
3002 * the watermarks. 3017 * the watermarks.
3003 */ 3018 */
3004 int zone_reclaim_mode __read_mostly; 3019 int zone_reclaim_mode __read_mostly;
3005 3020
3006 #define RECLAIM_OFF 0 3021 #define RECLAIM_OFF 0
3007 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ 3022 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
3008 #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 3023 #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
3009 #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 3024 #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
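The zone_reclaim_mode sysctl is interpreted as a bitmask of the flags above, so for example a value of 5 enables zone reclaim together with swapping (RECLAIM_ZONE | RECLAIM_SWAP) but no writeout. A minimal sketch of that decoding, using local copies of the defines rather than anything exported by the kernel:

#include <stdio.h>

#define RECLAIM_ZONE	(1 << 0)	/* run reclaim on the zone */
#define RECLAIM_WRITE	(1 << 1)	/* allow writing dirty pages */
#define RECLAIM_SWAP	(1 << 2)	/* allow swapping pages out */

static void show_mode(int zone_reclaim_mode)
{
	printf("mode %d: zone=%d write=%d swap=%d\n", zone_reclaim_mode,
	       !!(zone_reclaim_mode & RECLAIM_ZONE),
	       !!(zone_reclaim_mode & RECLAIM_WRITE),
	       !!(zone_reclaim_mode & RECLAIM_SWAP));
}

int main(void)
{
	show_mode(0);	/* zone reclaim disabled */
	show_mode(1);	/* reclaim unmapped clean page cache only */
	show_mode(5);	/* reclaim and swap, but no writeout */
	return 0;
}

The bit tests in zone_pagecache_reclaimable() and __zone_reclaim() below check exactly these flags to decide whether dirty pages may be written and whether mapped pages may be unmapped or swapped.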
3010 3025
3011 /* 3026 /*
3012 * Priority for ZONE_RECLAIM. This determines the fraction of pages 3027 * Priority for ZONE_RECLAIM. This determines the fraction of pages
3013 * of a node considered for each zone_reclaim. A priority of 4 scans 1/16th of 3028 * of a node considered for each zone_reclaim. A priority of 4 scans 1/16th of
3014 * a zone. 3029 * a zone.
3015 */ 3030 */
3016 #define ZONE_RECLAIM_PRIORITY 4 3031 #define ZONE_RECLAIM_PRIORITY 4
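The 1/16th figure follows from the usual priority arithmetic in this file, where the per-pass scan target is roughly the LRU size shifted right by the priority. A quick worked example with a made-up zone size:

#include <stdio.h>

int main(void)
{
	unsigned long lru_pages = 1UL << 20;	/* hypothetical 1M-page zone LRU */
	int priority = 4;			/* ZONE_RECLAIM_PRIORITY */

	/* scan target ~= lru_pages >> priority, i.e. 1/16th at priority 4 */
	unsigned long scan = lru_pages >> priority;

	printf("scan %lu of %lu pages (1/%lu of the zone)\n",
	       scan, lru_pages, lru_pages / scan);
	return 0;
}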
3017 3032
3018 /* 3033 /*
3019 * Percentage of pages in a zone that must be unmapped for zone_reclaim to 3034 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
3020 * occur. 3035 * occur.
3021 */ 3036 */
3022 int sysctl_min_unmapped_ratio = 1; 3037 int sysctl_min_unmapped_ratio = 1;
3023 3038
3024 /* 3039 /*
3025 * If the number of slab pages in a zone grows beyond this percentage then 3040 * If the number of slab pages in a zone grows beyond this percentage then
3026 * slab reclaim needs to occur. 3041 * slab reclaim needs to occur.
3027 */ 3042 */
3028 int sysctl_min_slab_ratio = 5; 3043 int sysctl_min_slab_ratio = 5;
3029 3044
3030 static inline unsigned long zone_unmapped_file_pages(struct zone *zone) 3045 static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3031 { 3046 {
3032 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); 3047 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
3033 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + 3048 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
3034 zone_page_state(zone, NR_ACTIVE_FILE); 3049 zone_page_state(zone, NR_ACTIVE_FILE);
3035 3050
3036 /* 3051 /*
3037 * It's possible for there to be more file mapped pages than 3052 * It's possible for there to be more file mapped pages than
3038 * accounted for by the pages on the file LRU lists because 3053 * accounted for by the pages on the file LRU lists because
3039 * tmpfs pages accounted for as ANON can also be FILE_MAPPED 3054 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
3040 */ 3055 */
3041 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; 3056 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
3042 } 3057 }
3043 3058
3044 /* Work out how many page cache pages we can reclaim in this reclaim_mode */ 3059 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
3045 static long zone_pagecache_reclaimable(struct zone *zone) 3060 static long zone_pagecache_reclaimable(struct zone *zone)
3046 { 3061 {
3047 long nr_pagecache_reclaimable; 3062 long nr_pagecache_reclaimable;
3048 long delta = 0; 3063 long delta = 0;
3049 3064
3050 /* 3065 /*
3051 * If RECLAIM_SWAP is set, then all file pages are considered 3066 * If RECLAIM_SWAP is set, then all file pages are considered
3052 * potentially reclaimable. Otherwise, we have to worry about 3067 * potentially reclaimable. Otherwise, we have to worry about
3053 * pages like swapcache and zone_unmapped_file_pages() provides 3068 * pages like swapcache and zone_unmapped_file_pages() provides
3054 * a better estimate 3069 * a better estimate
3055 */ 3070 */
3056 if (zone_reclaim_mode & RECLAIM_SWAP) 3071 if (zone_reclaim_mode & RECLAIM_SWAP)
3057 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); 3072 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3058 else 3073 else
3059 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); 3074 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3060 3075
3061 /* If we can't clean pages, remove dirty pages from consideration */ 3076 /* If we can't clean pages, remove dirty pages from consideration */
3062 if (!(zone_reclaim_mode & RECLAIM_WRITE)) 3077 if (!(zone_reclaim_mode & RECLAIM_WRITE))
3063 delta += zone_page_state(zone, NR_FILE_DIRTY); 3078 delta += zone_page_state(zone, NR_FILE_DIRTY);
3064 3079
3065 /* Watch for any possible underflows due to delta */ 3080 /* Watch for any possible underflows due to delta */
3066 if (unlikely(delta > nr_pagecache_reclaimable)) 3081 if (unlikely(delta > nr_pagecache_reclaimable))
3067 delta = nr_pagecache_reclaimable; 3082 delta = nr_pagecache_reclaimable;
3068 3083
3069 return nr_pagecache_reclaimable - delta; 3084 return nr_pagecache_reclaimable - delta;
3070 } 3085 }
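As a worked example of the delta handling above, with invented counter values: when RECLAIM_WRITE is clear, dirty file pages are deducted from the unmapped-file estimate, and the deduction is clamped so the result never underflows.

#include <stdio.h>

int main(void)
{
	long reclaimable = 800;		/* hypothetical zone_unmapped_file_pages() */
	long file_dirty  = 1000;	/* hypothetical NR_FILE_DIRTY, larger than the above */
	long delta = 0;

	/* RECLAIM_WRITE not set: dirty pages cannot be cleaned here */
	delta += file_dirty;

	/* underflow guard, as in zone_pagecache_reclaimable() */
	if (delta > reclaimable)
		delta = reclaimable;

	printf("pagecache reclaimable: %ld\n", reclaimable - delta);	/* prints 0 */
	return 0;
}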
3071 3086
3072 /* 3087 /*
3073 * Try to free up some pages from this zone through reclaim. 3088 * Try to free up some pages from this zone through reclaim.
3074 */ 3089 */
3075 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3090 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3076 { 3091 {
3077 /* Minimum pages needed in order to stay on node */ 3092 /* Minimum pages needed in order to stay on node */
3078 const unsigned long nr_pages = 1 << order; 3093 const unsigned long nr_pages = 1 << order;
3079 struct task_struct *p = current; 3094 struct task_struct *p = current;
3080 struct reclaim_state reclaim_state; 3095 struct reclaim_state reclaim_state;
3081 int priority; 3096 int priority;
3082 struct scan_control sc = { 3097 struct scan_control sc = {
3083 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3098 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3084 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3099 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3085 .may_swap = 1, 3100 .may_swap = 1,
3086 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3101 .nr_to_reclaim = max_t(unsigned long, nr_pages,
3087 SWAP_CLUSTER_MAX), 3102 SWAP_CLUSTER_MAX),
3088 .gfp_mask = gfp_mask, 3103 .gfp_mask = gfp_mask,
3089 .swappiness = vm_swappiness, 3104 .swappiness = vm_swappiness,
3090 .order = order, 3105 .order = order,
3091 }; 3106 };
3092 struct shrink_control shrink = { 3107 struct shrink_control shrink = {
3093 .gfp_mask = sc.gfp_mask, 3108 .gfp_mask = sc.gfp_mask,
3094 }; 3109 };
3095 unsigned long nr_slab_pages0, nr_slab_pages1; 3110 unsigned long nr_slab_pages0, nr_slab_pages1;
3096 3111
3097 cond_resched(); 3112 cond_resched();
3098 /* 3113 /*
3099 * We need to be able to allocate from the reserves for RECLAIM_SWAP 3114 * We need to be able to allocate from the reserves for RECLAIM_SWAP
3100 * and we also need to be able to write out pages for RECLAIM_WRITE 3115 * and we also need to be able to write out pages for RECLAIM_WRITE
3101 * and RECLAIM_SWAP. 3116 * and RECLAIM_SWAP.
3102 */ 3117 */
3103 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 3118 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
3104 lockdep_set_current_reclaim_state(gfp_mask); 3119 lockdep_set_current_reclaim_state(gfp_mask);
3105 reclaim_state.reclaimed_slab = 0; 3120 reclaim_state.reclaimed_slab = 0;
3106 p->reclaim_state = &reclaim_state; 3121 p->reclaim_state = &reclaim_state;
3107 3122
3108 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { 3123 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
3109 /* 3124 /*
3110 * Free memory by calling shrink zone with increasing 3125 * Free memory by calling shrink zone with increasing
3111 * priorities until we have enough memory freed. 3126 * priorities until we have enough memory freed.
3112 */ 3127 */
3113 priority = ZONE_RECLAIM_PRIORITY; 3128 priority = ZONE_RECLAIM_PRIORITY;
3114 do { 3129 do {
3115 shrink_zone(priority, zone, &sc); 3130 shrink_zone(priority, zone, &sc);
3116 priority--; 3131 priority--;
3117 } while (priority >= 0 && sc.nr_reclaimed < nr_pages); 3132 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
3118 } 3133 }
3119 3134
3120 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3135 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3121 if (nr_slab_pages0 > zone->min_slab_pages) { 3136 if (nr_slab_pages0 > zone->min_slab_pages) {
3122 /* 3137 /*
3123 * shrink_slab() does not currently allow us to determine how 3138 * shrink_slab() does not currently allow us to determine how
3124 * many pages were freed in this zone. So we take the current 3139 * many pages were freed in this zone. So we take the current
3125 * number of slab pages and shake the slab until it is reduced 3140 * number of slab pages and shake the slab until it is reduced
3126 * by the same nr_pages that we used for reclaiming unmapped 3141 * by the same nr_pages that we used for reclaiming unmapped
3127 * pages. 3142 * pages.
3128 * 3143 *
3129 * Note that shrink_slab will free memory on all zones and may 3144 * Note that shrink_slab will free memory on all zones and may
3130 * take a long time. 3145 * take a long time.
3131 */ 3146 */
3132 for (;;) { 3147 for (;;) {
3133 unsigned long lru_pages = zone_reclaimable_pages(zone); 3148 unsigned long lru_pages = zone_reclaimable_pages(zone);
3134 3149
3135 /* No reclaimable slab or very low memory pressure */ 3150 /* No reclaimable slab or very low memory pressure */
3136 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) 3151 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3137 break; 3152 break;
3138 3153
3139 /* Freed enough memory */ 3154 /* Freed enough memory */
3140 nr_slab_pages1 = zone_page_state(zone, 3155 nr_slab_pages1 = zone_page_state(zone,
3141 NR_SLAB_RECLAIMABLE); 3156 NR_SLAB_RECLAIMABLE);
3142 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) 3157 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3143 break; 3158 break;
3144 } 3159 }
3145 3160
3146 /* 3161 /*
3147 * Update nr_reclaimed by the number of slab pages we 3162 * Update nr_reclaimed by the number of slab pages we
3148 * reclaimed from this zone. 3163 * reclaimed from this zone.
3149 */ 3164 */
3150 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3165 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3151 if (nr_slab_pages1 < nr_slab_pages0) 3166 if (nr_slab_pages1 < nr_slab_pages0)
3152 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; 3167 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3153 } 3168 }
3154 3169
3155 p->reclaim_state = NULL; 3170 p->reclaim_state = NULL;
3156 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 3171 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3157 lockdep_clear_current_reclaim_state(); 3172 lockdep_clear_current_reclaim_state();
3158 return sc.nr_reclaimed >= nr_pages; 3173 return sc.nr_reclaimed >= nr_pages;
3159 } 3174 }
3160 3175
3161 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3176 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3162 { 3177 {
3163 int node_id; 3178 int node_id;
3164 int ret; 3179 int ret;
3165 3180
3166 /* 3181 /*
3167 * Zone reclaim reclaims unmapped file backed pages and 3182 * Zone reclaim reclaims unmapped file backed pages and
3168 * slab pages if we are over the defined limits. 3183 * slab pages if we are over the defined limits.
3169 * 3184 *
3170 * A small portion of unmapped file backed pages is needed for 3185 * A small portion of unmapped file backed pages is needed for
3171 * file I/O, otherwise pages read by file I/O will be immediately 3186 * file I/O, otherwise pages read by file I/O will be immediately
3172 * thrown out if the zone is overallocated. So we do not reclaim 3187 * thrown out if the zone is overallocated. So we do not reclaim
3173 * if less than a specified percentage of the zone is used by 3188 * if less than a specified percentage of the zone is used by
3174 * unmapped file backed pages. 3189 * unmapped file backed pages.
3175 */ 3190 */
3176 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && 3191 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3177 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 3192 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3178 return ZONE_RECLAIM_FULL; 3193 return ZONE_RECLAIM_FULL;
3179 3194
3180 if (zone->all_unreclaimable) 3195 if (zone->all_unreclaimable)
3181 return ZONE_RECLAIM_FULL; 3196 return ZONE_RECLAIM_FULL;
3182 3197
3183 /* 3198 /*
3184 * Do not scan if the allocation should not be delayed. 3199 * Do not scan if the allocation should not be delayed.
3185 */ 3200 */
3186 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) 3201 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3187 return ZONE_RECLAIM_NOSCAN; 3202 return ZONE_RECLAIM_NOSCAN;
3188 3203
3189 /* 3204 /*
3190 * Only run zone reclaim on the local zone or on zones that do not 3205 * Only run zone reclaim on the local zone or on zones that do not
3191 * have associated processors. This will favor the local processor 3206 * have associated processors. This will favor the local processor
3192 * over remote processors and spread off node memory allocations 3207 * over remote processors and spread off node memory allocations
3193 * as wide as possible. 3208 * as wide as possible.
3194 */ 3209 */
3195 node_id = zone_to_nid(zone); 3210 node_id = zone_to_nid(zone);
3196 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 3211 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3197 return ZONE_RECLAIM_NOSCAN; 3212 return ZONE_RECLAIM_NOSCAN;
3198 3213
3199 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 3214 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3200 return ZONE_RECLAIM_NOSCAN; 3215 return ZONE_RECLAIM_NOSCAN;
3201 3216
3202 ret = __zone_reclaim(zone, gfp_mask, order); 3217 ret = __zone_reclaim(zone, gfp_mask, order);
3203 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 3218 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3204 3219
3205 if (!ret) 3220 if (!ret)
3206 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 3221 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3207 3222
3208 return ret; 3223 return ret;
3209 } 3224 }
3210 #endif 3225 #endif
3211 3226
3212 /* 3227 /*
3213 * page_evictable - test whether a page is evictable 3228 * page_evictable - test whether a page is evictable
3214 * @page: the page to test 3229 * @page: the page to test
3215 * @vma: the VMA in which the page is or will be mapped, may be NULL 3230 * @vma: the VMA in which the page is or will be mapped, may be NULL
3216 * 3231 *
3217 * Test whether page is evictable--i.e., should be placed on active/inactive 3232 * Test whether page is evictable--i.e., should be placed on active/inactive
3218 * lists vs unevictable list. The vma argument is !NULL when called from the 3233 * lists vs unevictable list. The vma argument is !NULL when called from the
3219 * fault path to determine how to instantiate a new page. 3234 * fault path to determine how to instantiate a new page.
3220 * 3235 *
3221 * Reasons page might not be evictable: 3236 * Reasons page might not be evictable:
3222 * (1) page's mapping marked unevictable 3237 * (1) page's mapping marked unevictable
3223 * (2) page is part of an mlocked VMA 3238 * (2) page is part of an mlocked VMA
3224 * 3239 *
3225 */ 3240 */
3226 int page_evictable(struct page *page, struct vm_area_struct *vma) 3241 int page_evictable(struct page *page, struct vm_area_struct *vma)
3227 { 3242 {
3228 3243
3229 if (mapping_unevictable(page_mapping(page))) 3244 if (mapping_unevictable(page_mapping(page)))
3230 return 0; 3245 return 0;
3231 3246
3232 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) 3247 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
3233 return 0; 3248 return 0;
3234 3249
3235 return 1; 3250 return 1;
3236 } 3251 }
3237 3252
3238 /** 3253 /**
3239 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list 3254 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
3240 * @page: page to check evictability and move to appropriate lru list 3255 * @page: page to check evictability and move to appropriate lru list
3241 * @zone: zone page is in 3256 * @zone: zone page is in
3242 * 3257 *
3243 * Checks a page for evictability and moves the page to the appropriate 3258 * Checks a page for evictability and moves the page to the appropriate
3244 * zone lru list. 3259 * zone lru list.
3245 * 3260 *
3246 * Restrictions: zone->lru_lock must be held, page must be on LRU and must 3261 * Restrictions: zone->lru_lock must be held, page must be on LRU and must
3247 * have PageUnevictable set. 3262 * have PageUnevictable set.
3248 */ 3263 */
3249 static void check_move_unevictable_page(struct page *page, struct zone *zone) 3264 static void check_move_unevictable_page(struct page *page, struct zone *zone)
3250 { 3265 {
3251 VM_BUG_ON(PageActive(page)); 3266 VM_BUG_ON(PageActive(page));
3252 3267
3253 retry: 3268 retry:
3254 ClearPageUnevictable(page); 3269 ClearPageUnevictable(page);
3255 if (page_evictable(page, NULL)) { 3270 if (page_evictable(page, NULL)) {
3256 enum lru_list l = page_lru_base_type(page); 3271 enum lru_list l = page_lru_base_type(page);
3257 3272
3258 __dec_zone_state(zone, NR_UNEVICTABLE); 3273 __dec_zone_state(zone, NR_UNEVICTABLE);
3259 list_move(&page->lru, &zone->lru[l].list); 3274 list_move(&page->lru, &zone->lru[l].list);
3260 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); 3275 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
3261 __inc_zone_state(zone, NR_INACTIVE_ANON + l); 3276 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
3262 __count_vm_event(UNEVICTABLE_PGRESCUED); 3277 __count_vm_event(UNEVICTABLE_PGRESCUED);
3263 } else { 3278 } else {
3264 /* 3279 /*
3265 * rotate unevictable list 3280 * rotate unevictable list
3266 */ 3281 */
3267 SetPageUnevictable(page); 3282 SetPageUnevictable(page);
3268 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); 3283 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
3269 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); 3284 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
3270 if (page_evictable(page, NULL)) 3285 if (page_evictable(page, NULL))
3271 goto retry; 3286 goto retry;
3272 } 3287 }
3273 } 3288 }
3274 3289
3275 /** 3290 /**
3276 * scan_mapping_unevictable_pages - scan an address space for evictable pages 3291 * scan_mapping_unevictable_pages - scan an address space for evictable pages
3277 * @mapping: struct address_space to scan for evictable pages 3292 * @mapping: struct address_space to scan for evictable pages
3278 * 3293 *
3279 * Scan all pages in mapping. Check unevictable pages for 3294 * Scan all pages in mapping. Check unevictable pages for
3280 * evictability and move them to the appropriate zone lru list. 3295 * evictability and move them to the appropriate zone lru list.
3281 */ 3296 */
3282 void scan_mapping_unevictable_pages(struct address_space *mapping) 3297 void scan_mapping_unevictable_pages(struct address_space *mapping)
3283 { 3298 {
3284 pgoff_t next = 0; 3299 pgoff_t next = 0;
3285 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> 3300 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
3286 PAGE_CACHE_SHIFT; 3301 PAGE_CACHE_SHIFT;
3287 struct zone *zone; 3302 struct zone *zone;
3288 struct pagevec pvec; 3303 struct pagevec pvec;
3289 3304
3290 if (mapping->nrpages == 0) 3305 if (mapping->nrpages == 0)
3291 return; 3306 return;
3292 3307
3293 pagevec_init(&pvec, 0); 3308 pagevec_init(&pvec, 0);
3294 while (next < end && 3309 while (next < end &&
3295 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 3310 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
3296 int i; 3311 int i;
3297 int pg_scanned = 0; 3312 int pg_scanned = 0;
3298 3313
3299 zone = NULL; 3314 zone = NULL;
3300 3315
3301 for (i = 0; i < pagevec_count(&pvec); i++) { 3316 for (i = 0; i < pagevec_count(&pvec); i++) {
3302 struct page *page = pvec.pages[i]; 3317 struct page *page = pvec.pages[i];
3303 pgoff_t page_index = page->index; 3318 pgoff_t page_index = page->index;
3304 struct zone *pagezone = page_zone(page); 3319 struct zone *pagezone = page_zone(page);
3305 3320
3306 pg_scanned++; 3321 pg_scanned++;
3307 if (page_index > next) 3322 if (page_index > next)
3308 next = page_index; 3323 next = page_index;
3309 next++; 3324 next++;
3310 3325
3311 if (pagezone != zone) { 3326 if (pagezone != zone) {
3312 if (zone) 3327 if (zone)
3313 spin_unlock_irq(&zone->lru_lock); 3328 spin_unlock_irq(&zone->lru_lock);
3314 zone = pagezone; 3329 zone = pagezone;
3315 spin_lock_irq(&zone->lru_lock); 3330 spin_lock_irq(&zone->lru_lock);
3316 } 3331 }
3317 3332
3318 if (PageLRU(page) && PageUnevictable(page)) 3333 if (PageLRU(page) && PageUnevictable(page))
3319 check_move_unevictable_page(page, zone); 3334 check_move_unevictable_page(page, zone);
3320 } 3335 }
3321 if (zone) 3336 if (zone)
3322 spin_unlock_irq(&zone->lru_lock); 3337 spin_unlock_irq(&zone->lru_lock);
3323 pagevec_release(&pvec); 3338 pagevec_release(&pvec);
3324 3339
3325 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); 3340 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
3326 } 3341 }
3327 3342
3328 } 3343 }
3329 3344
3330 /** 3345 /**
3331 * scan_zone_unevictable_pages - check unevictable list for evictable pages 3346 * scan_zone_unevictable_pages - check unevictable list for evictable pages
3332 * @zone: zone whose unevictable list is to be scanned 3347 * @zone: zone whose unevictable list is to be scanned
3333 * 3348 *
3334 * Scan @zone's unevictable LRU lists to check for pages that have become 3349 * Scan @zone's unevictable LRU lists to check for pages that have become
3335 * evictable. Move those that have to @zone's inactive list where they 3350 * evictable. Move those that have to @zone's inactive list where they
3336 * become candidates for reclaim, unless shrink_inactive_list() decides 3351 * become candidates for reclaim, unless shrink_inactive_list() decides
3337 * to reactivate them. Pages that are still unevictable are rotated 3352 * to reactivate them. Pages that are still unevictable are rotated
3338 * back onto @zone's unevictable list. 3353 * back onto @zone's unevictable list.
3339 */ 3354 */
3340 #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ 3355 #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
3341 static void scan_zone_unevictable_pages(struct zone *zone) 3356 static void scan_zone_unevictable_pages(struct zone *zone)
3342 { 3357 {
3343 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; 3358 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
3344 unsigned long scan; 3359 unsigned long scan;
3345 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); 3360 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
3346 3361
3347 while (nr_to_scan > 0) { 3362 while (nr_to_scan > 0) {
3348 unsigned long batch_size = min(nr_to_scan, 3363 unsigned long batch_size = min(nr_to_scan,
3349 SCAN_UNEVICTABLE_BATCH_SIZE); 3364 SCAN_UNEVICTABLE_BATCH_SIZE);
3350 3365
3351 spin_lock_irq(&zone->lru_lock); 3366 spin_lock_irq(&zone->lru_lock);
3352 for (scan = 0; scan < batch_size; scan++) { 3367 for (scan = 0; scan < batch_size; scan++) {
3353 struct page *page = lru_to_page(l_unevictable); 3368 struct page *page = lru_to_page(l_unevictable);
3354 3369
3355 if (!trylock_page(page)) 3370 if (!trylock_page(page))
3356 continue; 3371 continue;
3357 3372
3358 prefetchw_prev_lru_page(page, l_unevictable, flags); 3373 prefetchw_prev_lru_page(page, l_unevictable, flags);
3359 3374
3360 if (likely(PageLRU(page) && PageUnevictable(page))) 3375 if (likely(PageLRU(page) && PageUnevictable(page)))
3361 check_move_unevictable_page(page, zone); 3376 check_move_unevictable_page(page, zone);
3362 3377
3363 unlock_page(page); 3378 unlock_page(page);
3364 } 3379 }
3365 spin_unlock_irq(&zone->lru_lock); 3380 spin_unlock_irq(&zone->lru_lock);
3366 3381
3367 nr_to_scan -= batch_size; 3382 nr_to_scan -= batch_size;
3368 } 3383 }
3369 } 3384 }
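The SCAN_UNEVICTABLE_BATCH_SIZE loop above is the common bounded-lock-hold pattern: take the lock, process at most a small batch, drop the lock so other CPUs can get at the LRU, and repeat until done. A generic user-space sketch of the same shape, with a pthread mutex standing in for zone->lru_lock and a no-op standing in for the per-page work:

#include <pthread.h>
#include <stdio.h>

#define BATCH_SIZE 16UL	/* arbitrary lock hold batch size */

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void handle_one_item(unsigned long i)
{
	(void)i;	/* placeholder for the real per-item work */
}

static void handle_all(unsigned long nr_items)
{
	while (nr_items > 0) {
		unsigned long batch = nr_items < BATCH_SIZE ? nr_items : BATCH_SIZE;
		unsigned long i;

		pthread_mutex_lock(&list_lock);
		for (i = 0; i < batch; i++)
			handle_one_item(i);
		pthread_mutex_unlock(&list_lock);	/* let contenders in between batches */

		nr_items -= batch;
	}
}

int main(void)
{
	handle_all(100);
	printf("processed all items in batches of %lu\n", BATCH_SIZE);
	return 0;
}

Holding zone->lru_lock for the whole scan would stall allocation and reclaim on other CPUs, which is why the batch is kept deliberately small.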
3370 3385
3371 3386
3372 /** 3387 /**
3373 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages 3388 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
3374 * 3389 *
3375 * A really big hammer: scan all zones' unevictable LRU lists to check for 3390 * A really big hammer: scan all zones' unevictable LRU lists to check for
3376 * pages that have become evictable. Move those back to the zones' 3391 * pages that have become evictable. Move those back to the zones'
3377 * inactive list where they become candidates for reclaim. 3392 * inactive list where they become candidates for reclaim.
3378 * This occurs when, e.g., we have unswappable pages on the unevictable lists, 3393 * This occurs when, e.g., we have unswappable pages on the unevictable lists,
3379 * and we add swap to the system. As such, it runs in the context of a task 3394 * and we add swap to the system. As such, it runs in the context of a task
3380 * that has possibly/probably made some previously unevictable pages 3395 * that has possibly/probably made some previously unevictable pages
3381 * evictable. 3396 * evictable.
3382 */ 3397 */
3383 static void scan_all_zones_unevictable_pages(void) 3398 static void scan_all_zones_unevictable_pages(void)
3384 { 3399 {
3385 struct zone *zone; 3400 struct zone *zone;
3386 3401
3387 for_each_zone(zone) { 3402 for_each_zone(zone) {
3388 scan_zone_unevictable_pages(zone); 3403 scan_zone_unevictable_pages(zone);
3389 } 3404 }
3390 } 3405 }
3391 3406
3392 /* 3407 /*
3393 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of 3408 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
3394 * all nodes' unevictable lists for evictable pages 3409 * all nodes' unevictable lists for evictable pages
3395 */ 3410 */
3396 unsigned long scan_unevictable_pages; 3411 unsigned long scan_unevictable_pages;
3397 3412
3398 int scan_unevictable_handler(struct ctl_table *table, int write, 3413 int scan_unevictable_handler(struct ctl_table *table, int write,
3399 void __user *buffer, 3414 void __user *buffer,
3400 size_t *length, loff_t *ppos) 3415 size_t *length, loff_t *ppos)
3401 { 3416 {
3402 proc_doulongvec_minmax(table, write, buffer, length, ppos); 3417 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3403 3418
3404 if (write && *(unsigned long *)table->data) 3419 if (write && *(unsigned long *)table->data)
3405 scan_all_zones_unevictable_pages(); 3420 scan_all_zones_unevictable_pages();
3406 3421
3407 scan_unevictable_pages = 0; 3422 scan_unevictable_pages = 0;
3408 return 0; 3423 return 0;
3409 } 3424 }
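In practice this handler is driven by writing a non-zero value to /proc/sys/vm/scan_unevictable_pages. A minimal user-space trigger, assuming that sysctl is present on the running kernel:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/scan_unevictable_pages", "w");

	if (!f) {
		perror("scan_unevictable_pages");
		return 1;
	}
	/* Any non-zero value triggers a rescan of all zones' unevictable lists. */
	fputs("1\n", f);
	fclose(f);
	return 0;
}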
3410 3425
3411 #ifdef CONFIG_NUMA 3426 #ifdef CONFIG_NUMA
3412 /* 3427 /*
3413 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3428 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
3414 * a specified node's per zone unevictable lists for evictable pages. 3429 * a specified node's per zone unevictable lists for evictable pages.
3415 */ 3430 */
3416 3431
3417 static ssize_t read_scan_unevictable_node(struct sys_device *dev, 3432 static ssize_t read_scan_unevictable_node(struct sys_device *dev,
3418 struct sysdev_attribute *attr, 3433 struct sysdev_attribute *attr,
3419 char *buf) 3434 char *buf)
3420 { 3435 {
3421 return sprintf(buf, "0\n"); /* always zero; should fit... */ 3436 return sprintf(buf, "0\n"); /* always zero; should fit... */
3422 } 3437 }
3423 3438
3424 static ssize_t write_scan_unevictable_node(struct sys_device *dev, 3439 static ssize_t write_scan_unevictable_node(struct sys_device *dev,
3425 struct sysdev_attribute *attr, 3440 struct sysdev_attribute *attr,
3426 const char *buf, size_t count) 3441 const char *buf, size_t count)
3427 { 3442 {
3428 struct zone *node_zones = NODE_DATA(dev->id)->node_zones; 3443 struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
3429 struct zone *zone; 3444 struct zone *zone;
3430 unsigned long res; 3445 unsigned long res;
3431 unsigned long req = strict_strtoul(buf, 10, &res); 3446 unsigned long req = strict_strtoul(buf, 10, &res);
3432 3447
3433 if (!req) 3448 if (!req)
3434 return 1; /* zero is no-op */ 3449 return 1; /* zero is no-op */
3435 3450
3436 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 3451 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
3437 if (!populated_zone(zone)) 3452 if (!populated_zone(zone))
3438 continue; 3453 continue;
3439 scan_zone_unevictable_pages(zone); 3454 scan_zone_unevictable_pages(zone);
3440 } 3455 }
3441 return 1; 3456 return 1;
3442 } 3457 }
3443 3458
3444 3459
3445 static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, 3460 static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3446 read_scan_unevictable_node, 3461 read_scan_unevictable_node,
3447 write_scan_unevictable_node); 3462 write_scan_unevictable_node);
3448 3463
3449 int scan_unevictable_register_node(struct node *node) 3464 int scan_unevictable_register_node(struct node *node)
3450 { 3465 {
3451 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); 3466 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
3452 } 3467 }
3453 3468
3454 void scan_unevictable_unregister_node(struct node *node) 3469 void scan_unevictable_unregister_node(struct node *node)
3455 { 3470 {
3456 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 3471 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
3457 } 3472 }
3458 #endif 3473 #endif
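The per-node attribute registered above appears in sysfs, conventionally as /sys/devices/system/node/node<N>/scan_unevictable_pages; writing a non-zero value rescans only that node's zones. A small sketch of such a per-node trigger (the path assumes the standard node sysdev layout, and node 0 is just an example):

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/node/node0/scan_unevictable_pages";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);	/* non-zero: rescan this node's unevictable lists */
	fclose(f);
	return 0;
}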
3459 3474