Commit 6ffef5d8bfc16845e25a7ee784426382b5c82c20

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent fa6d2dd222

mm: do not use unnecessary atomic operations when adding pages to the LRU

commit 6fb81a17d21f2a138b8f424af4cf379f2b694060 upstream.

When adding pages to the LRU we clear the active bit unconditionally.
As the page could be reachable from other paths we cannot use unlocked
operations without risk of corruption such as a parallel
mark_page_accessed.  This patch tests if it is necessary to clear the
active flag before using an atomic operation.  This potentially opens a
tiny race when PageActive is checked as mark_page_accessed could be
called after PageActive was checked.  The race already exists but this
patch changes it slightly.  The consequence is that a page may be
promoted to the active list that would have been left on the inactive
list before the patch.  It's too tiny a race and too marginal a
consequence to always use atomic operations for.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
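
For readers who want to see the pattern in isolation, here is a minimal
userspace sketch of the test-before-atomic-clear idea using C11 atomics.
It is an illustration only: PG_ACTIVE, the flags word and the helper
names are invented for this example and are not the kernel's page-flag
API; the real change to lru_cache_add_anon() and lru_cache_add_file()
is in the diff below.

#include <stdatomic.h>
#include <stdio.h>

#define PG_ACTIVE (1UL << 0)

/* Old behaviour: always an atomic RMW, even when the bit is already clear. */
static void clear_active_unconditional(atomic_ulong *flags)
{
        atomic_fetch_and(flags, ~PG_ACTIVE);
}

/*
 * New behaviour: a cheap plain load first, atomic RMW only when needed.
 * A racing "mark accessed" setter can still slip in after the load; the
 * window is tiny and the worst case is that the bit stays set, i.e. the
 * page ends up on the active list.
 */
static void clear_active_if_set(atomic_ulong *flags)
{
        if (atomic_load(flags) & PG_ACTIVE)
                atomic_fetch_and(flags, ~PG_ACTIVE);
}

int main(void)
{
        atomic_ulong flags = PG_ACTIVE;

        clear_active_if_set(&flags);        /* bit set: one atomic op */
        clear_active_if_set(&flags);        /* bit clear: no atomic op at all */
        clear_active_unconditional(&flags); /* always pays for the atomic op */

        printf("flags = %#lx\n", atomic_load(&flags));
        return 0;
}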

Showing 1 changed file with 4 additions and 2 deletions:

1 /* 1 /*
2 * linux/mm/swap.c 2 * linux/mm/swap.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * This file contains the default values for the operation of the 8 * This file contains the default values for the operation of the
9 * Linux VM subsystem. Fine-tuning documentation can be found in 9 * Linux VM subsystem. Fine-tuning documentation can be found in
10 * Documentation/sysctl/vm.txt. 10 * Documentation/sysctl/vm.txt.
11 * Started 18.12.91 11 * Started 18.12.91
12 * Swap aging added 23.2.95, Stephen Tweedie. 12 * Swap aging added 23.2.95, Stephen Tweedie.
13 * Buffermem limits added 12.3.98, Rik van Riel. 13 * Buffermem limits added 12.3.98, Rik van Riel.
14 */ 14 */
15 15
16 #include <linux/mm.h> 16 #include <linux/mm.h>
17 #include <linux/sched.h> 17 #include <linux/sched.h>
18 #include <linux/kernel_stat.h> 18 #include <linux/kernel_stat.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/mman.h> 20 #include <linux/mman.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/pagevec.h> 22 #include <linux/pagevec.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/export.h> 24 #include <linux/export.h>
25 #include <linux/mm_inline.h> 25 #include <linux/mm_inline.h>
26 #include <linux/percpu_counter.h> 26 #include <linux/percpu_counter.h>
27 #include <linux/percpu.h> 27 #include <linux/percpu.h>
28 #include <linux/cpu.h> 28 #include <linux/cpu.h>
29 #include <linux/notifier.h> 29 #include <linux/notifier.h>
30 #include <linux/backing-dev.h> 30 #include <linux/backing-dev.h>
31 #include <linux/memcontrol.h> 31 #include <linux/memcontrol.h>
32 #include <linux/gfp.h> 32 #include <linux/gfp.h>
33 #include <linux/uio.h> 33 #include <linux/uio.h>
34 #include <linux/hugetlb.h> 34 #include <linux/hugetlb.h>
35 35
36 #include "internal.h" 36 #include "internal.h"
37 37
38 #define CREATE_TRACE_POINTS 38 #define CREATE_TRACE_POINTS
39 #include <trace/events/pagemap.h> 39 #include <trace/events/pagemap.h>
40 40
41 /* How many pages do we try to swap or page in/out together? */ 41 /* How many pages do we try to swap or page in/out together? */
42 int page_cluster; 42 int page_cluster;
43 43
44 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); 44 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
45 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 45 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
46 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); 46 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
47 47
48 /* 48 /*
49 * This path almost never happens for VM activity - pages are normally 49 * This path almost never happens for VM activity - pages are normally
50 * freed via pagevecs. But it gets used by networking. 50 * freed via pagevecs. But it gets used by networking.
51 */ 51 */
52 static void __page_cache_release(struct page *page) 52 static void __page_cache_release(struct page *page)
53 { 53 {
54 if (PageLRU(page)) { 54 if (PageLRU(page)) {
55 struct zone *zone = page_zone(page); 55 struct zone *zone = page_zone(page);
56 struct lruvec *lruvec; 56 struct lruvec *lruvec;
57 unsigned long flags; 57 unsigned long flags;
58 58
59 spin_lock_irqsave(&zone->lru_lock, flags); 59 spin_lock_irqsave(&zone->lru_lock, flags);
60 lruvec = mem_cgroup_page_lruvec(page, zone); 60 lruvec = mem_cgroup_page_lruvec(page, zone);
61 VM_BUG_ON(!PageLRU(page)); 61 VM_BUG_ON(!PageLRU(page));
62 __ClearPageLRU(page); 62 __ClearPageLRU(page);
63 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 63 del_page_from_lru_list(page, lruvec, page_off_lru(page));
64 spin_unlock_irqrestore(&zone->lru_lock, flags); 64 spin_unlock_irqrestore(&zone->lru_lock, flags);
65 } 65 }
66 } 66 }
67 67
68 static void __put_single_page(struct page *page) 68 static void __put_single_page(struct page *page)
69 { 69 {
70 __page_cache_release(page); 70 __page_cache_release(page);
71 free_hot_cold_page(page, false); 71 free_hot_cold_page(page, false);
72 } 72 }
73 73
74 static void __put_compound_page(struct page *page) 74 static void __put_compound_page(struct page *page)
75 { 75 {
76 compound_page_dtor *dtor; 76 compound_page_dtor *dtor;
77 77
78 __page_cache_release(page); 78 __page_cache_release(page);
79 dtor = get_compound_page_dtor(page); 79 dtor = get_compound_page_dtor(page);
80 (*dtor)(page); 80 (*dtor)(page);
81 } 81 }
82 82
83 static void put_compound_page(struct page *page) 83 static void put_compound_page(struct page *page)
84 { 84 {
85 if (unlikely(PageTail(page))) { 85 if (unlikely(PageTail(page))) {
86 /* __split_huge_page_refcount can run under us */ 86 /* __split_huge_page_refcount can run under us */
87 struct page *page_head = compound_head(page); 87 struct page *page_head = compound_head(page);
88 88
89 if (likely(page != page_head && 89 if (likely(page != page_head &&
90 get_page_unless_zero(page_head))) { 90 get_page_unless_zero(page_head))) {
91 unsigned long flags; 91 unsigned long flags;
92 92
93 /* 93 /*
94 * THP can not break up slab pages so avoid taking 94 * THP can not break up slab pages so avoid taking
95 * compound_lock(). Slab performs non-atomic bit ops 95 * compound_lock(). Slab performs non-atomic bit ops
96 * on page->flags for better performance. In particular 96 * on page->flags for better performance. In particular
97 * slab_unlock() in slub used to be a hot path. It is 97 * slab_unlock() in slub used to be a hot path. It is
98 * still hot on arches that do not support 98 * still hot on arches that do not support
99 * this_cpu_cmpxchg_double(). 99 * this_cpu_cmpxchg_double().
100 */ 100 */
101 if (PageSlab(page_head) || PageHeadHuge(page_head)) { 101 if (PageSlab(page_head) || PageHeadHuge(page_head)) {
102 if (likely(PageTail(page))) { 102 if (likely(PageTail(page))) {
103 /* 103 /*
104 * __split_huge_page_refcount 104 * __split_huge_page_refcount
105 * cannot race here. 105 * cannot race here.
106 */ 106 */
107 VM_BUG_ON(!PageHead(page_head)); 107 VM_BUG_ON(!PageHead(page_head));
108 atomic_dec(&page->_mapcount); 108 atomic_dec(&page->_mapcount);
109 if (put_page_testzero(page_head)) 109 if (put_page_testzero(page_head))
110 VM_BUG_ON(1); 110 VM_BUG_ON(1);
111 if (put_page_testzero(page_head)) 111 if (put_page_testzero(page_head))
112 __put_compound_page(page_head); 112 __put_compound_page(page_head);
113 return; 113 return;
114 } else 114 } else
115 /* 115 /*
116 * __split_huge_page_refcount 116 * __split_huge_page_refcount
117 * run before us, "page" was a 117 * run before us, "page" was a
118 * THP tail. The split 118 * THP tail. The split
119 * page_head has been freed 119 * page_head has been freed
120 * and reallocated as slab or 120 * and reallocated as slab or
121 * hugetlbfs page of smaller 121 * hugetlbfs page of smaller
122 * order (only possible if 122 * order (only possible if
123 * reallocated as slab on 123 * reallocated as slab on
124 * x86). 124 * x86).
125 */ 125 */
126 goto skip_lock; 126 goto skip_lock;
127 } 127 }
128 /* 128 /*
129 * page_head wasn't a dangling pointer but it 129 * page_head wasn't a dangling pointer but it
130 * may not be a head page anymore by the time 130 * may not be a head page anymore by the time
131 * we obtain the lock. That is ok as long as it 131 * we obtain the lock. That is ok as long as it
132 * can't be freed from under us. 132 * can't be freed from under us.
133 */ 133 */
134 flags = compound_lock_irqsave(page_head); 134 flags = compound_lock_irqsave(page_head);
135 if (unlikely(!PageTail(page))) { 135 if (unlikely(!PageTail(page))) {
136 /* __split_huge_page_refcount run before us */ 136 /* __split_huge_page_refcount run before us */
137 compound_unlock_irqrestore(page_head, flags); 137 compound_unlock_irqrestore(page_head, flags);
138 skip_lock: 138 skip_lock:
139 if (put_page_testzero(page_head)) { 139 if (put_page_testzero(page_head)) {
140 /* 140 /*
141 * The head page may have been 141 * The head page may have been
142 * freed and reallocated as a 142 * freed and reallocated as a
143 * compound page of smaller 143 * compound page of smaller
144 * order and then freed again. 144 * order and then freed again.
145 * All we know is that it 145 * All we know is that it
146 * cannot have become: a THP 146 * cannot have become: a THP
147 * page, a compound page of 147 * page, a compound page of
148 * higher order, a tail page. 148 * higher order, a tail page.
149 * That is because we still 149 * That is because we still
150 * hold the refcount of the 150 * hold the refcount of the
151 * split THP tail and 151 * split THP tail and
152 * page_head was the THP head 152 * page_head was the THP head
153 * before the split. 153 * before the split.
154 */ 154 */
155 if (PageHead(page_head)) 155 if (PageHead(page_head))
156 __put_compound_page(page_head); 156 __put_compound_page(page_head);
157 else 157 else
158 __put_single_page(page_head); 158 __put_single_page(page_head);
159 } 159 }
160 out_put_single: 160 out_put_single:
161 if (put_page_testzero(page)) 161 if (put_page_testzero(page))
162 __put_single_page(page); 162 __put_single_page(page);
163 return; 163 return;
164 } 164 }
165 VM_BUG_ON(page_head != page->first_page); 165 VM_BUG_ON(page_head != page->first_page);
166 /* 166 /*
167 * We can release the refcount taken by 167 * We can release the refcount taken by
168 * get_page_unless_zero() now that 168 * get_page_unless_zero() now that
169 * __split_huge_page_refcount() is blocked on 169 * __split_huge_page_refcount() is blocked on
170 * the compound_lock. 170 * the compound_lock.
171 */ 171 */
172 if (put_page_testzero(page_head)) 172 if (put_page_testzero(page_head))
173 VM_BUG_ON(1); 173 VM_BUG_ON(1);
174 /* __split_huge_page_refcount will wait now */ 174 /* __split_huge_page_refcount will wait now */
175 VM_BUG_ON(page_mapcount(page) <= 0); 175 VM_BUG_ON(page_mapcount(page) <= 0);
176 atomic_dec(&page->_mapcount); 176 atomic_dec(&page->_mapcount);
177 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 177 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
178 VM_BUG_ON(atomic_read(&page->_count) != 0); 178 VM_BUG_ON(atomic_read(&page->_count) != 0);
179 compound_unlock_irqrestore(page_head, flags); 179 compound_unlock_irqrestore(page_head, flags);
180 180
181 if (put_page_testzero(page_head)) { 181 if (put_page_testzero(page_head)) {
182 if (PageHead(page_head)) 182 if (PageHead(page_head))
183 __put_compound_page(page_head); 183 __put_compound_page(page_head);
184 else 184 else
185 __put_single_page(page_head); 185 __put_single_page(page_head);
186 } 186 }
187 } else { 187 } else {
188 /* page_head is a dangling pointer */ 188 /* page_head is a dangling pointer */
189 VM_BUG_ON(PageTail(page)); 189 VM_BUG_ON(PageTail(page));
190 goto out_put_single; 190 goto out_put_single;
191 } 191 }
192 } else if (put_page_testzero(page)) { 192 } else if (put_page_testzero(page)) {
193 if (PageHead(page)) 193 if (PageHead(page))
194 __put_compound_page(page); 194 __put_compound_page(page);
195 else 195 else
196 __put_single_page(page); 196 __put_single_page(page);
197 } 197 }
198 } 198 }
199 199
200 void put_page(struct page *page) 200 void put_page(struct page *page)
201 { 201 {
202 if (unlikely(PageCompound(page))) 202 if (unlikely(PageCompound(page)))
203 put_compound_page(page); 203 put_compound_page(page);
204 else if (put_page_testzero(page)) 204 else if (put_page_testzero(page))
205 __put_single_page(page); 205 __put_single_page(page);
206 } 206 }
207 EXPORT_SYMBOL(put_page); 207 EXPORT_SYMBOL(put_page);
208 208
209 /* 209 /*
210 * This function is exported but must not be called by anything other 210 * This function is exported but must not be called by anything other
211 * than get_page(). It implements the slow path of get_page(). 211 * than get_page(). It implements the slow path of get_page().
212 */ 212 */
213 bool __get_page_tail(struct page *page) 213 bool __get_page_tail(struct page *page)
214 { 214 {
215 /* 215 /*
216 * This takes care of get_page() if run on a tail page 216 * This takes care of get_page() if run on a tail page
217 * returned by one of the get_user_pages/follow_page variants. 217 * returned by one of the get_user_pages/follow_page variants.
218 * get_user_pages/follow_page itself doesn't need the compound 218 * get_user_pages/follow_page itself doesn't need the compound
219 * lock because it runs __get_page_tail_foll() under the 219 * lock because it runs __get_page_tail_foll() under the
220 * proper PT lock that already serializes against 220 * proper PT lock that already serializes against
221 * split_huge_page(). 221 * split_huge_page().
222 */ 222 */
223 unsigned long flags; 223 unsigned long flags;
224 bool got = false; 224 bool got = false;
225 struct page *page_head = compound_head(page); 225 struct page *page_head = compound_head(page);
226 226
227 if (likely(page != page_head && get_page_unless_zero(page_head))) { 227 if (likely(page != page_head && get_page_unless_zero(page_head))) {
228 /* Ref to put_compound_page() comment. */ 228 /* Ref to put_compound_page() comment. */
229 if (PageSlab(page_head) || PageHeadHuge(page_head)) { 229 if (PageSlab(page_head) || PageHeadHuge(page_head)) {
230 if (likely(PageTail(page))) { 230 if (likely(PageTail(page))) {
231 /* 231 /*
232 * This is a hugetlbfs page or a slab 232 * This is a hugetlbfs page or a slab
233 * page. __split_huge_page_refcount 233 * page. __split_huge_page_refcount
234 * cannot race here. 234 * cannot race here.
235 */ 235 */
236 VM_BUG_ON(!PageHead(page_head)); 236 VM_BUG_ON(!PageHead(page_head));
237 __get_page_tail_foll(page, false); 237 __get_page_tail_foll(page, false);
238 return true; 238 return true;
239 } else { 239 } else {
240 /* 240 /*
241 * __split_huge_page_refcount run 241 * __split_huge_page_refcount run
242 * before us, "page" was a THP 242 * before us, "page" was a THP
243 * tail. The split page_head has been 243 * tail. The split page_head has been
244 * freed and reallocated as slab or 244 * freed and reallocated as slab or
245 * hugetlbfs page of smaller order 245 * hugetlbfs page of smaller order
246 * (only possible if reallocated as 246 * (only possible if reallocated as
247 * slab on x86). 247 * slab on x86).
248 */ 248 */
249 put_page(page_head); 249 put_page(page_head);
250 return false; 250 return false;
251 } 251 }
252 } 252 }
253 253
254 /* 254 /*
255 * page_head wasn't a dangling pointer but it 255 * page_head wasn't a dangling pointer but it
256 * may not be a head page anymore by the time 256 * may not be a head page anymore by the time
257 * we obtain the lock. That is ok as long as it 257 * we obtain the lock. That is ok as long as it
258 * can't be freed from under us. 258 * can't be freed from under us.
259 */ 259 */
260 flags = compound_lock_irqsave(page_head); 260 flags = compound_lock_irqsave(page_head);
261 /* here __split_huge_page_refcount won't run anymore */ 261 /* here __split_huge_page_refcount won't run anymore */
262 if (likely(PageTail(page))) { 262 if (likely(PageTail(page))) {
263 __get_page_tail_foll(page, false); 263 __get_page_tail_foll(page, false);
264 got = true; 264 got = true;
265 } 265 }
266 compound_unlock_irqrestore(page_head, flags); 266 compound_unlock_irqrestore(page_head, flags);
267 if (unlikely(!got)) 267 if (unlikely(!got))
268 put_page(page_head); 268 put_page(page_head);
269 } 269 }
270 return got; 270 return got;
271 } 271 }
272 EXPORT_SYMBOL(__get_page_tail); 272 EXPORT_SYMBOL(__get_page_tail);
273 273
274 /** 274 /**
275 * put_pages_list() - release a list of pages 275 * put_pages_list() - release a list of pages
276 * @pages: list of pages threaded on page->lru 276 * @pages: list of pages threaded on page->lru
277 * 277 *
278 * Release a list of pages which are strung together on page.lru. Currently 278 * Release a list of pages which are strung together on page.lru. Currently
279 * used by read_cache_pages() and related error recovery code. 279 * used by read_cache_pages() and related error recovery code.
280 */ 280 */
281 void put_pages_list(struct list_head *pages) 281 void put_pages_list(struct list_head *pages)
282 { 282 {
283 while (!list_empty(pages)) { 283 while (!list_empty(pages)) {
284 struct page *victim; 284 struct page *victim;
285 285
286 victim = list_entry(pages->prev, struct page, lru); 286 victim = list_entry(pages->prev, struct page, lru);
287 list_del(&victim->lru); 287 list_del(&victim->lru);
288 page_cache_release(victim); 288 page_cache_release(victim);
289 } 289 }
290 } 290 }
291 EXPORT_SYMBOL(put_pages_list); 291 EXPORT_SYMBOL(put_pages_list);
292 292
293 /* 293 /*
294 * get_kernel_pages() - pin kernel pages in memory 294 * get_kernel_pages() - pin kernel pages in memory
295 * @kiov: An array of struct kvec structures 295 * @kiov: An array of struct kvec structures
296 * @nr_segs: number of segments to pin 296 * @nr_segs: number of segments to pin
297 * @write: pinning for read/write, currently ignored 297 * @write: pinning for read/write, currently ignored
298 * @pages: array that receives pointers to the pages pinned. 298 * @pages: array that receives pointers to the pages pinned.
299 * Should be at least nr_segs long. 299 * Should be at least nr_segs long.
300 * 300 *
301 * Returns number of pages pinned. This may be fewer than the number 301 * Returns number of pages pinned. This may be fewer than the number
302 * requested. If nr_pages is 0 or negative, returns 0. If no pages 302 * requested. If nr_pages is 0 or negative, returns 0. If no pages
303 * were pinned, returns -errno. Each page returned must be released 303 * were pinned, returns -errno. Each page returned must be released
304 * with a put_page() call when it is finished with. 304 * with a put_page() call when it is finished with.
305 */ 305 */
306 int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, 306 int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
307 struct page **pages) 307 struct page **pages)
308 { 308 {
309 int seg; 309 int seg;
310 310
311 for (seg = 0; seg < nr_segs; seg++) { 311 for (seg = 0; seg < nr_segs; seg++) {
312 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) 312 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
313 return seg; 313 return seg;
314 314
315 pages[seg] = kmap_to_page(kiov[seg].iov_base); 315 pages[seg] = kmap_to_page(kiov[seg].iov_base);
316 page_cache_get(pages[seg]); 316 page_cache_get(pages[seg]);
317 } 317 }
318 318
319 return seg; 319 return seg;
320 } 320 }
321 EXPORT_SYMBOL_GPL(get_kernel_pages); 321 EXPORT_SYMBOL_GPL(get_kernel_pages);
322 322
323 /* 323 /*
324 * get_kernel_page() - pin a kernel page in memory 324 * get_kernel_page() - pin a kernel page in memory
325 * @start: starting kernel address 325 * @start: starting kernel address
326 * @write: pinning for read/write, currently ignored 326 * @write: pinning for read/write, currently ignored
327 * @pages: array that receives pointer to the page pinned. 327 * @pages: array that receives pointer to the page pinned.
328 * Must be at least nr_segs long. 328 * Must be at least nr_segs long.
329 * 329 *
330 * Returns 1 if page is pinned. If the page was not pinned, returns 330 * Returns 1 if page is pinned. If the page was not pinned, returns
331 * -errno. The page returned must be released with a put_page() call 331 * -errno. The page returned must be released with a put_page() call
332 * when it is finished with. 332 * when it is finished with.
333 */ 333 */
334 int get_kernel_page(unsigned long start, int write, struct page **pages) 334 int get_kernel_page(unsigned long start, int write, struct page **pages)
335 { 335 {
336 const struct kvec kiov = { 336 const struct kvec kiov = {
337 .iov_base = (void *)start, 337 .iov_base = (void *)start,
338 .iov_len = PAGE_SIZE 338 .iov_len = PAGE_SIZE
339 }; 339 };
340 340
341 return get_kernel_pages(&kiov, 1, write, pages); 341 return get_kernel_pages(&kiov, 1, write, pages);
342 } 342 }
343 EXPORT_SYMBOL_GPL(get_kernel_page); 343 EXPORT_SYMBOL_GPL(get_kernel_page);
344 344
345 static void pagevec_lru_move_fn(struct pagevec *pvec, 345 static void pagevec_lru_move_fn(struct pagevec *pvec,
346 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), 346 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
347 void *arg) 347 void *arg)
348 { 348 {
349 int i; 349 int i;
350 struct zone *zone = NULL; 350 struct zone *zone = NULL;
351 struct lruvec *lruvec; 351 struct lruvec *lruvec;
352 unsigned long flags = 0; 352 unsigned long flags = 0;
353 353
354 for (i = 0; i < pagevec_count(pvec); i++) { 354 for (i = 0; i < pagevec_count(pvec); i++) {
355 struct page *page = pvec->pages[i]; 355 struct page *page = pvec->pages[i];
356 struct zone *pagezone = page_zone(page); 356 struct zone *pagezone = page_zone(page);
357 357
358 if (pagezone != zone) { 358 if (pagezone != zone) {
359 if (zone) 359 if (zone)
360 spin_unlock_irqrestore(&zone->lru_lock, flags); 360 spin_unlock_irqrestore(&zone->lru_lock, flags);
361 zone = pagezone; 361 zone = pagezone;
362 spin_lock_irqsave(&zone->lru_lock, flags); 362 spin_lock_irqsave(&zone->lru_lock, flags);
363 } 363 }
364 364
365 lruvec = mem_cgroup_page_lruvec(page, zone); 365 lruvec = mem_cgroup_page_lruvec(page, zone);
366 (*move_fn)(page, lruvec, arg); 366 (*move_fn)(page, lruvec, arg);
367 } 367 }
368 if (zone) 368 if (zone)
369 spin_unlock_irqrestore(&zone->lru_lock, flags); 369 spin_unlock_irqrestore(&zone->lru_lock, flags);
370 release_pages(pvec->pages, pvec->nr, pvec->cold); 370 release_pages(pvec->pages, pvec->nr, pvec->cold);
371 pagevec_reinit(pvec); 371 pagevec_reinit(pvec);
372 } 372 }
373 373
374 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, 374 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
375 void *arg) 375 void *arg)
376 { 376 {
377 int *pgmoved = arg; 377 int *pgmoved = arg;
378 378
379 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 379 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
380 enum lru_list lru = page_lru_base_type(page); 380 enum lru_list lru = page_lru_base_type(page);
381 list_move_tail(&page->lru, &lruvec->lists[lru]); 381 list_move_tail(&page->lru, &lruvec->lists[lru]);
382 (*pgmoved)++; 382 (*pgmoved)++;
383 } 383 }
384 } 384 }
385 385
386 /* 386 /*
387 * pagevec_move_tail() must be called with IRQ disabled. 387 * pagevec_move_tail() must be called with IRQ disabled.
388 * Otherwise this may cause nasty races. 388 * Otherwise this may cause nasty races.
389 */ 389 */
390 static void pagevec_move_tail(struct pagevec *pvec) 390 static void pagevec_move_tail(struct pagevec *pvec)
391 { 391 {
392 int pgmoved = 0; 392 int pgmoved = 0;
393 393
394 pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); 394 pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
395 __count_vm_events(PGROTATED, pgmoved); 395 __count_vm_events(PGROTATED, pgmoved);
396 } 396 }
397 397
398 /* 398 /*
399 * Writeback is about to end against a page which has been marked for immediate 399 * Writeback is about to end against a page which has been marked for immediate
400 * reclaim. If it still appears to be reclaimable, move it to the tail of the 400 * reclaim. If it still appears to be reclaimable, move it to the tail of the
401 * inactive list. 401 * inactive list.
402 */ 402 */
403 void rotate_reclaimable_page(struct page *page) 403 void rotate_reclaimable_page(struct page *page)
404 { 404 {
405 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 405 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
406 !PageUnevictable(page) && PageLRU(page)) { 406 !PageUnevictable(page) && PageLRU(page)) {
407 struct pagevec *pvec; 407 struct pagevec *pvec;
408 unsigned long flags; 408 unsigned long flags;
409 409
410 page_cache_get(page); 410 page_cache_get(page);
411 local_irq_save(flags); 411 local_irq_save(flags);
412 pvec = &__get_cpu_var(lru_rotate_pvecs); 412 pvec = &__get_cpu_var(lru_rotate_pvecs);
413 if (!pagevec_add(pvec, page)) 413 if (!pagevec_add(pvec, page))
414 pagevec_move_tail(pvec); 414 pagevec_move_tail(pvec);
415 local_irq_restore(flags); 415 local_irq_restore(flags);
416 } 416 }
417 } 417 }
418 418
419 static void update_page_reclaim_stat(struct lruvec *lruvec, 419 static void update_page_reclaim_stat(struct lruvec *lruvec,
420 int file, int rotated) 420 int file, int rotated)
421 { 421 {
422 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 422 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
423 423
424 reclaim_stat->recent_scanned[file]++; 424 reclaim_stat->recent_scanned[file]++;
425 if (rotated) 425 if (rotated)
426 reclaim_stat->recent_rotated[file]++; 426 reclaim_stat->recent_rotated[file]++;
427 } 427 }
428 428
429 static void __activate_page(struct page *page, struct lruvec *lruvec, 429 static void __activate_page(struct page *page, struct lruvec *lruvec,
430 void *arg) 430 void *arg)
431 { 431 {
432 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 432 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
433 int file = page_is_file_cache(page); 433 int file = page_is_file_cache(page);
434 int lru = page_lru_base_type(page); 434 int lru = page_lru_base_type(page);
435 435
436 del_page_from_lru_list(page, lruvec, lru); 436 del_page_from_lru_list(page, lruvec, lru);
437 SetPageActive(page); 437 SetPageActive(page);
438 lru += LRU_ACTIVE; 438 lru += LRU_ACTIVE;
439 add_page_to_lru_list(page, lruvec, lru); 439 add_page_to_lru_list(page, lruvec, lru);
440 trace_mm_lru_activate(page, page_to_pfn(page)); 440 trace_mm_lru_activate(page, page_to_pfn(page));
441 441
442 __count_vm_event(PGACTIVATE); 442 __count_vm_event(PGACTIVATE);
443 update_page_reclaim_stat(lruvec, file, 1); 443 update_page_reclaim_stat(lruvec, file, 1);
444 } 444 }
445 } 445 }
446 446
447 #ifdef CONFIG_SMP 447 #ifdef CONFIG_SMP
448 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); 448 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
449 449
450 static void activate_page_drain(int cpu) 450 static void activate_page_drain(int cpu)
451 { 451 {
452 struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); 452 struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
453 453
454 if (pagevec_count(pvec)) 454 if (pagevec_count(pvec))
455 pagevec_lru_move_fn(pvec, __activate_page, NULL); 455 pagevec_lru_move_fn(pvec, __activate_page, NULL);
456 } 456 }
457 457
458 static bool need_activate_page_drain(int cpu) 458 static bool need_activate_page_drain(int cpu)
459 { 459 {
460 return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; 460 return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
461 } 461 }
462 462
463 void activate_page(struct page *page) 463 void activate_page(struct page *page)
464 { 464 {
465 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 465 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
466 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); 466 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
467 467
468 page_cache_get(page); 468 page_cache_get(page);
469 if (!pagevec_add(pvec, page)) 469 if (!pagevec_add(pvec, page))
470 pagevec_lru_move_fn(pvec, __activate_page, NULL); 470 pagevec_lru_move_fn(pvec, __activate_page, NULL);
471 put_cpu_var(activate_page_pvecs); 471 put_cpu_var(activate_page_pvecs);
472 } 472 }
473 } 473 }
474 474
475 #else 475 #else
476 static inline void activate_page_drain(int cpu) 476 static inline void activate_page_drain(int cpu)
477 { 477 {
478 } 478 }
479 479
480 static bool need_activate_page_drain(int cpu) 480 static bool need_activate_page_drain(int cpu)
481 { 481 {
482 return false; 482 return false;
483 } 483 }
484 484
485 void activate_page(struct page *page) 485 void activate_page(struct page *page)
486 { 486 {
487 struct zone *zone = page_zone(page); 487 struct zone *zone = page_zone(page);
488 488
489 spin_lock_irq(&zone->lru_lock); 489 spin_lock_irq(&zone->lru_lock);
490 __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); 490 __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
491 spin_unlock_irq(&zone->lru_lock); 491 spin_unlock_irq(&zone->lru_lock);
492 } 492 }
493 #endif 493 #endif
494 494
495 static void __lru_cache_activate_page(struct page *page) 495 static void __lru_cache_activate_page(struct page *page)
496 { 496 {
497 struct pagevec *pvec = &get_cpu_var(lru_add_pvec); 497 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
498 int i; 498 int i;
499 499
500 /* 500 /*
501 * Search backwards on the optimistic assumption that the page being 501 * Search backwards on the optimistic assumption that the page being
502 * activated has just been added to this pagevec. Note that only 502 * activated has just been added to this pagevec. Note that only
503 * the local pagevec is examined as a !PageLRU page could be in the 503 * the local pagevec is examined as a !PageLRU page could be in the
504 * process of being released, reclaimed, migrated or on a remote 504 * process of being released, reclaimed, migrated or on a remote
505 * pagevec that is currently being drained. Furthermore, marking 505 * pagevec that is currently being drained. Furthermore, marking
506 * a remote pagevec's page PageActive potentially hits a race where 506 * a remote pagevec's page PageActive potentially hits a race where
507 * a page is marked PageActive just after it is added to the inactive 507 * a page is marked PageActive just after it is added to the inactive
508 * list causing accounting errors and BUG_ON checks to trigger. 508 * list causing accounting errors and BUG_ON checks to trigger.
509 */ 509 */
510 for (i = pagevec_count(pvec) - 1; i >= 0; i--) { 510 for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
511 struct page *pagevec_page = pvec->pages[i]; 511 struct page *pagevec_page = pvec->pages[i];
512 512
513 if (pagevec_page == page) { 513 if (pagevec_page == page) {
514 SetPageActive(page); 514 SetPageActive(page);
515 break; 515 break;
516 } 516 }
517 } 517 }
518 518
519 put_cpu_var(lru_add_pvec); 519 put_cpu_var(lru_add_pvec);
520 } 520 }
521 521
522 /* 522 /*
523 * Mark a page as having seen activity. 523 * Mark a page as having seen activity.
524 * 524 *
525 * inactive,unreferenced -> inactive,referenced 525 * inactive,unreferenced -> inactive,referenced
526 * inactive,referenced -> active,unreferenced 526 * inactive,referenced -> active,unreferenced
527 * active,unreferenced -> active,referenced 527 * active,unreferenced -> active,referenced
528 */ 528 */
529 void mark_page_accessed(struct page *page) 529 void mark_page_accessed(struct page *page)
530 { 530 {
531 if (!PageActive(page) && !PageUnevictable(page) && 531 if (!PageActive(page) && !PageUnevictable(page) &&
532 PageReferenced(page)) { 532 PageReferenced(page)) {
533 533
534 /* 534 /*
535 * If the page is on the LRU, queue it for activation via 535 * If the page is on the LRU, queue it for activation via
536 * activate_page_pvecs. Otherwise, assume the page is on a 536 * activate_page_pvecs. Otherwise, assume the page is on a
537 * pagevec, mark it active and it'll be moved to the active 537 * pagevec, mark it active and it'll be moved to the active
538 * LRU on the next drain. 538 * LRU on the next drain.
539 */ 539 */
540 if (PageLRU(page)) 540 if (PageLRU(page))
541 activate_page(page); 541 activate_page(page);
542 else 542 else
543 __lru_cache_activate_page(page); 543 __lru_cache_activate_page(page);
544 ClearPageReferenced(page); 544 ClearPageReferenced(page);
545 } else if (!PageReferenced(page)) { 545 } else if (!PageReferenced(page)) {
546 SetPageReferenced(page); 546 SetPageReferenced(page);
547 } 547 }
548 } 548 }
549 EXPORT_SYMBOL(mark_page_accessed); 549 EXPORT_SYMBOL(mark_page_accessed);
550 550
551 static void __lru_cache_add(struct page *page) 551 static void __lru_cache_add(struct page *page)
552 { 552 {
553 struct pagevec *pvec = &get_cpu_var(lru_add_pvec); 553 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
554 554
555 page_cache_get(page); 555 page_cache_get(page);
556 if (!pagevec_space(pvec)) 556 if (!pagevec_space(pvec))
557 __pagevec_lru_add(pvec); 557 __pagevec_lru_add(pvec);
558 pagevec_add(pvec, page); 558 pagevec_add(pvec, page);
559 put_cpu_var(lru_add_pvec); 559 put_cpu_var(lru_add_pvec);
560 } 560 }
561 561
562 /** 562 /**
563 * lru_cache_add: add a page to the page lists 563 * lru_cache_add: add a page to the page lists
564 * @page: the page to add 564 * @page: the page to add
565 */ 565 */
566 void lru_cache_add_anon(struct page *page) 566 void lru_cache_add_anon(struct page *page)
567 { 567 {
568 ClearPageActive(page); 568 if (PageActive(page))
569 ClearPageActive(page);
569 __lru_cache_add(page); 570 __lru_cache_add(page);
570 } 571 }
571 572
572 void lru_cache_add_file(struct page *page) 573 void lru_cache_add_file(struct page *page)
573 { 574 {
574 ClearPageActive(page); 575 if (PageActive(page))
576 ClearPageActive(page);
575 __lru_cache_add(page); 577 __lru_cache_add(page);
576 } 578 }
577 EXPORT_SYMBOL(lru_cache_add_file); 579 EXPORT_SYMBOL(lru_cache_add_file);
578 580
579 /** 581 /**
580 * lru_cache_add - add a page to a page list 582 * lru_cache_add - add a page to a page list
581 * @page: the page to be added to the LRU. 583 * @page: the page to be added to the LRU.
582 * 584 *
583 * Queue the page for addition to the LRU via pagevec. The decision on whether 585 * Queue the page for addition to the LRU via pagevec. The decision on whether
584 * to add the page to the [in]active [file|anon] list is deferred until the 586 * to add the page to the [in]active [file|anon] list is deferred until the
585 * pagevec is drained. This gives a chance for the caller of lru_cache_add() 587 * pagevec is drained. This gives a chance for the caller of lru_cache_add()
586 * have the page added to the active list using mark_page_accessed(). 588 * have the page added to the active list using mark_page_accessed().
587 */ 589 */
588 void lru_cache_add(struct page *page) 590 void lru_cache_add(struct page *page)
589 { 591 {
590 VM_BUG_ON(PageActive(page) && PageUnevictable(page)); 592 VM_BUG_ON(PageActive(page) && PageUnevictable(page));
591 VM_BUG_ON(PageLRU(page)); 593 VM_BUG_ON(PageLRU(page));
592 __lru_cache_add(page); 594 __lru_cache_add(page);
593 } 595 }
594 596
595 /** 597 /**
596 * add_page_to_unevictable_list - add a page to the unevictable list 598 * add_page_to_unevictable_list - add a page to the unevictable list
597 * @page: the page to be added to the unevictable list 599 * @page: the page to be added to the unevictable list
598 * 600 *
599 * Add page directly to its zone's unevictable list. To avoid races with 601 * Add page directly to its zone's unevictable list. To avoid races with
600 * tasks that might be making the page evictable, through eg. munlock, 602 * tasks that might be making the page evictable, through eg. munlock,
601 * munmap or exit, while it's not on the lru, we want to add the page 603 * munmap or exit, while it's not on the lru, we want to add the page
602 * while it's locked or otherwise "invisible" to other tasks. This is 604 * while it's locked or otherwise "invisible" to other tasks. This is
603 * difficult to do when using the pagevec cache, so bypass that. 605 * difficult to do when using the pagevec cache, so bypass that.
604 */ 606 */
605 void add_page_to_unevictable_list(struct page *page) 607 void add_page_to_unevictable_list(struct page *page)
606 { 608 {
607 struct zone *zone = page_zone(page); 609 struct zone *zone = page_zone(page);
608 struct lruvec *lruvec; 610 struct lruvec *lruvec;
609 611
610 spin_lock_irq(&zone->lru_lock); 612 spin_lock_irq(&zone->lru_lock);
611 lruvec = mem_cgroup_page_lruvec(page, zone); 613 lruvec = mem_cgroup_page_lruvec(page, zone);
612 ClearPageActive(page); 614 ClearPageActive(page);
613 SetPageUnevictable(page); 615 SetPageUnevictable(page);
614 SetPageLRU(page); 616 SetPageLRU(page);
615 add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); 617 add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
616 spin_unlock_irq(&zone->lru_lock); 618 spin_unlock_irq(&zone->lru_lock);
617 } 619 }
618 620
619 /* 621 /*
620 * If the page can not be invalidated, it is moved to the 622 * If the page can not be invalidated, it is moved to the
621 * inactive list to speed up its reclaim. It is moved to the 623 * inactive list to speed up its reclaim. It is moved to the
622 * head of the list, rather than the tail, to give the flusher 624 * head of the list, rather than the tail, to give the flusher
623 * threads some time to write it out, as this is much more 625 * threads some time to write it out, as this is much more
624 * effective than the single-page writeout from reclaim. 626 * effective than the single-page writeout from reclaim.
625 * 627 *
626 * If the page isn't page_mapped and dirty/writeback, the page 628 * If the page isn't page_mapped and dirty/writeback, the page
627 * could reclaim asap using PG_reclaim. 629 * could reclaim asap using PG_reclaim.
628 * 630 *
629 * 1. active, mapped page -> none 631 * 1. active, mapped page -> none
630 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim 632 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
631 * 3. inactive, mapped page -> none 633 * 3. inactive, mapped page -> none
632 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim 634 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
633 * 5. inactive, clean -> inactive, tail 635 * 5. inactive, clean -> inactive, tail
634 * 6. Others -> none 636 * 6. Others -> none
635 * 637 *
636 * In 4, why it moves inactive's head, the VM expects the page would 638 * In 4, why it moves inactive's head, the VM expects the page would
637 * be write it out by flusher threads as this is much more effective 639 * be write it out by flusher threads as this is much more effective
638 * than the single-page writeout from reclaim. 640 * than the single-page writeout from reclaim.
639 */ 641 */
640 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, 642 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
641 void *arg) 643 void *arg)
642 { 644 {
643 int lru, file; 645 int lru, file;
644 bool active; 646 bool active;
645 647
646 if (!PageLRU(page)) 648 if (!PageLRU(page))
647 return; 649 return;
648 650
649 if (PageUnevictable(page)) 651 if (PageUnevictable(page))
650 return; 652 return;
651 653
652 /* Some processes are using the page */ 654 /* Some processes are using the page */
653 if (page_mapped(page)) 655 if (page_mapped(page))
654 return; 656 return;
655 657
656 active = PageActive(page); 658 active = PageActive(page);
657 file = page_is_file_cache(page); 659 file = page_is_file_cache(page);
658 lru = page_lru_base_type(page); 660 lru = page_lru_base_type(page);
659 661
660 del_page_from_lru_list(page, lruvec, lru + active); 662 del_page_from_lru_list(page, lruvec, lru + active);
661 ClearPageActive(page); 663 ClearPageActive(page);
662 ClearPageReferenced(page); 664 ClearPageReferenced(page);
663 add_page_to_lru_list(page, lruvec, lru); 665 add_page_to_lru_list(page, lruvec, lru);
664 666
665 if (PageWriteback(page) || PageDirty(page)) { 667 if (PageWriteback(page) || PageDirty(page)) {
666 /* 668 /*
667 * PG_reclaim could be raced with end_page_writeback 669 * PG_reclaim could be raced with end_page_writeback
668 * It can make readahead confusing. But race window 670 * It can make readahead confusing. But race window
669 * is _really_ small and it's non-critical problem. 671 * is _really_ small and it's non-critical problem.
670 */ 672 */
671 SetPageReclaim(page); 673 SetPageReclaim(page);
672 } else { 674 } else {
673 /* 675 /*
674 * The page's writeback ends up during pagevec 676 * The page's writeback ends up during pagevec
675 * We moves tha page into tail of inactive. 677 * We moves tha page into tail of inactive.
676 */ 678 */
677 list_move_tail(&page->lru, &lruvec->lists[lru]); 679 list_move_tail(&page->lru, &lruvec->lists[lru]);
678 __count_vm_event(PGROTATED); 680 __count_vm_event(PGROTATED);
679 } 681 }
680 682
681 if (active) 683 if (active)
682 __count_vm_event(PGDEACTIVATE); 684 __count_vm_event(PGDEACTIVATE);
683 update_page_reclaim_stat(lruvec, file, 0); 685 update_page_reclaim_stat(lruvec, file, 0);
684 } 686 }
685 687
686 /* 688 /*
687 * Drain pages out of the cpu's pagevecs. 689 * Drain pages out of the cpu's pagevecs.
688 * Either "cpu" is the current CPU, and preemption has already been 690 * Either "cpu" is the current CPU, and preemption has already been
689 * disabled; or "cpu" is being hot-unplugged, and is already dead. 691 * disabled; or "cpu" is being hot-unplugged, and is already dead.
690 */ 692 */
691 void lru_add_drain_cpu(int cpu) 693 void lru_add_drain_cpu(int cpu)
692 { 694 {
693 struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); 695 struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
694 696
695 if (pagevec_count(pvec)) 697 if (pagevec_count(pvec))
696 __pagevec_lru_add(pvec); 698 __pagevec_lru_add(pvec);
697 699
698 pvec = &per_cpu(lru_rotate_pvecs, cpu); 700 pvec = &per_cpu(lru_rotate_pvecs, cpu);
699 if (pagevec_count(pvec)) { 701 if (pagevec_count(pvec)) {
700 unsigned long flags; 702 unsigned long flags;
701 703
702 /* No harm done if a racing interrupt already did this */ 704 /* No harm done if a racing interrupt already did this */
703 local_irq_save(flags); 705 local_irq_save(flags);
704 pagevec_move_tail(pvec); 706 pagevec_move_tail(pvec);
705 local_irq_restore(flags); 707 local_irq_restore(flags);
706 } 708 }
707 709
708 pvec = &per_cpu(lru_deactivate_pvecs, cpu); 710 pvec = &per_cpu(lru_deactivate_pvecs, cpu);
709 if (pagevec_count(pvec)) 711 if (pagevec_count(pvec))
710 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 712 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
711 713
712 activate_page_drain(cpu); 714 activate_page_drain(cpu);
713 } 715 }
714 716
715 /** 717 /**
716 * deactivate_page - forcefully deactivate a page 718 * deactivate_page - forcefully deactivate a page
717 * @page: page to deactivate 719 * @page: page to deactivate
718 * 720 *
719 * This function hints the VM that @page is a good reclaim candidate, 721 * This function hints the VM that @page is a good reclaim candidate,
720 * for example if its invalidation fails due to the page being dirty 722 * for example if its invalidation fails due to the page being dirty
721 * or under writeback. 723 * or under writeback.
722 */ 724 */
723 void deactivate_page(struct page *page) 725 void deactivate_page(struct page *page)
724 { 726 {
725 /* 727 /*
726 * In a workload with many unevictable page such as mprotect, unevictable 728 * In a workload with many unevictable page such as mprotect, unevictable
727 * page deactivation for accelerating reclaim is pointless. 729 * page deactivation for accelerating reclaim is pointless.
728 */ 730 */
729 if (PageUnevictable(page)) 731 if (PageUnevictable(page))
730 return; 732 return;
731 733
732 if (likely(get_page_unless_zero(page))) { 734 if (likely(get_page_unless_zero(page))) {
733 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); 735 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
734 736
735 if (!pagevec_add(pvec, page)) 737 if (!pagevec_add(pvec, page))
736 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 738 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
737 put_cpu_var(lru_deactivate_pvecs); 739 put_cpu_var(lru_deactivate_pvecs);
738 } 740 }
739 } 741 }
740 742
741 void lru_add_drain(void) 743 void lru_add_drain(void)
742 { 744 {
743 lru_add_drain_cpu(get_cpu()); 745 lru_add_drain_cpu(get_cpu());
744 put_cpu(); 746 put_cpu();
745 } 747 }
746 748
747 static void lru_add_drain_per_cpu(struct work_struct *dummy) 749 static void lru_add_drain_per_cpu(struct work_struct *dummy)
748 { 750 {
749 lru_add_drain(); 751 lru_add_drain();
750 } 752 }
751 753
752 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); 754 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
753 755
754 void lru_add_drain_all(void) 756 void lru_add_drain_all(void)
755 { 757 {
756 static DEFINE_MUTEX(lock); 758 static DEFINE_MUTEX(lock);
757 static struct cpumask has_work; 759 static struct cpumask has_work;
758 int cpu; 760 int cpu;
759 761
760 mutex_lock(&lock); 762 mutex_lock(&lock);
761 get_online_cpus(); 763 get_online_cpus();
762 cpumask_clear(&has_work); 764 cpumask_clear(&has_work);
763 765
764 for_each_online_cpu(cpu) { 766 for_each_online_cpu(cpu) {
765 struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); 767 struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
766 768
767 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || 769 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
768 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || 770 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
769 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || 771 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
770 need_activate_page_drain(cpu)) { 772 need_activate_page_drain(cpu)) {
771 INIT_WORK(work, lru_add_drain_per_cpu); 773 INIT_WORK(work, lru_add_drain_per_cpu);
772 schedule_work_on(cpu, work); 774 schedule_work_on(cpu, work);
773 cpumask_set_cpu(cpu, &has_work); 775 cpumask_set_cpu(cpu, &has_work);
774 } 776 }
775 } 777 }
776 778
777 for_each_cpu(cpu, &has_work) 779 for_each_cpu(cpu, &has_work)
778 flush_work(&per_cpu(lru_add_drain_work, cpu)); 780 flush_work(&per_cpu(lru_add_drain_work, cpu));
779 781
780 put_online_cpus(); 782 put_online_cpus();
781 mutex_unlock(&lock); 783 mutex_unlock(&lock);
782 } 784 }
783 785
784 /* 786 /*
785 * Batched page_cache_release(). Decrement the reference count on all the 787 * Batched page_cache_release(). Decrement the reference count on all the
786 * passed pages. If it fell to zero then remove the page from the LRU and 788 * passed pages. If it fell to zero then remove the page from the LRU and
787 * free it. 789 * free it.
788 * 790 *
789 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it 791 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
790 * for the remainder of the operation. 792 * for the remainder of the operation.
791 * 793 *
792 * The locking in this function is against shrink_inactive_list(): we recheck 794 * The locking in this function is against shrink_inactive_list(): we recheck
793 * the page count inside the lock to see whether shrink_inactive_list() 795 * the page count inside the lock to see whether shrink_inactive_list()
794 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() 796 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
795 * will free it. 797 * will free it.
796 */ 798 */
797 void release_pages(struct page **pages, int nr, bool cold) 799 void release_pages(struct page **pages, int nr, bool cold)
798 { 800 {
799 int i; 801 int i;
800 LIST_HEAD(pages_to_free); 802 LIST_HEAD(pages_to_free);
801 struct zone *zone = NULL; 803 struct zone *zone = NULL;
802 struct lruvec *lruvec; 804 struct lruvec *lruvec;
803 unsigned long uninitialized_var(flags); 805 unsigned long uninitialized_var(flags);
804 806
805 for (i = 0; i < nr; i++) { 807 for (i = 0; i < nr; i++) {
806 struct page *page = pages[i]; 808 struct page *page = pages[i];
807 809
808 if (unlikely(PageCompound(page))) { 810 if (unlikely(PageCompound(page))) {
809 if (zone) { 811 if (zone) {
810 spin_unlock_irqrestore(&zone->lru_lock, flags); 812 spin_unlock_irqrestore(&zone->lru_lock, flags);
811 zone = NULL; 813 zone = NULL;
812 } 814 }
813 put_compound_page(page); 815 put_compound_page(page);
814 continue; 816 continue;
815 } 817 }
816 818
817 if (!put_page_testzero(page)) 819 if (!put_page_testzero(page))
818 continue; 820 continue;
819 821
820 if (PageLRU(page)) { 822 if (PageLRU(page)) {
821 struct zone *pagezone = page_zone(page); 823 struct zone *pagezone = page_zone(page);
822 824
823 if (pagezone != zone) { 825 if (pagezone != zone) {
824 if (zone) 826 if (zone)
825 spin_unlock_irqrestore(&zone->lru_lock, 827 spin_unlock_irqrestore(&zone->lru_lock,
826 flags); 828 flags);
827 zone = pagezone; 829 zone = pagezone;
828 spin_lock_irqsave(&zone->lru_lock, flags); 830 spin_lock_irqsave(&zone->lru_lock, flags);
829 } 831 }
830 832
831 lruvec = mem_cgroup_page_lruvec(page, zone); 833 lruvec = mem_cgroup_page_lruvec(page, zone);
832 VM_BUG_ON(!PageLRU(page)); 834 VM_BUG_ON(!PageLRU(page));
833 __ClearPageLRU(page); 835 __ClearPageLRU(page);
834 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 836 del_page_from_lru_list(page, lruvec, page_off_lru(page));
835 } 837 }
836 838
837 /* Clear Active bit in case of parallel mark_page_accessed */ 839 /* Clear Active bit in case of parallel mark_page_accessed */
838 __ClearPageActive(page); 840 __ClearPageActive(page);
839 841
840 list_add(&page->lru, &pages_to_free); 842 list_add(&page->lru, &pages_to_free);
841 } 843 }
842 if (zone) 844 if (zone)
843 spin_unlock_irqrestore(&zone->lru_lock, flags); 845 spin_unlock_irqrestore(&zone->lru_lock, flags);
844 846
845 free_hot_cold_page_list(&pages_to_free, cold); 847 free_hot_cold_page_list(&pages_to_free, cold);
846 } 848 }
847 EXPORT_SYMBOL(release_pages); 849 EXPORT_SYMBOL(release_pages);
848 850
849 /* 851 /*
850 * The pages which we're about to release may be in the deferred lru-addition 852 * The pages which we're about to release may be in the deferred lru-addition
851 * queues. That would prevent them from really being freed right now. That's 853 * queues. That would prevent them from really being freed right now. That's
852 * OK from a correctness point of view but is inefficient - those pages may be 854 * OK from a correctness point of view but is inefficient - those pages may be
853 * cache-warm and we want to give them back to the page allocator ASAP. 855 * cache-warm and we want to give them back to the page allocator ASAP.
854 * 856 *
855 * So __pagevec_release() will drain those queues here. __pagevec_lru_add() 857 * So __pagevec_release() will drain those queues here. __pagevec_lru_add()
856 * and __pagevec_lru_add_active() call release_pages() directly to avoid 858 * and __pagevec_lru_add_active() call release_pages() directly to avoid
857 * mutual recursion. 859 * mutual recursion.
858 */ 860 */
859 void __pagevec_release(struct pagevec *pvec) 861 void __pagevec_release(struct pagevec *pvec)
860 { 862 {
861 lru_add_drain(); 863 lru_add_drain();
862 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); 864 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
863 pagevec_reinit(pvec); 865 pagevec_reinit(pvec);
864 } 866 }
865 EXPORT_SYMBOL(__pagevec_release); 867 EXPORT_SYMBOL(__pagevec_release);
866 868
867 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 869 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
868 /* used by __split_huge_page_refcount() */ 870 /* used by __split_huge_page_refcount() */
869 void lru_add_page_tail(struct page *page, struct page *page_tail, 871 void lru_add_page_tail(struct page *page, struct page *page_tail,
870 struct lruvec *lruvec, struct list_head *list) 872 struct lruvec *lruvec, struct list_head *list)
871 { 873 {
872 const int file = 0; 874 const int file = 0;
873 875
874 VM_BUG_ON(!PageHead(page)); 876 VM_BUG_ON(!PageHead(page));
875 VM_BUG_ON(PageCompound(page_tail)); 877 VM_BUG_ON(PageCompound(page_tail));
876 VM_BUG_ON(PageLRU(page_tail)); 878 VM_BUG_ON(PageLRU(page_tail));
877 VM_BUG_ON(NR_CPUS != 1 && 879 VM_BUG_ON(NR_CPUS != 1 &&
878 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); 880 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
879 881
880 if (!list) 882 if (!list)
881 SetPageLRU(page_tail); 883 SetPageLRU(page_tail);
882 884
883 if (likely(PageLRU(page))) 885 if (likely(PageLRU(page)))
884 list_add_tail(&page_tail->lru, &page->lru); 886 list_add_tail(&page_tail->lru, &page->lru);
885 else if (list) { 887 else if (list) {
886 /* page reclaim is reclaiming a huge page */ 888 /* page reclaim is reclaiming a huge page */
887 get_page(page_tail); 889 get_page(page_tail);
888 list_add_tail(&page_tail->lru, list); 890 list_add_tail(&page_tail->lru, list);
889 } else { 891 } else {
890 struct list_head *list_head; 892 struct list_head *list_head;
891 /* 893 /*
892 * Head page has not yet been counted, as an hpage, 894 * Head page has not yet been counted, as an hpage,
893 * so we must account for each subpage individually. 895 * so we must account for each subpage individually.
894 * 896 *
895 * Use the standard add function to put page_tail on the list, 897 * Use the standard add function to put page_tail on the list,
896 * but then correct its position so they all end up in order. 898 * but then correct its position so they all end up in order.
897 */ 899 */
898 add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); 900 add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
899 list_head = page_tail->lru.prev; 901 list_head = page_tail->lru.prev;
900 list_move_tail(&page_tail->lru, list_head); 902 list_move_tail(&page_tail->lru, list_head);
901 } 903 }
902 904
903 if (!PageUnevictable(page)) 905 if (!PageUnevictable(page))
904 update_page_reclaim_stat(lruvec, file, PageActive(page_tail)); 906 update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
905 } 907 }
906 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 908 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
907 909
908 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, 910 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
909 void *arg) 911 void *arg)
910 { 912 {
911 int file = page_is_file_cache(page); 913 int file = page_is_file_cache(page);
912 int active = PageActive(page); 914 int active = PageActive(page);
913 enum lru_list lru = page_lru(page); 915 enum lru_list lru = page_lru(page);
914 916
915 VM_BUG_ON(PageLRU(page)); 917 VM_BUG_ON(PageLRU(page));
916 918
917 SetPageLRU(page); 919 SetPageLRU(page);
918 add_page_to_lru_list(page, lruvec, lru); 920 add_page_to_lru_list(page, lruvec, lru);
919 update_page_reclaim_stat(lruvec, file, active); 921 update_page_reclaim_stat(lruvec, file, active);
920 trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); 922 trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
921 } 923 }
922 924
923 /* 925 /*
924 * Add the passed pages to the LRU, then drop the caller's refcount 926 * Add the passed pages to the LRU, then drop the caller's refcount
925 * on them. Reinitialises the caller's pagevec. 927 * on them. Reinitialises the caller's pagevec.
926 */ 928 */
927 void __pagevec_lru_add(struct pagevec *pvec) 929 void __pagevec_lru_add(struct pagevec *pvec)
928 { 930 {
929 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); 931 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
930 } 932 }
931 EXPORT_SYMBOL(__pagevec_lru_add); 933 EXPORT_SYMBOL(__pagevec_lru_add);
932 934
933 /** 935 /**
934 * pagevec_lookup_entries - gang pagecache lookup 936 * pagevec_lookup_entries - gang pagecache lookup
935 * @pvec: Where the resulting entries are placed 937 * @pvec: Where the resulting entries are placed
936 * @mapping: The address_space to search 938 * @mapping: The address_space to search
937 * @start: The starting entry index 939 * @start: The starting entry index
938 * @nr_pages: The maximum number of entries 940 * @nr_pages: The maximum number of entries
939 * @indices: The cache indices corresponding to the entries in @pvec 941 * @indices: The cache indices corresponding to the entries in @pvec
940 * 942 *
941 * pagevec_lookup_entries() will search for and return a group of up 943 * pagevec_lookup_entries() will search for and return a group of up
942 * to @nr_entries pages and shadow entries in the mapping. All 944 * to @nr_entries pages and shadow entries in the mapping. All
943 * entries are placed in @pvec. pagevec_lookup_entries() takes a 945 * entries are placed in @pvec. pagevec_lookup_entries() takes a
944 * reference against actual pages in @pvec. 946 * reference against actual pages in @pvec.
945 * 947 *
946 * The search returns a group of mapping-contiguous entries with 948 * The search returns a group of mapping-contiguous entries with
947 * ascending indexes. There may be holes in the indices due to 949 * ascending indexes. There may be holes in the indices due to
948 * not-present entries. 950 * not-present entries.
949 * 951 *
950 * pagevec_lookup_entries() returns the number of entries which were 952 * pagevec_lookup_entries() returns the number of entries which were
951 * found. 953 * found.
952 */ 954 */
953 unsigned pagevec_lookup_entries(struct pagevec *pvec, 955 unsigned pagevec_lookup_entries(struct pagevec *pvec,
954 struct address_space *mapping, 956 struct address_space *mapping,
955 pgoff_t start, unsigned nr_pages, 957 pgoff_t start, unsigned nr_pages,
956 pgoff_t *indices) 958 pgoff_t *indices)
957 { 959 {
958 pvec->nr = find_get_entries(mapping, start, nr_pages, 960 pvec->nr = find_get_entries(mapping, start, nr_pages,
959 pvec->pages, indices); 961 pvec->pages, indices);
960 return pagevec_count(pvec); 962 return pagevec_count(pvec);
961 } 963 }
962 964
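A hedged sketch of the consumer pattern this interface is built for, modelled loosely on the truncate/invalidate loops in mm/truncate.c (example_scan_mapping() itself is hypothetical). The @indices array matters because exceptional (shadow) entries carry no struct page and therefore no page->index:

	static void example_scan_mapping(struct address_space *mapping)
	{
		pgoff_t indices[PAGEVEC_SIZE];
		struct pagevec pvec;
		pgoff_t index = 0;
		int i;

		pagevec_init(&pvec, 0);
		while (pagevec_lookup_entries(&pvec, mapping, index,
					      PAGEVEC_SIZE, indices)) {
			for (i = 0; i < pagevec_count(&pvec); i++) {
				struct page *page = pvec.pages[i];

				index = indices[i];
				if (radix_tree_exceptional_entry(page)) {
					/* shadow entry: nothing to lock or put */
					continue;
				}
				/* ... operate on the real page here ... */
			}
			/* drop exceptionals first, then the page references */
			pagevec_remove_exceptionals(&pvec);
			pagevec_release(&pvec);
			index++;
		}
	}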
963 /** 965 /**
964 * pagevec_remove_exceptionals - pagevec exceptionals pruning 966 * pagevec_remove_exceptionals - pagevec exceptionals pruning
965 * @pvec: The pagevec to prune 967 * @pvec: The pagevec to prune
966 * 968 *
967 * pagevec_lookup_entries() fills both pages and exceptional radix 969 * pagevec_lookup_entries() fills both pages and exceptional radix
968 * tree entries into the pagevec. This function prunes all 970 * tree entries into the pagevec. This function prunes all
969 * exceptionals from @pvec without leaving holes, so that it can be 971 * exceptionals from @pvec without leaving holes, so that it can be
970 * passed on to page-only pagevec operations. 972 * passed on to page-only pagevec operations.
971 */ 973 */
972 void pagevec_remove_exceptionals(struct pagevec *pvec) 974 void pagevec_remove_exceptionals(struct pagevec *pvec)
973 { 975 {
974 int i, j; 976 int i, j;
975 977
976 for (i = 0, j = 0; i < pagevec_count(pvec); i++) { 978 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
977 struct page *page = pvec->pages[i]; 979 struct page *page = pvec->pages[i];
978 if (!radix_tree_exceptional_entry(page)) 980 if (!radix_tree_exceptional_entry(page))
979 pvec->pages[j++] = page; 981 pvec->pages[j++] = page;
980 } 982 }
981 pvec->nr = j; 983 pvec->nr = j;
982 } 984 }
983 985
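The pruning itself is the classic stable in-place filter: one read index, one write index, survivors compacted to the front, and the element count trimmed afterwards. A tiny standalone illustration of the same idiom (plain userspace C; is_exceptional() is a made-up stand-in for radix_tree_exceptional_entry()):

	#include <stdio.h>

	static int is_exceptional(int v)
	{
		return v < 0;	/* stand-in for radix_tree_exceptional_entry() */
	}

	int main(void)
	{
		int vals[] = { 1, -1, 2, -2, 3 };
		int n = 5, i, j;

		for (i = 0, j = 0; i < n; i++)
			if (!is_exceptional(vals[i]))
				vals[j++] = vals[i];
		n = j;			/* like pvec->nr = j */

		for (i = 0; i < n; i++)
			printf("%d ", vals[i]);	/* prints: 1 2 3 */
		printf("\n");
		return 0;
	}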
984 /** 986 /**
985 * pagevec_lookup - gang pagecache lookup 987 * pagevec_lookup - gang pagecache lookup
986 * @pvec: Where the resulting pages are placed 988 * @pvec: Where the resulting pages are placed
987 * @mapping: The address_space to search 989 * @mapping: The address_space to search
988 * @start: The starting page index 990 * @start: The starting page index
989 * @nr_pages: The maximum number of pages 991 * @nr_pages: The maximum number of pages
990 * 992 *
991 * pagevec_lookup() will search for and return a group of up to @nr_pages pages 993 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
992 * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a 994 * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a
993 * reference against the pages in @pvec. 995 * reference against the pages in @pvec.
994 * 996 *
995 * The search returns a group of mapping-contiguous pages with ascending 997 * The search returns a group of mapping-contiguous pages with ascending
996 * indexes. There may be holes in the indices due to not-present pages. 998 * indexes. There may be holes in the indices due to not-present pages.
997 * 999 *
998 * pagevec_lookup() returns the number of pages which were found. 1000 * pagevec_lookup() returns the number of pages which were found.
999 */ 1001 */
1000 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, 1002 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
1001 pgoff_t start, unsigned nr_pages) 1003 pgoff_t start, unsigned nr_pages)
1002 { 1004 {
1003 pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); 1005 pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
1004 return pagevec_count(pvec); 1006 return pagevec_count(pvec);
1005 } 1007 }
1006 EXPORT_SYMBOL(pagevec_lookup); 1008 EXPORT_SYMBOL(pagevec_lookup);
1007 1009
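A hedged sketch of the usual calling pattern (example_walk_mapping() is hypothetical and assumes the usual mm headers): look up a batch, process it, release the references pagevec_lookup() took, and continue from just past the last page seen.

	static void example_walk_mapping(struct address_space *mapping)
	{
		struct pagevec pvec;
		pgoff_t index = 0;
		int i;

		pagevec_init(&pvec, 0);
		while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
			for (i = 0; i < pagevec_count(&pvec); i++) {
				struct page *page = pvec.pages[i];

				/* remember where to resume; indices may have holes */
				index = page->index + 1;
				/* ... inspect or operate on the page here ... */
			}
			/* drops the references pagevec_lookup() took */
			pagevec_release(&pvec);
			cond_resched();
		}
	}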
1008 unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 1010 unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
1009 pgoff_t *index, int tag, unsigned nr_pages) 1011 pgoff_t *index, int tag, unsigned nr_pages)
1010 { 1012 {
1011 pvec->nr = find_get_pages_tag(mapping, index, tag, 1013 pvec->nr = find_get_pages_tag(mapping, index, tag,
1012 nr_pages, pvec->pages); 1014 nr_pages, pvec->pages);
1013 return pagevec_count(pvec); 1015 return pagevec_count(pvec);
1014 } 1016 }
1015 EXPORT_SYMBOL(pagevec_lookup_tag); 1017 EXPORT_SYMBOL(pagevec_lookup_tag);
1016 1018
1017 /* 1019 /*
1018 * Perform any setup for the swap system 1020 * Perform any setup for the swap system
1019 */ 1021 */
1020 void __init swap_setup(void) 1022 void __init swap_setup(void)
1021 { 1023 {
1022 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); 1024 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
1023 #ifdef CONFIG_SWAP 1025 #ifdef CONFIG_SWAP
1024 int i; 1026 int i;
1025 1027
1026 bdi_init(swapper_spaces[0].backing_dev_info); 1028 bdi_init(swapper_spaces[0].backing_dev_info);
1027 for (i = 0; i < MAX_SWAPFILES; i++) { 1029 for (i = 0; i < MAX_SWAPFILES; i++) {
1028 spin_lock_init(&swapper_spaces[i].tree_lock); 1030 spin_lock_init(&swapper_spaces[i].tree_lock);
1029 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); 1031 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
1030 } 1032 }
1031 #endif 1033 #endif
1032 1034
1033 /* Use a smaller cluster for small-memory machines */ 1035 /* Use a smaller cluster for small-memory machines */
1034 if (megs < 16) 1036 if (megs < 16)
1035 page_cluster = 2; 1037 page_cluster = 2;
1036 else 1038 else
1037 page_cluster = 3; 1039 page_cluster = 3;
1038 /* 1040 /*
1039 * Right now, other parts of the system mean that we 1041 * Right now, other parts of the system mean that we
1040 * _really_ don't want to cluster much more 1042 * _really_ don't want to cluster much more
1041 */ 1043 */
1042 } 1044 }
1043 1045
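As a worked example of the megabyte arithmetic above (assuming the common PAGE_SHIFT of 12, i.e. 4 KiB pages, purely for illustration): shifting the page count right by 20 - PAGE_SHIFT = 8 divides by the 256 pages that make up one MiB, so a machine with 3000 pages (about 11.7 MiB of RAM) gets megs = 11 and therefore page_cluster = 2.

	#include <stdio.h>

	int main(void)
	{
		unsigned long totalram_pages = 3000;	/* hypothetical page count */
		unsigned int page_shift = 12;		/* 4 KiB pages (assumption) */
		unsigned long megs = totalram_pages >> (20 - page_shift);

		/* 3000 >> 8 == 11, i.e. under 16 MB, so page_cluster would be 2 */
		printf("megs = %lu\n", megs);
		return 0;
	}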