Commit 04e62a29bf157ce1edd168f2b71b533c80d13628

Authored by Christoph Lameter
Committed by Linus Torvalds
1 parent 442c9137de

[PATCH] More page migration: use migration entries for file pages

This implements the use of migration entries to preserve the ptes of file-backed
pages during migration.  Processes can therefore be migrated back and forth
without losing their connection to pagecache pages.

Note that we implement the migration entries only for linear mappings.
Nonlinear mappings still require the unmapping of the ptes for migration.
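
For readers new to the mechanism: a migration entry is a swap-style pte that
records which page is under migration and whether the original mapping was
writable, using the two swap types reserved below in include/linux/swap.h
(SWP_MIGRATION_READ and SWP_MIGRATION_WRITE).  The userspace sketch that
follows only models that encoding; the real helpers live in
include/linux/swapops.h, and the 5/27 type/offset split is taken from the
swap.h comment as an illustrative assumption, not the layout of any particular
architecture.

#include <assert.h>
#include <stdio.h>

/* Simplified model of the swp_entry_t encoding used for migration entries.
 * The real helpers live in include/linux/swapops.h; the 5/27 split below
 * mirrors the comment in include/linux/swap.h and is illustrative only. */
#define MAX_SWAPFILES_SHIFT	5
#define MAX_SWAPFILES		((1UL << MAX_SWAPFILES_SHIFT) - 2)
#define SWP_MIGRATION_READ	MAX_SWAPFILES
#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + 1)

typedef struct { unsigned long val; } swp_entry_t;

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
	swp_entry_t e = { .val = (type << 27) | offset };
	return e;
}

static unsigned long swp_type(swp_entry_t e)   { return e.val >> 27; }
static unsigned long swp_offset(swp_entry_t e) { return e.val & ((1UL << 27) - 1); }

/* A migration entry records the pfn of the page under migration plus
 * whether the original pte was writable. */
static swp_entry_t make_migration_entry(unsigned long pfn, int write)
{
	return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ, pfn);
}

static int is_migration_entry(swp_entry_t e)
{
	return swp_type(e) == SWP_MIGRATION_READ ||
	       swp_type(e) == SWP_MIGRATION_WRITE;
}

static int is_write_migration_entry(swp_entry_t e)
{
	return swp_type(e) == SWP_MIGRATION_WRITE;
}

int main(void)
{
	swp_entry_t e = make_migration_entry(0x1234, 1);

	assert(is_migration_entry(e));
	assert(is_write_migration_entry(e));
	assert(swp_offset(e) == 0x1234);
	printf("migration entry val=%#lx (type=%lu, pfn=%#lx)\n",
	       e.val, swp_type(e), swp_offset(e));
	return 0;
}

In the real kernel the offset holds the page's pfn, so migration_entry_to_page()
can recover the struct page, and do_swap_page() calls migration_entry_wait()
(visible in the mm/migrate.c hunk below) when a fault hits such an entry.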

Another writepage() ugliness shows up: writepage() can drop the page lock.
Therefore we have to remove the migration ptes before calling writepage()
in order to avoid having migration entries point to unlocked pages.
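
To make the ordering concrete, here is a toy userspace model (not kernel code)
of the constraint enforced by the new writeout() helper in the mm/migrate.c
hunk below: because ->writepage() may unlock the page, the migration ptes are
torn down first, while the page is still locked, and the migration attempt is
then retried.  Every identifier prefixed with toy_ is a hypothetical stand-in,
not a kernel API.

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Invariant being modelled: a migration entry must never be left pointing at
 * an unlocked page, because migration_entry_wait() sleeps on the page lock. */
struct toy_page {
	bool locked;
	bool dirty;
	int  migration_ptes;	/* migration entries still pointing at this page */
};

static void toy_remove_migration_ptes(struct toy_page *p)
{
	p->migration_ptes = 0;
}

/* Like ->writepage(): writes the page out and is allowed to drop the lock. */
static int toy_writepage(struct toy_page *p)
{
	p->dirty = false;
	p->locked = false;
	return 0;
}

/* Condensed model of writeout(): the ptes are removed *before* writepage,
 * while the page is still locked, so the invariant holds throughout. */
static int toy_writeout(struct toy_page *p)
{
	toy_remove_migration_ptes(p);
	toy_writepage(p);
	if (!p->locked)
		p->locked = true;	/* relock, as writeout() does */
	return -EAGAIN;			/* retry the migration later */
}

int main(void)
{
	struct toy_page p = { .locked = true, .dirty = true, .migration_ptes = 1 };
	int rc = toy_writeout(&p);

	assert(p.migration_ptes == 0 && !p.dirty);
	printf("writeout model returned %d; no migration pte left behind\n", rc);
	return 0;
}

The cost of this ordering is that a dirty page always needs another migration
pass after writeback, which is why writeout() in the patch returns -EAGAIN
unconditionally.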

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 124 additions and 43 deletions

include/linux/swap.h
1 #ifndef _LINUX_SWAP_H 1 #ifndef _LINUX_SWAP_H
2 #define _LINUX_SWAP_H 2 #define _LINUX_SWAP_H
3 3
4 #include <linux/spinlock.h> 4 #include <linux/spinlock.h>
5 #include <linux/linkage.h> 5 #include <linux/linkage.h>
6 #include <linux/mmzone.h> 6 #include <linux/mmzone.h>
7 #include <linux/list.h> 7 #include <linux/list.h>
8 #include <linux/sched.h> 8 #include <linux/sched.h>
9 9
10 #include <asm/atomic.h> 10 #include <asm/atomic.h>
11 #include <asm/page.h> 11 #include <asm/page.h>
12 12
13 #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ 13 #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */
14 #define SWAP_FLAG_PRIO_MASK 0x7fff 14 #define SWAP_FLAG_PRIO_MASK 0x7fff
15 #define SWAP_FLAG_PRIO_SHIFT 0 15 #define SWAP_FLAG_PRIO_SHIFT 0
16 16
17 static inline int current_is_kswapd(void) 17 static inline int current_is_kswapd(void)
18 { 18 {
19 return current->flags & PF_KSWAPD; 19 return current->flags & PF_KSWAPD;
20 } 20 }
21 21
22 /* 22 /*
23 * MAX_SWAPFILES defines the maximum number of swaptypes: things which can 23 * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
24 * be swapped to. The swap type and the offset into that swap type are 24 * be swapped to. The swap type and the offset into that swap type are
25 * encoded into pte's and into pgoff_t's in the swapcache. Using five bits 25 * encoded into pte's and into pgoff_t's in the swapcache. Using five bits
26 * for the type means that the maximum number of swapcache pages is 27 bits 26 * for the type means that the maximum number of swapcache pages is 27 bits
27 * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs 27 * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs
28 * the type/offset into the pte as 5/27 as well. 28 * the type/offset into the pte as 5/27 as well.
29 */ 29 */
30 #define MAX_SWAPFILES_SHIFT 5 30 #define MAX_SWAPFILES_SHIFT 5
31 #ifndef CONFIG_MIGRATION 31 #ifndef CONFIG_MIGRATION
32 #define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) 32 #define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT)
33 #else 33 #else
34 /* Use last two entries for page migration swap entries */ 34 /* Use last two entries for page migration swap entries */
35 #define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2) 35 #define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2)
36 #define SWP_MIGRATION_READ MAX_SWAPFILES 36 #define SWP_MIGRATION_READ MAX_SWAPFILES
37 #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1) 37 #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1)
38 #endif 38 #endif
39 39
40 /* 40 /*
41 * Magic header for a swap area. The first part of the union is 41 * Magic header for a swap area. The first part of the union is
42 * what the swap magic looks like for the old (limited to 128MB) 42 * what the swap magic looks like for the old (limited to 128MB)
43 * swap area format, the second part of the union adds - in the 43 * swap area format, the second part of the union adds - in the
44 * old reserved area - some extra information. Note that the first 44 * old reserved area - some extra information. Note that the first
45 * kilobyte is reserved for boot loader or disk label stuff... 45 * kilobyte is reserved for boot loader or disk label stuff...
46 * 46 *
47 * Having the magic at the end of the PAGE_SIZE makes detecting swap 47 * Having the magic at the end of the PAGE_SIZE makes detecting swap
48 * areas somewhat tricky on machines that support multiple page sizes. 48 * areas somewhat tricky on machines that support multiple page sizes.
49 * For 2.5 we'll probably want to move the magic to just beyond the 49 * For 2.5 we'll probably want to move the magic to just beyond the
50 * bootbits... 50 * bootbits...
51 */ 51 */
52 union swap_header { 52 union swap_header {
53 struct { 53 struct {
54 char reserved[PAGE_SIZE - 10]; 54 char reserved[PAGE_SIZE - 10];
55 char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ 55 char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */
56 } magic; 56 } magic;
57 struct { 57 struct {
58 char bootbits[1024]; /* Space for disklabel etc. */ 58 char bootbits[1024]; /* Space for disklabel etc. */
59 __u32 version; 59 __u32 version;
60 __u32 last_page; 60 __u32 last_page;
61 __u32 nr_badpages; 61 __u32 nr_badpages;
62 unsigned char sws_uuid[16]; 62 unsigned char sws_uuid[16];
63 unsigned char sws_volume[16]; 63 unsigned char sws_volume[16];
64 __u32 padding[117]; 64 __u32 padding[117];
65 __u32 badpages[1]; 65 __u32 badpages[1];
66 } info; 66 } info;
67 }; 67 };
68 68
69 /* A swap entry has to fit into a "unsigned long", as 69 /* A swap entry has to fit into a "unsigned long", as
70 * the entry is hidden in the "index" field of the 70 * the entry is hidden in the "index" field of the
71 * swapper address space. 71 * swapper address space.
72 */ 72 */
73 typedef struct { 73 typedef struct {
74 unsigned long val; 74 unsigned long val;
75 } swp_entry_t; 75 } swp_entry_t;
76 76
77 /* 77 /*
78 * current->reclaim_state points to one of these when a task is running 78 * current->reclaim_state points to one of these when a task is running
79 * memory reclaim 79 * memory reclaim
80 */ 80 */
81 struct reclaim_state { 81 struct reclaim_state {
82 unsigned long reclaimed_slab; 82 unsigned long reclaimed_slab;
83 }; 83 };
84 84
85 #ifdef __KERNEL__ 85 #ifdef __KERNEL__
86 86
87 struct address_space; 87 struct address_space;
88 struct sysinfo; 88 struct sysinfo;
89 struct writeback_control; 89 struct writeback_control;
90 struct zone; 90 struct zone;
91 91
92 /* 92 /*
93 * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of 93 * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
94 * disk blocks. A list of swap extents maps the entire swapfile. (Where the 94 * disk blocks. A list of swap extents maps the entire swapfile. (Where the
95 * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart 95 * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart
96 * from setup, they're handled identically. 96 * from setup, they're handled identically.
97 * 97 *
98 * We always assume that blocks are of size PAGE_SIZE. 98 * We always assume that blocks are of size PAGE_SIZE.
99 */ 99 */
100 struct swap_extent { 100 struct swap_extent {
101 struct list_head list; 101 struct list_head list;
102 pgoff_t start_page; 102 pgoff_t start_page;
103 pgoff_t nr_pages; 103 pgoff_t nr_pages;
104 sector_t start_block; 104 sector_t start_block;
105 }; 105 };
106 106
107 /* 107 /*
108 * Max bad pages in the new format.. 108 * Max bad pages in the new format..
109 */ 109 */
110 #define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x) 110 #define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x)
111 #define MAX_SWAP_BADPAGES \ 111 #define MAX_SWAP_BADPAGES \
112 ((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int)) 112 ((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int))
113 113
114 enum { 114 enum {
115 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ 115 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
116 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ 116 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
117 SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), 117 SWP_ACTIVE = (SWP_USED | SWP_WRITEOK),
118 /* add others here before... */ 118 /* add others here before... */
119 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ 119 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
120 }; 120 };
121 121
122 #define SWAP_CLUSTER_MAX 32 122 #define SWAP_CLUSTER_MAX 32
123 123
124 #define SWAP_MAP_MAX 0x7fff 124 #define SWAP_MAP_MAX 0x7fff
125 #define SWAP_MAP_BAD 0x8000 125 #define SWAP_MAP_BAD 0x8000
126 126
127 /* 127 /*
128 * The in-memory structure used to track swap areas. 128 * The in-memory structure used to track swap areas.
129 */ 129 */
130 struct swap_info_struct { 130 struct swap_info_struct {
131 unsigned int flags; 131 unsigned int flags;
132 int prio; /* swap priority */ 132 int prio; /* swap priority */
133 struct file *swap_file; 133 struct file *swap_file;
134 struct block_device *bdev; 134 struct block_device *bdev;
135 struct list_head extent_list; 135 struct list_head extent_list;
136 struct swap_extent *curr_swap_extent; 136 struct swap_extent *curr_swap_extent;
137 unsigned old_block_size; 137 unsigned old_block_size;
138 unsigned short * swap_map; 138 unsigned short * swap_map;
139 unsigned int lowest_bit; 139 unsigned int lowest_bit;
140 unsigned int highest_bit; 140 unsigned int highest_bit;
141 unsigned int cluster_next; 141 unsigned int cluster_next;
142 unsigned int cluster_nr; 142 unsigned int cluster_nr;
143 unsigned int pages; 143 unsigned int pages;
144 unsigned int max; 144 unsigned int max;
145 unsigned int inuse_pages; 145 unsigned int inuse_pages;
146 int next; /* next entry on swap list */ 146 int next; /* next entry on swap list */
147 }; 147 };
148 148
149 struct swap_list_t { 149 struct swap_list_t {
150 int head; /* head of priority-ordered swapfile list */ 150 int head; /* head of priority-ordered swapfile list */
151 int next; /* swapfile to be used next */ 151 int next; /* swapfile to be used next */
152 }; 152 };
153 153
154 /* Swap 50% full? Release swapcache more aggressively.. */ 154 /* Swap 50% full? Release swapcache more aggressively.. */
155 #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) 155 #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
156 156
157 /* linux/mm/oom_kill.c */ 157 /* linux/mm/oom_kill.c */
158 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); 158 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
159 159
160 /* linux/mm/memory.c */ 160 /* linux/mm/memory.c */
161 extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); 161 extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
162 162
163 /* linux/mm/page_alloc.c */ 163 /* linux/mm/page_alloc.c */
164 extern unsigned long totalram_pages; 164 extern unsigned long totalram_pages;
165 extern unsigned long totalhigh_pages; 165 extern unsigned long totalhigh_pages;
166 extern unsigned long totalreserve_pages; 166 extern unsigned long totalreserve_pages;
167 extern long nr_swap_pages; 167 extern long nr_swap_pages;
168 extern unsigned int nr_free_pages(void); 168 extern unsigned int nr_free_pages(void);
169 extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat); 169 extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat);
170 extern unsigned int nr_free_buffer_pages(void); 170 extern unsigned int nr_free_buffer_pages(void);
171 extern unsigned int nr_free_pagecache_pages(void); 171 extern unsigned int nr_free_pagecache_pages(void);
172 172
173 /* linux/mm/swap.c */ 173 /* linux/mm/swap.c */
174 extern void FASTCALL(lru_cache_add(struct page *)); 174 extern void FASTCALL(lru_cache_add(struct page *));
175 extern void FASTCALL(lru_cache_add_active(struct page *)); 175 extern void FASTCALL(lru_cache_add_active(struct page *));
176 extern void FASTCALL(activate_page(struct page *)); 176 extern void FASTCALL(activate_page(struct page *));
177 extern void FASTCALL(mark_page_accessed(struct page *)); 177 extern void FASTCALL(mark_page_accessed(struct page *));
178 extern void lru_add_drain(void); 178 extern void lru_add_drain(void);
179 extern int lru_add_drain_all(void); 179 extern int lru_add_drain_all(void);
180 extern int rotate_reclaimable_page(struct page *page); 180 extern int rotate_reclaimable_page(struct page *page);
181 extern void swap_setup(void); 181 extern void swap_setup(void);
182 182
183 /* linux/mm/vmscan.c */ 183 /* linux/mm/vmscan.c */
184 extern unsigned long try_to_free_pages(struct zone **, gfp_t); 184 extern unsigned long try_to_free_pages(struct zone **, gfp_t);
185 extern unsigned long shrink_all_memory(unsigned long nr_pages); 185 extern unsigned long shrink_all_memory(unsigned long nr_pages);
186 extern int vm_swappiness; 186 extern int vm_swappiness;
187 extern int remove_mapping(struct address_space *mapping, struct page *page); 187 extern int remove_mapping(struct address_space *mapping, struct page *page);
188 188
189 /* possible outcome of pageout() */
190 typedef enum {
191 /* failed to write page out, page is locked */
192 PAGE_KEEP,
193 /* move page to the active list, page is locked */
194 PAGE_ACTIVATE,
195 /* page has been sent to the disk successfully, page is unlocked */
196 PAGE_SUCCESS,
197 /* page is clean and locked */
198 PAGE_CLEAN,
199 } pageout_t;
200
201 extern pageout_t pageout(struct page *page, struct address_space *mapping);
202
203 #ifdef CONFIG_NUMA 189 #ifdef CONFIG_NUMA
204 extern int zone_reclaim_mode; 190 extern int zone_reclaim_mode;
205 extern int zone_reclaim_interval; 191 extern int zone_reclaim_interval;
206 extern int zone_reclaim(struct zone *, gfp_t, unsigned int); 192 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
207 #else 193 #else
208 #define zone_reclaim_mode 0 194 #define zone_reclaim_mode 0
209 static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) 195 static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
210 { 196 {
211 return 0; 197 return 0;
212 } 198 }
213 #endif 199 #endif
214 200
215 #ifdef CONFIG_MMU 201 #ifdef CONFIG_MMU
216 /* linux/mm/shmem.c */ 202 /* linux/mm/shmem.c */
217 extern int shmem_unuse(swp_entry_t entry, struct page *page); 203 extern int shmem_unuse(swp_entry_t entry, struct page *page);
218 #endif /* CONFIG_MMU */ 204 #endif /* CONFIG_MMU */
219 205
220 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); 206 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
221 207
222 #ifdef CONFIG_SWAP 208 #ifdef CONFIG_SWAP
223 /* linux/mm/page_io.c */ 209 /* linux/mm/page_io.c */
224 extern int swap_readpage(struct file *, struct page *); 210 extern int swap_readpage(struct file *, struct page *);
225 extern int swap_writepage(struct page *page, struct writeback_control *wbc); 211 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
226 extern int rw_swap_page_sync(int, swp_entry_t, struct page *); 212 extern int rw_swap_page_sync(int, swp_entry_t, struct page *);
227 213
228 /* linux/mm/swap_state.c */ 214 /* linux/mm/swap_state.c */
229 extern struct address_space swapper_space; 215 extern struct address_space swapper_space;
230 #define total_swapcache_pages swapper_space.nrpages 216 #define total_swapcache_pages swapper_space.nrpages
231 extern void show_swap_cache_info(void); 217 extern void show_swap_cache_info(void);
232 extern int add_to_swap(struct page *, gfp_t); 218 extern int add_to_swap(struct page *, gfp_t);
233 extern void __delete_from_swap_cache(struct page *); 219 extern void __delete_from_swap_cache(struct page *);
234 extern void delete_from_swap_cache(struct page *); 220 extern void delete_from_swap_cache(struct page *);
235 extern int move_to_swap_cache(struct page *, swp_entry_t); 221 extern int move_to_swap_cache(struct page *, swp_entry_t);
236 extern int move_from_swap_cache(struct page *, unsigned long, 222 extern int move_from_swap_cache(struct page *, unsigned long,
237 struct address_space *); 223 struct address_space *);
238 extern void free_page_and_swap_cache(struct page *); 224 extern void free_page_and_swap_cache(struct page *);
239 extern void free_pages_and_swap_cache(struct page **, int); 225 extern void free_pages_and_swap_cache(struct page **, int);
240 extern struct page * lookup_swap_cache(swp_entry_t); 226 extern struct page * lookup_swap_cache(swp_entry_t);
241 extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, 227 extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma,
242 unsigned long addr); 228 unsigned long addr);
243 /* linux/mm/swapfile.c */ 229 /* linux/mm/swapfile.c */
244 extern long total_swap_pages; 230 extern long total_swap_pages;
245 extern unsigned int nr_swapfiles; 231 extern unsigned int nr_swapfiles;
246 extern void si_swapinfo(struct sysinfo *); 232 extern void si_swapinfo(struct sysinfo *);
247 extern swp_entry_t get_swap_page(void); 233 extern swp_entry_t get_swap_page(void);
248 extern swp_entry_t get_swap_page_of_type(int); 234 extern swp_entry_t get_swap_page_of_type(int);
249 extern int swap_duplicate(swp_entry_t); 235 extern int swap_duplicate(swp_entry_t);
250 extern int valid_swaphandles(swp_entry_t, unsigned long *); 236 extern int valid_swaphandles(swp_entry_t, unsigned long *);
251 extern void swap_free(swp_entry_t); 237 extern void swap_free(swp_entry_t);
252 extern void free_swap_and_cache(swp_entry_t); 238 extern void free_swap_and_cache(swp_entry_t);
253 extern int swap_type_of(dev_t); 239 extern int swap_type_of(dev_t);
254 extern unsigned int count_swap_pages(int, int); 240 extern unsigned int count_swap_pages(int, int);
255 extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); 241 extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
256 extern struct swap_info_struct *get_swap_info_struct(unsigned); 242 extern struct swap_info_struct *get_swap_info_struct(unsigned);
257 extern int can_share_swap_page(struct page *); 243 extern int can_share_swap_page(struct page *);
258 extern int remove_exclusive_swap_page(struct page *); 244 extern int remove_exclusive_swap_page(struct page *);
259 struct backing_dev_info; 245 struct backing_dev_info;
260 246
261 extern spinlock_t swap_lock; 247 extern spinlock_t swap_lock;
262 extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page);
263 248
264 /* linux/mm/thrash.c */ 249 /* linux/mm/thrash.c */
265 extern struct mm_struct * swap_token_mm; 250 extern struct mm_struct * swap_token_mm;
266 extern unsigned long swap_token_default_timeout; 251 extern unsigned long swap_token_default_timeout;
267 extern void grab_swap_token(void); 252 extern void grab_swap_token(void);
268 extern void __put_swap_token(struct mm_struct *); 253 extern void __put_swap_token(struct mm_struct *);
269 254
270 static inline int has_swap_token(struct mm_struct *mm) 255 static inline int has_swap_token(struct mm_struct *mm)
271 { 256 {
272 return (mm == swap_token_mm); 257 return (mm == swap_token_mm);
273 } 258 }
274 259
275 static inline void put_swap_token(struct mm_struct *mm) 260 static inline void put_swap_token(struct mm_struct *mm)
276 { 261 {
277 if (has_swap_token(mm)) 262 if (has_swap_token(mm))
278 __put_swap_token(mm); 263 __put_swap_token(mm);
279 } 264 }
280 265
281 static inline void disable_swap_token(void) 266 static inline void disable_swap_token(void)
282 { 267 {
283 put_swap_token(swap_token_mm); 268 put_swap_token(swap_token_mm);
284 } 269 }
285 270
286 #else /* CONFIG_SWAP */ 271 #else /* CONFIG_SWAP */
287 272
288 #define total_swap_pages 0 273 #define total_swap_pages 0
289 #define total_swapcache_pages 0UL 274 #define total_swapcache_pages 0UL
290 275
291 #define si_swapinfo(val) \ 276 #define si_swapinfo(val) \
292 do { (val)->freeswap = (val)->totalswap = 0; } while (0) 277 do { (val)->freeswap = (val)->totalswap = 0; } while (0)
293 /* only sparc can not include linux/pagemap.h in this file 278 /* only sparc can not include linux/pagemap.h in this file
294 * so leave page_cache_release and release_pages undeclared... */ 279 * so leave page_cache_release and release_pages undeclared... */
295 #define free_page_and_swap_cache(page) \ 280 #define free_page_and_swap_cache(page) \
296 page_cache_release(page) 281 page_cache_release(page)
297 #define free_pages_and_swap_cache(pages, nr) \ 282 #define free_pages_and_swap_cache(pages, nr) \
298 release_pages((pages), (nr), 0); 283 release_pages((pages), (nr), 0);
299 284
300 #define show_swap_cache_info() /*NOTHING*/ 285 #define show_swap_cache_info() /*NOTHING*/
301 #define free_swap_and_cache(swp) /*NOTHING*/ 286 #define free_swap_and_cache(swp) /*NOTHING*/
302 #define swap_duplicate(swp) /*NOTHING*/ 287 #define swap_duplicate(swp) /*NOTHING*/
303 #define swap_free(swp) /*NOTHING*/ 288 #define swap_free(swp) /*NOTHING*/
304 #define read_swap_cache_async(swp,vma,addr) NULL 289 #define read_swap_cache_async(swp,vma,addr) NULL
305 #define lookup_swap_cache(swp) NULL 290 #define lookup_swap_cache(swp) NULL
306 #define valid_swaphandles(swp, off) 0 291 #define valid_swaphandles(swp, off) 0
307 #define can_share_swap_page(p) (page_mapcount(p) == 1) 292 #define can_share_swap_page(p) (page_mapcount(p) == 1)
308 #define move_to_swap_cache(p, swp) 1 293 #define move_to_swap_cache(p, swp) 1
309 #define move_from_swap_cache(p, i, m) 1 294 #define move_from_swap_cache(p, i, m) 1
310 #define __delete_from_swap_cache(p) /*NOTHING*/ 295 #define __delete_from_swap_cache(p) /*NOTHING*/
311 #define delete_from_swap_cache(p) /*NOTHING*/ 296 #define delete_from_swap_cache(p) /*NOTHING*/
312 #define swap_token_default_timeout 0 297 #define swap_token_default_timeout 0
313 298
314 static inline int remove_exclusive_swap_page(struct page *p) 299 static inline int remove_exclusive_swap_page(struct page *p)
315 { 300 {
316 return 0; 301 return 0;
317 } 302 }
318 303
319 static inline swp_entry_t get_swap_page(void) 304 static inline swp_entry_t get_swap_page(void)
320 { 305 {
321 swp_entry_t entry; 306 swp_entry_t entry;
322 entry.val = 0; 307 entry.val = 0;
323 return entry; 308 return entry;
324 } 309 }
325 310
326 /* linux/mm/thrash.c */ 311 /* linux/mm/thrash.c */
327 #define put_swap_token(x) do { } while(0) 312 #define put_swap_token(x) do { } while(0)
328 #define grab_swap_token() do { } while(0) 313 #define grab_swap_token() do { } while(0)
329 #define has_swap_token(x) 0 314 #define has_swap_token(x) 0
330 #define disable_swap_token() do { } while(0) 315 #define disable_swap_token() do { } while(0)
331 316
332 #endif /* CONFIG_SWAP */ 317 #endif /* CONFIG_SWAP */
333 #endif /* __KERNEL__*/ 318 #endif /* __KERNEL__*/
334 #endif /* _LINUX_SWAP_H */ 319 #endif /* _LINUX_SWAP_H */
335 320

mm/migrate.c
1 /* 1 /*
2 * Memory Migration functionality - linux/mm/migration.c 2 * Memory Migration functionality - linux/mm/migration.c
3 * 3 *
4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter 4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5 * 5 *
6 * Page migration was first developed in the context of the memory hotplug 6 * Page migration was first developed in the context of the memory hotplug
7 * project. The main authors of the migration code are: 7 * project. The main authors of the migration code are:
8 * 8 *
9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> 9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10 * Hirokazu Takahashi <taka@valinux.co.jp> 10 * Hirokazu Takahashi <taka@valinux.co.jp>
11 * Dave Hansen <haveblue@us.ibm.com> 11 * Dave Hansen <haveblue@us.ibm.com>
12 * Christoph Lameter <clameter@sgi.com> 12 * Christoph Lameter <clameter@sgi.com>
13 */ 13 */
14 14
15 #include <linux/migrate.h> 15 #include <linux/migrate.h>
16 #include <linux/module.h> 16 #include <linux/module.h>
17 #include <linux/swap.h> 17 #include <linux/swap.h>
18 #include <linux/swapops.h> 18 #include <linux/swapops.h>
19 #include <linux/pagemap.h> 19 #include <linux/pagemap.h>
20 #include <linux/buffer_head.h> 20 #include <linux/buffer_head.h>
21 #include <linux/mm_inline.h> 21 #include <linux/mm_inline.h>
22 #include <linux/pagevec.h> 22 #include <linux/pagevec.h>
23 #include <linux/rmap.h> 23 #include <linux/rmap.h>
24 #include <linux/topology.h> 24 #include <linux/topology.h>
25 #include <linux/cpu.h> 25 #include <linux/cpu.h>
26 #include <linux/cpuset.h> 26 #include <linux/cpuset.h>
27 #include <linux/writeback.h>
27 28
28 #include "internal.h" 29 #include "internal.h"
29 30
30 /* The maximum number of pages to take off the LRU for migration */ 31 /* The maximum number of pages to take off the LRU for migration */
31 #define MIGRATE_CHUNK_SIZE 256 32 #define MIGRATE_CHUNK_SIZE 256
32 33
33 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 34 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
34 35
35 /* 36 /*
36 * Isolate one page from the LRU lists. If successful put it onto 37 * Isolate one page from the LRU lists. If successful put it onto
37 * the indicated list with elevated page count. 38 * the indicated list with elevated page count.
38 * 39 *
39 * Result: 40 * Result:
40 * -EBUSY: page not on LRU list 41 * -EBUSY: page not on LRU list
41 * 0: page removed from LRU list and added to the specified list. 42 * 0: page removed from LRU list and added to the specified list.
42 */ 43 */
43 int isolate_lru_page(struct page *page, struct list_head *pagelist) 44 int isolate_lru_page(struct page *page, struct list_head *pagelist)
44 { 45 {
45 int ret = -EBUSY; 46 int ret = -EBUSY;
46 47
47 if (PageLRU(page)) { 48 if (PageLRU(page)) {
48 struct zone *zone = page_zone(page); 49 struct zone *zone = page_zone(page);
49 50
50 spin_lock_irq(&zone->lru_lock); 51 spin_lock_irq(&zone->lru_lock);
51 if (PageLRU(page)) { 52 if (PageLRU(page)) {
52 ret = 0; 53 ret = 0;
53 get_page(page); 54 get_page(page);
54 ClearPageLRU(page); 55 ClearPageLRU(page);
55 if (PageActive(page)) 56 if (PageActive(page))
56 del_page_from_active_list(zone, page); 57 del_page_from_active_list(zone, page);
57 else 58 else
58 del_page_from_inactive_list(zone, page); 59 del_page_from_inactive_list(zone, page);
59 list_add_tail(&page->lru, pagelist); 60 list_add_tail(&page->lru, pagelist);
60 } 61 }
61 spin_unlock_irq(&zone->lru_lock); 62 spin_unlock_irq(&zone->lru_lock);
62 } 63 }
63 return ret; 64 return ret;
64 } 65 }
65 66
66 /* 67 /*
67 * migrate_prep() needs to be called after we have compiled the list of pages 68 * migrate_prep() needs to be called after we have compiled the list of pages
68 * to be migrated using isolate_lru_page() but before we begin a series of calls 69 * to be migrated using isolate_lru_page() but before we begin a series of calls
69 * to migrate_pages(). 70 * to migrate_pages().
70 */ 71 */
71 int migrate_prep(void) 72 int migrate_prep(void)
72 { 73 {
73 /* 74 /*
74 * Clear the LRU lists so pages can be isolated. 75 * Clear the LRU lists so pages can be isolated.
75 * Note that pages may be moved off the LRU after we have 76 * Note that pages may be moved off the LRU after we have
76 * drained them. Those pages will fail to migrate like other 77 * drained them. Those pages will fail to migrate like other
77 * pages that may be busy. 78 * pages that may be busy.
78 */ 79 */
79 lru_add_drain_all(); 80 lru_add_drain_all();
80 81
81 return 0; 82 return 0;
82 } 83 }
83 84
84 static inline void move_to_lru(struct page *page) 85 static inline void move_to_lru(struct page *page)
85 { 86 {
86 list_del(&page->lru); 87 list_del(&page->lru);
87 if (PageActive(page)) { 88 if (PageActive(page)) {
88 /* 89 /*
89 * lru_cache_add_active checks that 90 * lru_cache_add_active checks that
90 * the PG_active bit is off. 91 * the PG_active bit is off.
91 */ 92 */
92 ClearPageActive(page); 93 ClearPageActive(page);
93 lru_cache_add_active(page); 94 lru_cache_add_active(page);
94 } else { 95 } else {
95 lru_cache_add(page); 96 lru_cache_add(page);
96 } 97 }
97 put_page(page); 98 put_page(page);
98 } 99 }
99 100
100 /* 101 /*
101 * Add isolated pages on the list back to the LRU. 102 * Add isolated pages on the list back to the LRU.
102 * 103 *
103 * returns the number of pages put back. 104 * returns the number of pages put back.
104 */ 105 */
105 int putback_lru_pages(struct list_head *l) 106 int putback_lru_pages(struct list_head *l)
106 { 107 {
107 struct page *page; 108 struct page *page;
108 struct page *page2; 109 struct page *page2;
109 int count = 0; 110 int count = 0;
110 111
111 list_for_each_entry_safe(page, page2, l, lru) { 112 list_for_each_entry_safe(page, page2, l, lru) {
112 move_to_lru(page); 113 move_to_lru(page);
113 count++; 114 count++;
114 } 115 }
115 return count; 116 return count;
116 } 117 }
117 118
118 static inline int is_swap_pte(pte_t pte) 119 static inline int is_swap_pte(pte_t pte)
119 { 120 {
120 return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); 121 return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
121 } 122 }
122 123
123 /* 124 /*
124 * Restore a potential migration pte to a working pte entry 125 * Restore a potential migration pte to a working pte entry
125 */ 126 */
126 static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, 127 static void remove_migration_pte(struct vm_area_struct *vma,
127 struct page *old, struct page *new) 128 struct page *old, struct page *new)
128 { 129 {
129 struct mm_struct *mm = vma->vm_mm; 130 struct mm_struct *mm = vma->vm_mm;
130 swp_entry_t entry; 131 swp_entry_t entry;
131 pgd_t *pgd; 132 pgd_t *pgd;
132 pud_t *pud; 133 pud_t *pud;
133 pmd_t *pmd; 134 pmd_t *pmd;
134 pte_t *ptep, pte; 135 pte_t *ptep, pte;
135 spinlock_t *ptl; 136 spinlock_t *ptl;
137 unsigned long addr = page_address_in_vma(new, vma);
136 138
139 if (addr == -EFAULT)
140 return;
141
137 pgd = pgd_offset(mm, addr); 142 pgd = pgd_offset(mm, addr);
138 if (!pgd_present(*pgd)) 143 if (!pgd_present(*pgd))
139 return; 144 return;
140 145
141 pud = pud_offset(pgd, addr); 146 pud = pud_offset(pgd, addr);
142 if (!pud_present(*pud)) 147 if (!pud_present(*pud))
143 return; 148 return;
144 149
145 pmd = pmd_offset(pud, addr); 150 pmd = pmd_offset(pud, addr);
146 if (!pmd_present(*pmd)) 151 if (!pmd_present(*pmd))
147 return; 152 return;
148 153
149 ptep = pte_offset_map(pmd, addr); 154 ptep = pte_offset_map(pmd, addr);
150 155
151 if (!is_swap_pte(*ptep)) { 156 if (!is_swap_pte(*ptep)) {
152 pte_unmap(ptep); 157 pte_unmap(ptep);
153 return; 158 return;
154 } 159 }
155 160
156 ptl = pte_lockptr(mm, pmd); 161 ptl = pte_lockptr(mm, pmd);
157 spin_lock(ptl); 162 spin_lock(ptl);
158 pte = *ptep; 163 pte = *ptep;
159 if (!is_swap_pte(pte)) 164 if (!is_swap_pte(pte))
160 goto out; 165 goto out;
161 166
162 entry = pte_to_swp_entry(pte); 167 entry = pte_to_swp_entry(pte);
163 168
164 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) 169 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
165 goto out; 170 goto out;
166 171
167 get_page(new); 172 get_page(new);
168 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 173 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
169 if (is_write_migration_entry(entry)) 174 if (is_write_migration_entry(entry))
170 pte = pte_mkwrite(pte); 175 pte = pte_mkwrite(pte);
171 set_pte_at(mm, addr, ptep, pte); 176 set_pte_at(mm, addr, ptep, pte);
172 page_add_anon_rmap(new, vma, addr); 177
178 if (PageAnon(new))
179 page_add_anon_rmap(new, vma, addr);
180 else
181 page_add_file_rmap(new);
182
183 /* No need to invalidate - it was non-present before */
184 update_mmu_cache(vma, addr, pte);
185 lazy_mmu_prot_update(pte);
186
173 out: 187 out:
174 pte_unmap_unlock(ptep, ptl); 188 pte_unmap_unlock(ptep, ptl);
175 } 189 }
176 190
177 /* 191 /*
178 * Get rid of all migration entries and replace them by 192 * Note that remove_file_migration_ptes will only work on regular mappings,
179 * references to the indicated page. 193 * Nonlinear mappings do not use migration entries.
180 * 194 */
195 static void remove_file_migration_ptes(struct page *old, struct page *new)
196 {
197 struct vm_area_struct *vma;
198 struct address_space *mapping = page_mapping(new);
199 struct prio_tree_iter iter;
200 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
201
202 if (!mapping)
203 return;
204
205 spin_lock(&mapping->i_mmap_lock);
206
207 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
208 remove_migration_pte(vma, old, new);
209
210 spin_unlock(&mapping->i_mmap_lock);
211 }
212
213 /*
181 * Must hold mmap_sem lock on at least one of the vmas containing 214 * Must hold mmap_sem lock on at least one of the vmas containing
182 * the page so that the anon_vma cannot vanish. 215 * the page so that the anon_vma cannot vanish.
183 */ 216 */
184 static void remove_migration_ptes(struct page *old, struct page *new) 217 static void remove_anon_migration_ptes(struct page *old, struct page *new)
185 { 218 {
186 struct anon_vma *anon_vma; 219 struct anon_vma *anon_vma;
187 struct vm_area_struct *vma; 220 struct vm_area_struct *vma;
188 unsigned long mapping; 221 unsigned long mapping;
189 222
190 mapping = (unsigned long)new->mapping; 223 mapping = (unsigned long)new->mapping;
191 224
192 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) 225 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
193 return; 226 return;
194 227
195 /* 228 /*
196 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. 229 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
197 */ 230 */
198 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); 231 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
199 spin_lock(&anon_vma->lock); 232 spin_lock(&anon_vma->lock);
200 233
201 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) 234 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
202 remove_migration_pte(vma, page_address_in_vma(new, vma), 235 remove_migration_pte(vma, old, new);
203 old, new);
204 236
205 spin_unlock(&anon_vma->lock); 237 spin_unlock(&anon_vma->lock);
206 } 238 }
207 239
208 /* 240 /*
241 * Get rid of all migration entries and replace them by
242 * references to the indicated page.
243 */
244 static void remove_migration_ptes(struct page *old, struct page *new)
245 {
246 if (PageAnon(new))
247 remove_anon_migration_ptes(old, new);
248 else
249 remove_file_migration_ptes(old, new);
250 }
251
252 /*
209 * Something used the pte of a page under migration. We need to 253 * Something used the pte of a page under migration. We need to
210 * get to the page and wait until migration is finished. 254 * get to the page and wait until migration is finished.
211 * When we return from this function the fault will be retried. 255 * When we return from this function the fault will be retried.
212 * 256 *
213 * This function is called from do_swap_page(). 257 * This function is called from do_swap_page().
214 */ 258 */
215 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 259 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
216 unsigned long address) 260 unsigned long address)
217 { 261 {
218 pte_t *ptep, pte; 262 pte_t *ptep, pte;
219 spinlock_t *ptl; 263 spinlock_t *ptl;
220 swp_entry_t entry; 264 swp_entry_t entry;
221 struct page *page; 265 struct page *page;
222 266
223 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 267 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
224 pte = *ptep; 268 pte = *ptep;
225 if (!is_swap_pte(pte)) 269 if (!is_swap_pte(pte))
226 goto out; 270 goto out;
227 271
228 entry = pte_to_swp_entry(pte); 272 entry = pte_to_swp_entry(pte);
229 if (!is_migration_entry(entry)) 273 if (!is_migration_entry(entry))
230 goto out; 274 goto out;
231 275
232 page = migration_entry_to_page(entry); 276 page = migration_entry_to_page(entry);
233 277
234 get_page(page); 278 get_page(page);
235 pte_unmap_unlock(ptep, ptl); 279 pte_unmap_unlock(ptep, ptl);
236 wait_on_page_locked(page); 280 wait_on_page_locked(page);
237 put_page(page); 281 put_page(page);
238 return; 282 return;
239 out: 283 out:
240 pte_unmap_unlock(ptep, ptl); 284 pte_unmap_unlock(ptep, ptl);
241 } 285 }
242 286
243 /* 287 /*
244 * Replace the page in the mapping. 288 * Replace the page in the mapping.
245 * 289 *
246 * The number of remaining references must be: 290 * The number of remaining references must be:
247 * 1 for anonymous pages without a mapping 291 * 1 for anonymous pages without a mapping
248 * 2 for pages with a mapping 292 * 2 for pages with a mapping
249 * 3 for pages with a mapping and PagePrivate set. 293 * 3 for pages with a mapping and PagePrivate set.
250 */ 294 */
251 static int migrate_page_move_mapping(struct address_space *mapping, 295 static int migrate_page_move_mapping(struct address_space *mapping,
252 struct page *newpage, struct page *page) 296 struct page *newpage, struct page *page)
253 { 297 {
254 struct page **radix_pointer; 298 struct page **radix_pointer;
255 299
256 if (!mapping) { 300 if (!mapping) {
257 /* Anonymous page */ 301 /* Anonymous page */
258 if (page_count(page) != 1) 302 if (page_count(page) != 1)
259 return -EAGAIN; 303 return -EAGAIN;
260 return 0; 304 return 0;
261 } 305 }
262 306
263 write_lock_irq(&mapping->tree_lock); 307 write_lock_irq(&mapping->tree_lock);
264 308
265 radix_pointer = (struct page **)radix_tree_lookup_slot( 309 radix_pointer = (struct page **)radix_tree_lookup_slot(
266 &mapping->page_tree, 310 &mapping->page_tree,
267 page_index(page)); 311 page_index(page));
268 312
269 if (page_count(page) != 2 + !!PagePrivate(page) || 313 if (page_count(page) != 2 + !!PagePrivate(page) ||
270 *radix_pointer != page) { 314 *radix_pointer != page) {
271 write_unlock_irq(&mapping->tree_lock); 315 write_unlock_irq(&mapping->tree_lock);
272 return -EAGAIN; 316 return -EAGAIN;
273 } 317 }
274 318
275 /* 319 /*
276 * Now we know that no one else is looking at the page. 320 * Now we know that no one else is looking at the page.
277 */ 321 */
278 get_page(newpage); 322 get_page(newpage);
279 #ifdef CONFIG_SWAP 323 #ifdef CONFIG_SWAP
280 if (PageSwapCache(page)) { 324 if (PageSwapCache(page)) {
281 SetPageSwapCache(newpage); 325 SetPageSwapCache(newpage);
282 set_page_private(newpage, page_private(page)); 326 set_page_private(newpage, page_private(page));
283 } 327 }
284 #endif 328 #endif
285 329
286 *radix_pointer = newpage; 330 *radix_pointer = newpage;
287 __put_page(page); 331 __put_page(page);
288 write_unlock_irq(&mapping->tree_lock); 332 write_unlock_irq(&mapping->tree_lock);
289 333
290 return 0; 334 return 0;
291 } 335 }
292 336
293 /* 337 /*
294 * Copy the page to its new location 338 * Copy the page to its new location
295 */ 339 */
296 static void migrate_page_copy(struct page *newpage, struct page *page) 340 static void migrate_page_copy(struct page *newpage, struct page *page)
297 { 341 {
298 copy_highpage(newpage, page); 342 copy_highpage(newpage, page);
299 343
300 if (PageError(page)) 344 if (PageError(page))
301 SetPageError(newpage); 345 SetPageError(newpage);
302 if (PageReferenced(page)) 346 if (PageReferenced(page))
303 SetPageReferenced(newpage); 347 SetPageReferenced(newpage);
304 if (PageUptodate(page)) 348 if (PageUptodate(page))
305 SetPageUptodate(newpage); 349 SetPageUptodate(newpage);
306 if (PageActive(page)) 350 if (PageActive(page))
307 SetPageActive(newpage); 351 SetPageActive(newpage);
308 if (PageChecked(page)) 352 if (PageChecked(page))
309 SetPageChecked(newpage); 353 SetPageChecked(newpage);
310 if (PageMappedToDisk(page)) 354 if (PageMappedToDisk(page))
311 SetPageMappedToDisk(newpage); 355 SetPageMappedToDisk(newpage);
312 356
313 if (PageDirty(page)) { 357 if (PageDirty(page)) {
314 clear_page_dirty_for_io(page); 358 clear_page_dirty_for_io(page);
315 set_page_dirty(newpage); 359 set_page_dirty(newpage);
316 } 360 }
317 361
318 #ifdef CONFIG_SWAP 362 #ifdef CONFIG_SWAP
319 ClearPageSwapCache(page); 363 ClearPageSwapCache(page);
320 #endif 364 #endif
321 ClearPageActive(page); 365 ClearPageActive(page);
322 ClearPagePrivate(page); 366 ClearPagePrivate(page);
323 set_page_private(page, 0); 367 set_page_private(page, 0);
324 page->mapping = NULL; 368 page->mapping = NULL;
325 369
326 /* 370 /*
327 * If any waiters have accumulated on the new page then 371 * If any waiters have accumulated on the new page then
328 * wake them up. 372 * wake them up.
329 */ 373 */
330 if (PageWriteback(newpage)) 374 if (PageWriteback(newpage))
331 end_page_writeback(newpage); 375 end_page_writeback(newpage);
332 } 376 }
333 377
334 /************************************************************ 378 /************************************************************
335 * Migration functions 379 * Migration functions
336 ***********************************************************/ 380 ***********************************************************/
337 381
338 /* Always fail migration. Used for mappings that are not movable */ 382 /* Always fail migration. Used for mappings that are not movable */
339 int fail_migrate_page(struct address_space *mapping, 383 int fail_migrate_page(struct address_space *mapping,
340 struct page *newpage, struct page *page) 384 struct page *newpage, struct page *page)
341 { 385 {
342 return -EIO; 386 return -EIO;
343 } 387 }
344 EXPORT_SYMBOL(fail_migrate_page); 388 EXPORT_SYMBOL(fail_migrate_page);
345 389
346 /* 390 /*
347 * Common logic to directly migrate a single page suitable for 391 * Common logic to directly migrate a single page suitable for
348 * pages that do not use PagePrivate. 392 * pages that do not use PagePrivate.
349 * 393 *
350 * Pages are locked upon entry and exit. 394 * Pages are locked upon entry and exit.
351 */ 395 */
352 int migrate_page(struct address_space *mapping, 396 int migrate_page(struct address_space *mapping,
353 struct page *newpage, struct page *page) 397 struct page *newpage, struct page *page)
354 { 398 {
355 int rc; 399 int rc;
356 400
357 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 401 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
358 402
359 rc = migrate_page_move_mapping(mapping, newpage, page); 403 rc = migrate_page_move_mapping(mapping, newpage, page);
360 404
361 if (rc) 405 if (rc)
362 return rc; 406 return rc;
363 407
364 migrate_page_copy(newpage, page); 408 migrate_page_copy(newpage, page);
365 return 0; 409 return 0;
366 } 410 }
367 EXPORT_SYMBOL(migrate_page); 411 EXPORT_SYMBOL(migrate_page);
368 412
369 /* 413 /*
370 * Migration function for pages with buffers. This function can only be used 414 * Migration function for pages with buffers. This function can only be used
371 * if the underlying filesystem guarantees that no other references to "page" 415 * if the underlying filesystem guarantees that no other references to "page"
372 * exist. 416 * exist.
373 */ 417 */
374 int buffer_migrate_page(struct address_space *mapping, 418 int buffer_migrate_page(struct address_space *mapping,
375 struct page *newpage, struct page *page) 419 struct page *newpage, struct page *page)
376 { 420 {
377 struct buffer_head *bh, *head; 421 struct buffer_head *bh, *head;
378 int rc; 422 int rc;
379 423
380 if (!page_has_buffers(page)) 424 if (!page_has_buffers(page))
381 return migrate_page(mapping, newpage, page); 425 return migrate_page(mapping, newpage, page);
382 426
383 head = page_buffers(page); 427 head = page_buffers(page);
384 428
385 rc = migrate_page_move_mapping(mapping, newpage, page); 429 rc = migrate_page_move_mapping(mapping, newpage, page);
386 430
387 if (rc) 431 if (rc)
388 return rc; 432 return rc;
389 433
390 bh = head; 434 bh = head;
391 do { 435 do {
392 get_bh(bh); 436 get_bh(bh);
393 lock_buffer(bh); 437 lock_buffer(bh);
394 bh = bh->b_this_page; 438 bh = bh->b_this_page;
395 439
396 } while (bh != head); 440 } while (bh != head);
397 441
398 ClearPagePrivate(page); 442 ClearPagePrivate(page);
399 set_page_private(newpage, page_private(page)); 443 set_page_private(newpage, page_private(page));
400 set_page_private(page, 0); 444 set_page_private(page, 0);
401 put_page(page); 445 put_page(page);
402 get_page(newpage); 446 get_page(newpage);
403 447
404 bh = head; 448 bh = head;
405 do { 449 do {
406 set_bh_page(bh, newpage, bh_offset(bh)); 450 set_bh_page(bh, newpage, bh_offset(bh));
407 bh = bh->b_this_page; 451 bh = bh->b_this_page;
408 452
409 } while (bh != head); 453 } while (bh != head);
410 454
411 SetPagePrivate(newpage); 455 SetPagePrivate(newpage);
412 456
413 migrate_page_copy(newpage, page); 457 migrate_page_copy(newpage, page);
414 458
415 bh = head; 459 bh = head;
416 do { 460 do {
417 unlock_buffer(bh); 461 unlock_buffer(bh);
418 put_bh(bh); 462 put_bh(bh);
419 bh = bh->b_this_page; 463 bh = bh->b_this_page;
420 464
421 } while (bh != head); 465 } while (bh != head);
422 466
423 return 0; 467 return 0;
424 } 468 }
425 EXPORT_SYMBOL(buffer_migrate_page); 469 EXPORT_SYMBOL(buffer_migrate_page);
426 470
427 static int fallback_migrate_page(struct address_space *mapping, 471 /*
428 struct page *newpage, struct page *page) 472 * Writeback a page to clean the dirty state
473 */
474 static int writeout(struct address_space *mapping, struct page *page)
429 { 475 {
476 struct writeback_control wbc = {
477 .sync_mode = WB_SYNC_NONE,
478 .nr_to_write = 1,
479 .range_start = 0,
480 .range_end = LLONG_MAX,
481 .nonblocking = 1,
482 .for_reclaim = 1
483 };
484 int rc;
485
486 if (!mapping->a_ops->writepage)
487 /* No write method for the address space */
488 return -EINVAL;
489
490 if (!clear_page_dirty_for_io(page))
491 /* Someone else already triggered a write */
492 return -EAGAIN;
493
430 /* 494 /*
431 * Default handling if a filesystem does not provide 495 * A dirty page may imply that the underlying filesystem has
432 * a migration function. We can only migrate clean 496 * the page on some queue. So the page must be clean for
433 * pages so try to write out any dirty pages first. 497 * migration. Writeout may mean we loose the lock and the
498 * page state is no longer what we checked for earlier.
499 * At this point we know that the migration attempt cannot
500 * be successful.
434 */ 501 */
435 if (PageDirty(page)) { 502 remove_migration_ptes(page, page);
436 switch (pageout(page, mapping)) {
437 case PAGE_KEEP:
438 case PAGE_ACTIVATE:
439 return -EAGAIN;
440 503
441 case PAGE_SUCCESS: 504 rc = mapping->a_ops->writepage(page, &wbc);
442 /* Relock since we lost the lock */ 505 if (rc < 0)
443 lock_page(page); 506 /* I/O Error writing */
444 /* Must retry since page state may have changed */ 507 return -EIO;
445 return -EAGAIN;
446 508
447 case PAGE_CLEAN: 509 if (rc != AOP_WRITEPAGE_ACTIVATE)
448 ; /* try to migrate the page below */ 510 /* unlocked. Relock */
449 } 511 lock_page(page);
450 } 512
513 return -EAGAIN;
514 }
515
516 /*
517 * Default handling if a filesystem does not provide a migration function.
518 */
519 static int fallback_migrate_page(struct address_space *mapping,
520 struct page *newpage, struct page *page)
521 {
522 if (PageDirty(page))
523 return writeout(mapping, page);
451 524
452 /* 525 /*
453 * Buffers may be managed in a filesystem specific way. 526 * Buffers may be managed in a filesystem specific way.
454 * We must have no buffers or drop them. 527 * We must have no buffers or drop them.
455 */ 528 */
456 if (page_has_buffers(page) && 529 if (page_has_buffers(page) &&
457 !try_to_release_page(page, GFP_KERNEL)) 530 !try_to_release_page(page, GFP_KERNEL))
458 return -EAGAIN; 531 return -EAGAIN;
459 532
460 return migrate_page(mapping, newpage, page); 533 return migrate_page(mapping, newpage, page);
461 } 534 }
462 535
463 /* 536 /*
464 * migrate_pages 537 * migrate_pages
465 * 538 *
466 * Two lists are passed to this function. The first list 539 * Two lists are passed to this function. The first list
467 * contains the pages isolated from the LRU to be migrated. 540 * contains the pages isolated from the LRU to be migrated.
468 * The second list contains new pages that the pages isolated 541 * The second list contains new pages that the pages isolated
469 * can be moved to. 542 * can be moved to.
470 * 543 *
471 * The function returns after 10 attempts or if no pages 544 * The function returns after 10 attempts or if no pages
472 * are movable anymore because to has become empty 545 * are movable anymore because to has become empty
473 * or no retryable pages exist anymore. 546 * or no retryable pages exist anymore.
474 * 547 *
475 * Return: Number of pages not migrated when "to" ran empty. 548 * Return: Number of pages not migrated when "to" ran empty.
476 */ 549 */
477 int migrate_pages(struct list_head *from, struct list_head *to, 550 int migrate_pages(struct list_head *from, struct list_head *to,
478 struct list_head *moved, struct list_head *failed) 551 struct list_head *moved, struct list_head *failed)
479 { 552 {
480 int retry; 553 int retry;
481 int nr_failed = 0; 554 int nr_failed = 0;
482 int pass = 0; 555 int pass = 0;
483 struct page *page; 556 struct page *page;
484 struct page *page2; 557 struct page *page2;
485 int swapwrite = current->flags & PF_SWAPWRITE; 558 int swapwrite = current->flags & PF_SWAPWRITE;
486 int rc; 559 int rc;
487 560
488 if (!swapwrite) 561 if (!swapwrite)
489 current->flags |= PF_SWAPWRITE; 562 current->flags |= PF_SWAPWRITE;
490 563
491 redo: 564 redo:
492 retry = 0; 565 retry = 0;
493 566
494 list_for_each_entry_safe(page, page2, from, lru) { 567 list_for_each_entry_safe(page, page2, from, lru) {
495 struct page *newpage = NULL; 568 struct page *newpage = NULL;
496 struct address_space *mapping; 569 struct address_space *mapping;
497 570
498 cond_resched(); 571 cond_resched();
499 572
500 rc = 0; 573 rc = 0;
501 if (page_count(page) == 1) 574 if (page_count(page) == 1)
502 /* page was freed from under us. So we are done. */ 575 /* page was freed from under us. So we are done. */
503 goto next; 576 goto next;
504 577
505 if (to && list_empty(to)) 578 if (to && list_empty(to))
506 break; 579 break;
507 580
508 /* 581 /*
509 * Skip locked pages during the first two passes to give the 582 * Skip locked pages during the first two passes to give the
510 * functions holding the lock time to release the page. Later we 583 * functions holding the lock time to release the page. Later we
511 * use lock_page() to have a higher chance of acquiring the 584 * use lock_page() to have a higher chance of acquiring the
512 * lock. 585 * lock.
513 */ 586 */
514 rc = -EAGAIN; 587 rc = -EAGAIN;
515 if (pass > 2) 588 if (pass > 2)
516 lock_page(page); 589 lock_page(page);
517 else 590 else
518 if (TestSetPageLocked(page)) 591 if (TestSetPageLocked(page))
519 goto next; 592 goto next;
520 593
521 /* 594 /*
522 * Only wait on writeback if we have already done a pass where 595 * Only wait on writeback if we have already done a pass where
523 * we we may have triggered writeouts for lots of pages. 596 * we we may have triggered writeouts for lots of pages.
524 */ 597 */
525 if (pass > 0) 598 if (pass > 0)
526 wait_on_page_writeback(page); 599 wait_on_page_writeback(page);
527 else 600 else
528 if (PageWriteback(page)) 601 if (PageWriteback(page))
529 goto unlock_page; 602 goto unlock_page;
530 603
531 /* 604 /*
532 * Establish migration ptes or remove ptes 605 * Establish migration ptes or remove ptes
533 */ 606 */
534 rc = -EPERM; 607 rc = -EPERM;
535 if (try_to_unmap(page, 1) == SWAP_FAIL) 608 if (try_to_unmap(page, 1) == SWAP_FAIL)
536 /* A vma has VM_LOCKED set -> permanent failure */ 609 /* A vma has VM_LOCKED set -> permanent failure */
537 goto unlock_page; 610 goto unlock_page;
538 611
539 rc = -EAGAIN; 612 rc = -EAGAIN;
540 if (page_mapped(page)) 613 if (page_mapped(page))
541 goto unlock_page; 614 goto unlock_page;
542 615
543 newpage = lru_to_page(to); 616 newpage = lru_to_page(to);
544 lock_page(newpage); 617 lock_page(newpage);
545 /* Prepare mapping for the new page.*/ 618 /* Prepare mapping for the new page.*/
546 newpage->index = page->index; 619 newpage->index = page->index;
547 newpage->mapping = page->mapping; 620 newpage->mapping = page->mapping;
548 621
549 /* 622 /*
550 * Pages are properly locked and writeback is complete. 623 * Pages are properly locked and writeback is complete.
551 * Try to migrate the page. 624 * Try to migrate the page.
552 */ 625 */
553 mapping = page_mapping(page); 626 mapping = page_mapping(page);
554 if (!mapping) 627 if (!mapping)
555 rc = migrate_page(mapping, newpage, page); 628 rc = migrate_page(mapping, newpage, page);
556 629
557 else if (mapping->a_ops->migratepage) 630 else if (mapping->a_ops->migratepage)
558 /* 631 /*
559 * Most pages have a mapping and most filesystems 632 * Most pages have a mapping and most filesystems
560 * should provide a migration function. Anonymous 633 * should provide a migration function. Anonymous
561 * pages are part of swap space which also has its 634 * pages are part of swap space which also has its
562 * own migration function. This is the most common 635 * own migration function. This is the most common
563 * path for page migration. 636 * path for page migration.
564 */ 637 */
565 rc = mapping->a_ops->migratepage(mapping, 638 rc = mapping->a_ops->migratepage(mapping,
566 newpage, page); 639 newpage, page);
567 else 640 else
568 rc = fallback_migrate_page(mapping, newpage, page); 641 rc = fallback_migrate_page(mapping, newpage, page);
569 642
570 if (!rc) 643 if (!rc)
571 remove_migration_ptes(page, newpage); 644 remove_migration_ptes(page, newpage);
572 645
573 unlock_page(newpage); 646 unlock_page(newpage);
574 647
575 unlock_page: 648 unlock_page:
576 if (rc) 649 if (rc)
577 remove_migration_ptes(page, page); 650 remove_migration_ptes(page, page);
578 651
579 unlock_page(page); 652 unlock_page(page);
580 653
581 next: 654 next:
582 if (rc) { 655 if (rc) {
583 if (newpage) 656 if (newpage)
584 newpage->mapping = NULL; 657 newpage->mapping = NULL;
585 658
586 if (rc == -EAGAIN) 659 if (rc == -EAGAIN)
587 retry++; 660 retry++;
588 else { 661 else {
589 /* Permanent failure */ 662 /* Permanent failure */
590 list_move(&page->lru, failed); 663 list_move(&page->lru, failed);
591 nr_failed++; 664 nr_failed++;
592 } 665 }
593 } else { 666 } else {
594 if (newpage) { 667 if (newpage) {
595 /* Successful migration. Return page to LRU */ 668 /* Successful migration. Return page to LRU */
596 move_to_lru(newpage); 669 move_to_lru(newpage);
597 } 670 }
598 list_move(&page->lru, moved); 671 list_move(&page->lru, moved);
599 } 672 }
600 } 673 }
601 if (retry && pass++ < 10) 674 if (retry && pass++ < 10)
602 goto redo; 675 goto redo;
603 676
604 if (!swapwrite) 677 if (!swapwrite)
605 current->flags &= ~PF_SWAPWRITE; 678 current->flags &= ~PF_SWAPWRITE;
606 679
607 return nr_failed + retry; 680 return nr_failed + retry;
608 } 681 }
609 682
610 /* 683 /*
611 * Migrate the list 'pagelist' of pages to a certain destination. 684 * Migrate the list 'pagelist' of pages to a certain destination.
612 * 685 *
613 * Specify destination with either non-NULL vma or dest_node >= 0 686 * Specify destination with either non-NULL vma or dest_node >= 0
614 * Return the number of pages not migrated or error code 687 * Return the number of pages not migrated or error code
615 */ 688 */
616 int migrate_pages_to(struct list_head *pagelist, 689 int migrate_pages_to(struct list_head *pagelist,
617 struct vm_area_struct *vma, int dest) 690 struct vm_area_struct *vma, int dest)
618 { 691 {
619 LIST_HEAD(newlist); 692 LIST_HEAD(newlist);
620 LIST_HEAD(moved); 693 LIST_HEAD(moved);
621 LIST_HEAD(failed); 694 LIST_HEAD(failed);
622 int err = 0; 695 int err = 0;
623 unsigned long offset = 0; 696 unsigned long offset = 0;
624 int nr_pages; 697 int nr_pages;
625 struct page *page; 698 struct page *page;
626 struct list_head *p; 699 struct list_head *p;
627 700
628 redo: 701 redo:
629 nr_pages = 0; 702 nr_pages = 0;
630 list_for_each(p, pagelist) { 703 list_for_each(p, pagelist) {
631 if (vma) { 704 if (vma) {
632 /* 705 /*
633 * The address passed to alloc_page_vma is used to 706 * The address passed to alloc_page_vma is used to
634 * generate the proper interleave behavior. We fake 707 * generate the proper interleave behavior. We fake
635 * the address here by an increasing offset in order 708 * the address here by an increasing offset in order
636 * to get the proper distribution of pages. 709 * to get the proper distribution of pages.
637 * 710 *
638 * No decision has been made as to which page 711 * No decision has been made as to which page
639 * a certain old page is moved to so we cannot 712 * a certain old page is moved to so we cannot
640 * specify the correct address. 713 * specify the correct address.
641 */ 714 */
642 page = alloc_page_vma(GFP_HIGHUSER, vma, 715 page = alloc_page_vma(GFP_HIGHUSER, vma,
643 offset + vma->vm_start); 716 offset + vma->vm_start);
644 offset += PAGE_SIZE; 717 offset += PAGE_SIZE;
645 } 718 }
646 else 719 else
647 page = alloc_pages_node(dest, GFP_HIGHUSER, 0); 720 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
648 721
649 if (!page) { 722 if (!page) {
650 err = -ENOMEM; 723 err = -ENOMEM;
651 goto out; 724 goto out;
652 } 725 }
653 list_add_tail(&page->lru, &newlist); 726 list_add_tail(&page->lru, &newlist);
654 nr_pages++; 727 nr_pages++;
655 if (nr_pages > MIGRATE_CHUNK_SIZE) 728 if (nr_pages > MIGRATE_CHUNK_SIZE)
656 break; 729 break;
657 } 730 }
658 err = migrate_pages(pagelist, &newlist, &moved, &failed); 731 err = migrate_pages(pagelist, &newlist, &moved, &failed);
659 732
660 putback_lru_pages(&moved); /* Call release pages instead ?? */ 733 putback_lru_pages(&moved); /* Call release pages instead ?? */
661 734
662 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) 735 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
663 goto redo; 736 goto redo;
664 out: 737 out:
665 /* Return leftover allocated pages */ 738 /* Return leftover allocated pages */
666 while (!list_empty(&newlist)) { 739 while (!list_empty(&newlist)) {
667 page = list_entry(newlist.next, struct page, lru); 740 page = list_entry(newlist.next, struct page, lru);
668 list_del(&page->lru); 741 list_del(&page->lru);
669 __free_page(page); 742 __free_page(page);
670 } 743 }
671 list_splice(&failed, pagelist); 744 list_splice(&failed, pagelist);
672 if (err < 0) 745 if (err < 0)
673 return err; 746 return err;
674 747
675 /* Calculate number of leftover pages */ 748 /* Calculate number of leftover pages */
mm/rmap.c
1 /* 1 /*
2 * mm/rmap.c - physical to virtual reverse mappings 2 * mm/rmap.c - physical to virtual reverse mappings
3 * 3 *
4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br> 4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5 * Released under the General Public License (GPL). 5 * Released under the General Public License (GPL).
6 * 6 *
7 * Simple, low overhead reverse mapping scheme. 7 * Simple, low overhead reverse mapping scheme.
8 * Please try to keep this thing as modular as possible. 8 * Please try to keep this thing as modular as possible.
9 * 9 *
10 * Provides methods for unmapping each kind of mapped page: 10 * Provides methods for unmapping each kind of mapped page:
11 * the anon methods track anonymous pages, and 11 * the anon methods track anonymous pages, and
12 * the file methods track pages belonging to an inode. 12 * the file methods track pages belonging to an inode.
13 * 13 *
14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001 14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
18 */ 18 */
19 19
20 /* 20 /*
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem 24 * inode->i_alloc_sem
25 * 25 *
26 * When a page fault occurs in writing from user to file, down_read 26 * When a page fault occurs in writing from user to file, down_read
27 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within 27 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
28 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never 28 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
29 * taken together; in truncation, i_mutex is taken outermost. 29 * taken together; in truncation, i_mutex is taken outermost.
30 * 30 *
31 * mm->mmap_sem 31 * mm->mmap_sem
32 * page->flags PG_locked (lock_page) 32 * page->flags PG_locked (lock_page)
33 * mapping->i_mmap_lock 33 * mapping->i_mmap_lock
34 * anon_vma->lock 34 * anon_vma->lock
35 * mm->page_table_lock or pte_lock 35 * mm->page_table_lock or pte_lock
36 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 36 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
37 * swap_lock (in swap_duplicate, swap_info_get) 37 * swap_lock (in swap_duplicate, swap_info_get)
38 * mmlist_lock (in mmput, drain_mmlist and others) 38 * mmlist_lock (in mmput, drain_mmlist and others)
39 * mapping->private_lock (in __set_page_dirty_buffers) 39 * mapping->private_lock (in __set_page_dirty_buffers)
40 * inode_lock (in set_page_dirty's __mark_inode_dirty) 40 * inode_lock (in set_page_dirty's __mark_inode_dirty)
41 * sb_lock (within inode_lock in fs/fs-writeback.c) 41 * sb_lock (within inode_lock in fs/fs-writeback.c)
42 * mapping->tree_lock (widely used, in set_page_dirty, 42 * mapping->tree_lock (widely used, in set_page_dirty,
43 * in arch-dependent flush_dcache_mmap_lock, 43 * in arch-dependent flush_dcache_mmap_lock,
44 * within inode_lock in __sync_single_inode) 44 * within inode_lock in __sync_single_inode)
45 */ 45 */
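For orientation, here is a minimal sketch of honoring this nesting when touching a file page's ptes, assuming the caller already holds the page lock. It mirrors page_referenced_file() and page_check_address() further down and is not part of the patch.

/*
 * Illustration only (not in the patch): take mapping->i_mmap_lock and
 * then the pte lock, in the order documented above, and release them
 * in reverse order.
 */
static void lock_ordering_sketch(struct page *page, struct mm_struct *mm,
                                 pmd_t *pmd, unsigned long address)
{
        struct address_space *mapping = page->mapping;
        spinlock_t *ptl;
        pte_t *pte;

        BUG_ON(!PageLocked(page));              /* page->flags PG_locked */
        spin_lock(&mapping->i_mmap_lock);       /* mapping->i_mmap_lock */
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);      /* pte lock */
        /* ... inspect or modify the pte here ... */
        pte_unmap_unlock(pte, ptl);
        spin_unlock(&mapping->i_mmap_lock);
}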
46 46
47 #include <linux/mm.h> 47 #include <linux/mm.h>
48 #include <linux/pagemap.h> 48 #include <linux/pagemap.h>
49 #include <linux/swap.h> 49 #include <linux/swap.h>
50 #include <linux/swapops.h> 50 #include <linux/swapops.h>
51 #include <linux/slab.h> 51 #include <linux/slab.h>
52 #include <linux/init.h> 52 #include <linux/init.h>
53 #include <linux/rmap.h> 53 #include <linux/rmap.h>
54 #include <linux/rcupdate.h> 54 #include <linux/rcupdate.h>
55 #include <linux/module.h> 55 #include <linux/module.h>
56 56
57 #include <asm/tlbflush.h> 57 #include <asm/tlbflush.h>
58 58
59 struct kmem_cache *anon_vma_cachep; 59 struct kmem_cache *anon_vma_cachep;
60 60
61 static inline void validate_anon_vma(struct vm_area_struct *find_vma) 61 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
62 { 62 {
63 #ifdef CONFIG_DEBUG_VM 63 #ifdef CONFIG_DEBUG_VM
64 struct anon_vma *anon_vma = find_vma->anon_vma; 64 struct anon_vma *anon_vma = find_vma->anon_vma;
65 struct vm_area_struct *vma; 65 struct vm_area_struct *vma;
66 unsigned int mapcount = 0; 66 unsigned int mapcount = 0;
67 int found = 0; 67 int found = 0;
68 68
69 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 69 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
70 mapcount++; 70 mapcount++;
71 BUG_ON(mapcount > 100000); 71 BUG_ON(mapcount > 100000);
72 if (vma == find_vma) 72 if (vma == find_vma)
73 found = 1; 73 found = 1;
74 } 74 }
75 BUG_ON(!found); 75 BUG_ON(!found);
76 #endif 76 #endif
77 } 77 }
78 78
79 /* This must be called under the mmap_sem. */ 79 /* This must be called under the mmap_sem. */
80 int anon_vma_prepare(struct vm_area_struct *vma) 80 int anon_vma_prepare(struct vm_area_struct *vma)
81 { 81 {
82 struct anon_vma *anon_vma = vma->anon_vma; 82 struct anon_vma *anon_vma = vma->anon_vma;
83 83
84 might_sleep(); 84 might_sleep();
85 if (unlikely(!anon_vma)) { 85 if (unlikely(!anon_vma)) {
86 struct mm_struct *mm = vma->vm_mm; 86 struct mm_struct *mm = vma->vm_mm;
87 struct anon_vma *allocated, *locked; 87 struct anon_vma *allocated, *locked;
88 88
89 anon_vma = find_mergeable_anon_vma(vma); 89 anon_vma = find_mergeable_anon_vma(vma);
90 if (anon_vma) { 90 if (anon_vma) {
91 allocated = NULL; 91 allocated = NULL;
92 locked = anon_vma; 92 locked = anon_vma;
93 spin_lock(&locked->lock); 93 spin_lock(&locked->lock);
94 } else { 94 } else {
95 anon_vma = anon_vma_alloc(); 95 anon_vma = anon_vma_alloc();
96 if (unlikely(!anon_vma)) 96 if (unlikely(!anon_vma))
97 return -ENOMEM; 97 return -ENOMEM;
98 allocated = anon_vma; 98 allocated = anon_vma;
99 locked = NULL; 99 locked = NULL;
100 } 100 }
101 101
102 /* page_table_lock to protect against threads */ 102 /* page_table_lock to protect against threads */
103 spin_lock(&mm->page_table_lock); 103 spin_lock(&mm->page_table_lock);
104 if (likely(!vma->anon_vma)) { 104 if (likely(!vma->anon_vma)) {
105 vma->anon_vma = anon_vma; 105 vma->anon_vma = anon_vma;
106 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 106 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
107 allocated = NULL; 107 allocated = NULL;
108 } 108 }
109 spin_unlock(&mm->page_table_lock); 109 spin_unlock(&mm->page_table_lock);
110 110
111 if (locked) 111 if (locked)
112 spin_unlock(&locked->lock); 112 spin_unlock(&locked->lock);
113 if (unlikely(allocated)) 113 if (unlikely(allocated))
114 anon_vma_free(allocated); 114 anon_vma_free(allocated);
115 } 115 }
116 return 0; 116 return 0;
117 } 117 }
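A caller-side sketch of why this must run before any anon rmap is added, loosely modeled on do_anonymous_page(); error handling and pte locking are elided, the function name is made up, and this is not part of the patch.

static int anon_fault_sketch(struct mm_struct *mm, struct vm_area_struct *vma,
                             unsigned long address, pte_t *page_table)
{
        struct page *page;

        if (unlikely(anon_vma_prepare(vma)))    /* may allocate, may sleep */
                return VM_FAULT_OOM;

        page = alloc_zeroed_user_highpage(vma, address);
        if (!page)
                return VM_FAULT_OOM;

        inc_mm_counter(mm, anon_rss);
        page_add_new_anon_rmap(page, vma, address);     /* defined further down */
        set_pte_at(mm, address, page_table,
                   pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
        lru_cache_add_active(page);
        return VM_FAULT_MINOR;
}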
118 118
119 void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) 119 void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
120 { 120 {
121 BUG_ON(vma->anon_vma != next->anon_vma); 121 BUG_ON(vma->anon_vma != next->anon_vma);
122 list_del(&next->anon_vma_node); 122 list_del(&next->anon_vma_node);
123 } 123 }
124 124
125 void __anon_vma_link(struct vm_area_struct *vma) 125 void __anon_vma_link(struct vm_area_struct *vma)
126 { 126 {
127 struct anon_vma *anon_vma = vma->anon_vma; 127 struct anon_vma *anon_vma = vma->anon_vma;
128 128
129 if (anon_vma) { 129 if (anon_vma) {
130 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 130 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
131 validate_anon_vma(vma); 131 validate_anon_vma(vma);
132 } 132 }
133 } 133 }
134 134
135 void anon_vma_link(struct vm_area_struct *vma) 135 void anon_vma_link(struct vm_area_struct *vma)
136 { 136 {
137 struct anon_vma *anon_vma = vma->anon_vma; 137 struct anon_vma *anon_vma = vma->anon_vma;
138 138
139 if (anon_vma) { 139 if (anon_vma) {
140 spin_lock(&anon_vma->lock); 140 spin_lock(&anon_vma->lock);
141 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 141 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
142 validate_anon_vma(vma); 142 validate_anon_vma(vma);
143 spin_unlock(&anon_vma->lock); 143 spin_unlock(&anon_vma->lock);
144 } 144 }
145 } 145 }
146 146
147 void anon_vma_unlink(struct vm_area_struct *vma) 147 void anon_vma_unlink(struct vm_area_struct *vma)
148 { 148 {
149 struct anon_vma *anon_vma = vma->anon_vma; 149 struct anon_vma *anon_vma = vma->anon_vma;
150 int empty; 150 int empty;
151 151
152 if (!anon_vma) 152 if (!anon_vma)
153 return; 153 return;
154 154
155 spin_lock(&anon_vma->lock); 155 spin_lock(&anon_vma->lock);
156 validate_anon_vma(vma); 156 validate_anon_vma(vma);
157 list_del(&vma->anon_vma_node); 157 list_del(&vma->anon_vma_node);
158 158
159 /* We must garbage collect the anon_vma if it's empty */ 159 /* We must garbage collect the anon_vma if it's empty */
160 empty = list_empty(&anon_vma->head); 160 empty = list_empty(&anon_vma->head);
161 spin_unlock(&anon_vma->lock); 161 spin_unlock(&anon_vma->lock);
162 162
163 if (empty) 163 if (empty)
164 anon_vma_free(anon_vma); 164 anon_vma_free(anon_vma);
165 } 165 }
166 166
167 static void anon_vma_ctor(void *data, struct kmem_cache *cachep, 167 static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
168 unsigned long flags) 168 unsigned long flags)
169 { 169 {
170 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 170 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
171 SLAB_CTOR_CONSTRUCTOR) { 171 SLAB_CTOR_CONSTRUCTOR) {
172 struct anon_vma *anon_vma = data; 172 struct anon_vma *anon_vma = data;
173 173
174 spin_lock_init(&anon_vma->lock); 174 spin_lock_init(&anon_vma->lock);
175 INIT_LIST_HEAD(&anon_vma->head); 175 INIT_LIST_HEAD(&anon_vma->head);
176 } 176 }
177 } 177 }
178 178
179 void __init anon_vma_init(void) 179 void __init anon_vma_init(void)
180 { 180 {
181 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 181 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
182 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); 182 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
183 } 183 }
184 184
185 /* 185 /*
186 * Getting a lock on a stable anon_vma from a page off the LRU is 186 * Getting a lock on a stable anon_vma from a page off the LRU is
187 * tricky: page_lock_anon_vma relies on RCU to guard against the races. 187 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
188 */ 188 */
189 static struct anon_vma *page_lock_anon_vma(struct page *page) 189 static struct anon_vma *page_lock_anon_vma(struct page *page)
190 { 190 {
191 struct anon_vma *anon_vma = NULL; 191 struct anon_vma *anon_vma = NULL;
192 unsigned long anon_mapping; 192 unsigned long anon_mapping;
193 193
194 rcu_read_lock(); 194 rcu_read_lock();
195 anon_mapping = (unsigned long) page->mapping; 195 anon_mapping = (unsigned long) page->mapping;
196 if (!(anon_mapping & PAGE_MAPPING_ANON)) 196 if (!(anon_mapping & PAGE_MAPPING_ANON))
197 goto out; 197 goto out;
198 if (!page_mapped(page)) 198 if (!page_mapped(page))
199 goto out; 199 goto out;
200 200
201 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 201 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
202 spin_lock(&anon_vma->lock); 202 spin_lock(&anon_vma->lock);
203 out: 203 out:
204 rcu_read_unlock(); 204 rcu_read_unlock();
205 return anon_vma; 205 return anon_vma;
206 } 206 }
207 207
208 /* 208 /*
209 * At what user virtual address is page expected in vma? 209 * At what user virtual address is page expected in vma?
210 */ 210 */
211 static inline unsigned long 211 static inline unsigned long
212 vma_address(struct page *page, struct vm_area_struct *vma) 212 vma_address(struct page *page, struct vm_area_struct *vma)
213 { 213 {
214 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 214 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
215 unsigned long address; 215 unsigned long address;
216 216
217 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 217 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
218 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 218 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
219 /* page should be within any vma from prio_tree_next */ 219 /* page should be within any vma from prio_tree_next */
220 BUG_ON(!PageAnon(page)); 220 BUG_ON(!PageAnon(page));
221 return -EFAULT; 221 return -EFAULT;
222 } 222 }
223 return address; 223 return address;
224 } 224 }
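A quick worked example of the arithmetic above, with hypothetical values and 4 KiB pages (PAGE_SHIFT == PAGE_CACHE_SHIFT == 12, so the first shift is a no-op):

/*
 *   vma->vm_start = 0x40000000, vma->vm_pgoff = 0x100, page->index = 0x180
 *   address       = 0x40000000 + ((0x180 - 0x100) << 12) = 0x40080000
 *
 * which is returned as long as it falls inside [vm_start, vm_end);
 * otherwise -EFAULT comes back, and the BUG_ON only tolerates the
 * out-of-range case for anonymous pages.
 */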
225 225
226 /* 226 /*
227 * At what user virtual address is page expected in vma? checking that the 227 * At what user virtual address is page expected in vma? checking that the
228 * page matches the vma: currently only used on anon pages, by unuse_vma; 228 * page matches the vma: currently only used on anon pages, by unuse_vma;
229 */ 229 */
230 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 230 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
231 { 231 {
232 if (PageAnon(page)) { 232 if (PageAnon(page)) {
233 if ((void *)vma->anon_vma != 233 if ((void *)vma->anon_vma !=
234 (void *)page->mapping - PAGE_MAPPING_ANON) 234 (void *)page->mapping - PAGE_MAPPING_ANON)
235 return -EFAULT; 235 return -EFAULT;
236 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 236 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
237 if (!vma->vm_file || 237 if (!vma->vm_file ||
238 vma->vm_file->f_mapping != page->mapping) 238 vma->vm_file->f_mapping != page->mapping)
239 return -EFAULT; 239 return -EFAULT;
240 } else 240 } else
241 return -EFAULT; 241 return -EFAULT;
242 return vma_address(page, vma); 242 return vma_address(page, vma);
243 } 243 }
244 244
245 /* 245 /*
246 * Check that @page is mapped at @address into @mm. 246 * Check that @page is mapped at @address into @mm.
247 * 247 *
248 * On success returns with pte mapped and locked. 248 * On success returns with pte mapped and locked.
249 */ 249 */
250 pte_t *page_check_address(struct page *page, struct mm_struct *mm, 250 pte_t *page_check_address(struct page *page, struct mm_struct *mm,
251 unsigned long address, spinlock_t **ptlp) 251 unsigned long address, spinlock_t **ptlp)
252 { 252 {
253 pgd_t *pgd; 253 pgd_t *pgd;
254 pud_t *pud; 254 pud_t *pud;
255 pmd_t *pmd; 255 pmd_t *pmd;
256 pte_t *pte; 256 pte_t *pte;
257 spinlock_t *ptl; 257 spinlock_t *ptl;
258 258
259 pgd = pgd_offset(mm, address); 259 pgd = pgd_offset(mm, address);
260 if (!pgd_present(*pgd)) 260 if (!pgd_present(*pgd))
261 return NULL; 261 return NULL;
262 262
263 pud = pud_offset(pgd, address); 263 pud = pud_offset(pgd, address);
264 if (!pud_present(*pud)) 264 if (!pud_present(*pud))
265 return NULL; 265 return NULL;
266 266
267 pmd = pmd_offset(pud, address); 267 pmd = pmd_offset(pud, address);
268 if (!pmd_present(*pmd)) 268 if (!pmd_present(*pmd))
269 return NULL; 269 return NULL;
270 270
271 pte = pte_offset_map(pmd, address); 271 pte = pte_offset_map(pmd, address);
272 /* Make a quick check before getting the lock */ 272 /* Make a quick check before getting the lock */
273 if (!pte_present(*pte)) { 273 if (!pte_present(*pte)) {
274 pte_unmap(pte); 274 pte_unmap(pte);
275 return NULL; 275 return NULL;
276 } 276 }
277 277
278 ptl = pte_lockptr(mm, pmd); 278 ptl = pte_lockptr(mm, pmd);
279 spin_lock(ptl); 279 spin_lock(ptl);
280 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { 280 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
281 *ptlp = ptl; 281 *ptlp = ptl;
282 return pte; 282 return pte;
283 } 283 }
284 pte_unmap_unlock(pte, ptl); 284 pte_unmap_unlock(pte, ptl);
285 return NULL; 285 return NULL;
286 } 286 }
287 287
288 /* 288 /*
289 * Subfunctions of page_referenced: page_referenced_one called 289 * Subfunctions of page_referenced: page_referenced_one called
290 * repeatedly from either page_referenced_anon or page_referenced_file. 290 * repeatedly from either page_referenced_anon or page_referenced_file.
291 */ 291 */
292 static int page_referenced_one(struct page *page, 292 static int page_referenced_one(struct page *page,
293 struct vm_area_struct *vma, unsigned int *mapcount) 293 struct vm_area_struct *vma, unsigned int *mapcount)
294 { 294 {
295 struct mm_struct *mm = vma->vm_mm; 295 struct mm_struct *mm = vma->vm_mm;
296 unsigned long address; 296 unsigned long address;
297 pte_t *pte; 297 pte_t *pte;
298 spinlock_t *ptl; 298 spinlock_t *ptl;
299 int referenced = 0; 299 int referenced = 0;
300 300
301 address = vma_address(page, vma); 301 address = vma_address(page, vma);
302 if (address == -EFAULT) 302 if (address == -EFAULT)
303 goto out; 303 goto out;
304 304
305 pte = page_check_address(page, mm, address, &ptl); 305 pte = page_check_address(page, mm, address, &ptl);
306 if (!pte) 306 if (!pte)
307 goto out; 307 goto out;
308 308
309 if (ptep_clear_flush_young(vma, address, pte)) 309 if (ptep_clear_flush_young(vma, address, pte))
310 referenced++; 310 referenced++;
311 311
312 /* Pretend the page is referenced if the task has the 312 /* Pretend the page is referenced if the task has the
313 swap token and is in the middle of a page fault. */ 313 swap token and is in the middle of a page fault. */
314 if (mm != current->mm && has_swap_token(mm) && 314 if (mm != current->mm && has_swap_token(mm) &&
315 rwsem_is_locked(&mm->mmap_sem)) 315 rwsem_is_locked(&mm->mmap_sem))
316 referenced++; 316 referenced++;
317 317
318 (*mapcount)--; 318 (*mapcount)--;
319 pte_unmap_unlock(pte, ptl); 319 pte_unmap_unlock(pte, ptl);
320 out: 320 out:
321 return referenced; 321 return referenced;
322 } 322 }
323 323
324 static int page_referenced_anon(struct page *page) 324 static int page_referenced_anon(struct page *page)
325 { 325 {
326 unsigned int mapcount; 326 unsigned int mapcount;
327 struct anon_vma *anon_vma; 327 struct anon_vma *anon_vma;
328 struct vm_area_struct *vma; 328 struct vm_area_struct *vma;
329 int referenced = 0; 329 int referenced = 0;
330 330
331 anon_vma = page_lock_anon_vma(page); 331 anon_vma = page_lock_anon_vma(page);
332 if (!anon_vma) 332 if (!anon_vma)
333 return referenced; 333 return referenced;
334 334
335 mapcount = page_mapcount(page); 335 mapcount = page_mapcount(page);
336 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 336 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
337 referenced += page_referenced_one(page, vma, &mapcount); 337 referenced += page_referenced_one(page, vma, &mapcount);
338 if (!mapcount) 338 if (!mapcount)
339 break; 339 break;
340 } 340 }
341 spin_unlock(&anon_vma->lock); 341 spin_unlock(&anon_vma->lock);
342 return referenced; 342 return referenced;
343 } 343 }
344 344
345 /** 345 /**
346 * page_referenced_file - referenced check for object-based rmap 346 * page_referenced_file - referenced check for object-based rmap
347 * @page: the page we're checking references on. 347 * @page: the page we're checking references on.
348 * 348 *
349 * For an object-based mapped page, find all the places it is mapped and 349 * For an object-based mapped page, find all the places it is mapped and
350 * check/clear the referenced flag. This is done by following the page->mapping 350 * check/clear the referenced flag. This is done by following the page->mapping
351 * pointer, then walking the chain of vmas it holds. It returns the number 351 * pointer, then walking the chain of vmas it holds. It returns the number
352 * of references it found. 352 * of references it found.
353 * 353 *
354 * This function is only called from page_referenced for object-based pages. 354 * This function is only called from page_referenced for object-based pages.
355 */ 355 */
356 static int page_referenced_file(struct page *page) 356 static int page_referenced_file(struct page *page)
357 { 357 {
358 unsigned int mapcount; 358 unsigned int mapcount;
359 struct address_space *mapping = page->mapping; 359 struct address_space *mapping = page->mapping;
360 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 360 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
361 struct vm_area_struct *vma; 361 struct vm_area_struct *vma;
362 struct prio_tree_iter iter; 362 struct prio_tree_iter iter;
363 int referenced = 0; 363 int referenced = 0;
364 364
365 /* 365 /*
366 * The caller's checks on page->mapping and !PageAnon have made 366 * The caller's checks on page->mapping and !PageAnon have made
367 * sure that this is a file page: the check for page->mapping 367 * sure that this is a file page: the check for page->mapping
368 * excludes the case just before it gets set on an anon page. 368 * excludes the case just before it gets set on an anon page.
369 */ 369 */
370 BUG_ON(PageAnon(page)); 370 BUG_ON(PageAnon(page));
371 371
372 /* 372 /*
373 * The page lock not only makes sure that page->mapping cannot 373 * The page lock not only makes sure that page->mapping cannot
374 * suddenly be NULLified by truncation, it makes sure that the 374 * suddenly be NULLified by truncation, it makes sure that the
375 * structure at mapping cannot be freed and reused yet, 375 * structure at mapping cannot be freed and reused yet,
376 * so we can safely take mapping->i_mmap_lock. 376 * so we can safely take mapping->i_mmap_lock.
377 */ 377 */
378 BUG_ON(!PageLocked(page)); 378 BUG_ON(!PageLocked(page));
379 379
380 spin_lock(&mapping->i_mmap_lock); 380 spin_lock(&mapping->i_mmap_lock);
381 381
382 /* 382 /*
383 * i_mmap_lock does not stabilize mapcount at all, but mapcount 383 * i_mmap_lock does not stabilize mapcount at all, but mapcount
384 * is more likely to be accurate if we note it after spinning. 384 * is more likely to be accurate if we note it after spinning.
385 */ 385 */
386 mapcount = page_mapcount(page); 386 mapcount = page_mapcount(page);
387 387
388 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 388 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
389 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) 389 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
390 == (VM_LOCKED|VM_MAYSHARE)) { 390 == (VM_LOCKED|VM_MAYSHARE)) {
391 referenced++; 391 referenced++;
392 break; 392 break;
393 } 393 }
394 referenced += page_referenced_one(page, vma, &mapcount); 394 referenced += page_referenced_one(page, vma, &mapcount);
395 if (!mapcount) 395 if (!mapcount)
396 break; 396 break;
397 } 397 }
398 398
399 spin_unlock(&mapping->i_mmap_lock); 399 spin_unlock(&mapping->i_mmap_lock);
400 return referenced; 400 return referenced;
401 } 401 }
402 402
403 /** 403 /**
404 * page_referenced - test if the page was referenced 404 * page_referenced - test if the page was referenced
405 * @page: the page to test 405 * @page: the page to test
406 * @is_locked: caller holds lock on the page 406 * @is_locked: caller holds lock on the page
407 * 407 *
408 * Quick test_and_clear_referenced for all mappings to a page, 408 * Quick test_and_clear_referenced for all mappings to a page,
409 * returns the number of ptes which referenced the page. 409 * returns the number of ptes which referenced the page.
410 */ 410 */
411 int page_referenced(struct page *page, int is_locked) 411 int page_referenced(struct page *page, int is_locked)
412 { 412 {
413 int referenced = 0; 413 int referenced = 0;
414 414
415 if (page_test_and_clear_young(page)) 415 if (page_test_and_clear_young(page))
416 referenced++; 416 referenced++;
417 417
418 if (TestClearPageReferenced(page)) 418 if (TestClearPageReferenced(page))
419 referenced++; 419 referenced++;
420 420
421 if (page_mapped(page) && page->mapping) { 421 if (page_mapped(page) && page->mapping) {
422 if (PageAnon(page)) 422 if (PageAnon(page))
423 referenced += page_referenced_anon(page); 423 referenced += page_referenced_anon(page);
424 else if (is_locked) 424 else if (is_locked)
425 referenced += page_referenced_file(page); 425 referenced += page_referenced_file(page);
426 else if (TestSetPageLocked(page)) 426 else if (TestSetPageLocked(page))
427 referenced++; 427 referenced++;
428 else { 428 else {
429 if (page->mapping) 429 if (page->mapping)
430 referenced += page_referenced_file(page); 430 referenced += page_referenced_file(page);
431 unlock_page(page); 431 unlock_page(page);
432 } 432 }
433 } 433 }
434 return referenced; 434 return referenced;
435 } 435 }
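For context, a simplified sketch of how shrink_list()-style reclaim code consumes this count; the helper name is made up and this is not part of the patch.

/* Any recent reference on a still-mapped page argues for keeping it active. */
static int should_activate_sketch(struct page *page, int page_locked)
{
        int referenced = page_referenced(page, page_locked);

        return referenced && page_mapped(page);
}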
436 436
437 /** 437 /**
438 * page_set_anon_rmap - setup new anonymous rmap 438 * page_set_anon_rmap - setup new anonymous rmap
439 * @page: the page to add the mapping to 439 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added 440 * @vma: the vm area in which the mapping is added
441 * @address: the user virtual address mapped 441 * @address: the user virtual address mapped
442 */ 442 */
443 static void __page_set_anon_rmap(struct page *page, 443 static void __page_set_anon_rmap(struct page *page,
444 struct vm_area_struct *vma, unsigned long address) 444 struct vm_area_struct *vma, unsigned long address)
445 { 445 {
446 struct anon_vma *anon_vma = vma->anon_vma; 446 struct anon_vma *anon_vma = vma->anon_vma;
447 447
448 BUG_ON(!anon_vma); 448 BUG_ON(!anon_vma);
449 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 449 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
450 page->mapping = (struct address_space *) anon_vma; 450 page->mapping = (struct address_space *) anon_vma;
451 451
452 page->index = linear_page_index(vma, address); 452 page->index = linear_page_index(vma, address);
453 453
454 /* 454 /*
455 * nr_mapped state can be updated without turning off 455 * nr_mapped state can be updated without turning off
456 * interrupts because it is not modified via interrupt. 456 * interrupts because it is not modified via interrupt.
457 */ 457 */
458 __inc_page_state(nr_mapped); 458 __inc_page_state(nr_mapped);
459 } 459 }
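The encoding set up here is what page_lock_anon_vma() above undoes; a sketch of the decode step in isolation (illustration only, not part of the patch):

/* The low bit of page->mapping flags an anon_vma rather than an address_space. */
static struct anon_vma *page_anon_vma_sketch(struct page *page)
{
        unsigned long mapping = (unsigned long)page->mapping;

        if (!(mapping & PAGE_MAPPING_ANON))
                return NULL;                    /* file page: a real mapping */
        return (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
}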
460 460
461 /** 461 /**
462 * page_add_anon_rmap - add pte mapping to an anonymous page 462 * page_add_anon_rmap - add pte mapping to an anonymous page
463 * @page: the page to add the mapping to 463 * @page: the page to add the mapping to
464 * @vma: the vm area in which the mapping is added 464 * @vma: the vm area in which the mapping is added
465 * @address: the user virtual address mapped 465 * @address: the user virtual address mapped
466 * 466 *
467 * The caller needs to hold the pte lock. 467 * The caller needs to hold the pte lock.
468 */ 468 */
469 void page_add_anon_rmap(struct page *page, 469 void page_add_anon_rmap(struct page *page,
470 struct vm_area_struct *vma, unsigned long address) 470 struct vm_area_struct *vma, unsigned long address)
471 { 471 {
472 if (atomic_inc_and_test(&page->_mapcount)) 472 if (atomic_inc_and_test(&page->_mapcount))
473 __page_set_anon_rmap(page, vma, address); 473 __page_set_anon_rmap(page, vma, address);
474 /* else checking page index and mapping is racy */ 474 /* else checking page index and mapping is racy */
475 } 475 }
476 476
477 /* 477 /*
478 * page_add_new_anon_rmap - add pte mapping to a new anonymous page 478 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
479 * @page: the page to add the mapping to 479 * @page: the page to add the mapping to
480 * @vma: the vm area in which the mapping is added 480 * @vma: the vm area in which the mapping is added
481 * @address: the user virtual address mapped 481 * @address: the user virtual address mapped
482 * 482 *
483 * Same as page_add_anon_rmap but must only be called on *new* pages. 483 * Same as page_add_anon_rmap but must only be called on *new* pages.
484 * This means the inc-and-test can be bypassed. 484 * This means the inc-and-test can be bypassed.
485 */ 485 */
486 void page_add_new_anon_rmap(struct page *page, 486 void page_add_new_anon_rmap(struct page *page,
487 struct vm_area_struct *vma, unsigned long address) 487 struct vm_area_struct *vma, unsigned long address)
488 { 488 {
489 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ 489 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
490 __page_set_anon_rmap(page, vma, address); 490 __page_set_anon_rmap(page, vma, address);
491 } 491 }
492 492
493 /** 493 /**
494 * page_add_file_rmap - add pte mapping to a file page 494 * page_add_file_rmap - add pte mapping to a file page
495 * @page: the page to add the mapping to 495 * @page: the page to add the mapping to
496 * 496 *
497 * The caller needs to hold the pte lock. 497 * The caller needs to hold the pte lock.
498 */ 498 */
499 void page_add_file_rmap(struct page *page) 499 void page_add_file_rmap(struct page *page)
500 { 500 {
501 if (atomic_inc_and_test(&page->_mapcount)) 501 if (atomic_inc_and_test(&page->_mapcount))
502 __inc_page_state(nr_mapped); 502 __inc_page_state(nr_mapped);
503 } 503 }
504 504
505 /** 505 /**
506 * page_remove_rmap - take down pte mapping from a page 506 * page_remove_rmap - take down pte mapping from a page
507 * @page: page to remove mapping from 507 * @page: page to remove mapping from
508 * 508 *
509 * The caller needs to hold the pte lock. 509 * The caller needs to hold the pte lock.
510 */ 510 */
511 void page_remove_rmap(struct page *page) 511 void page_remove_rmap(struct page *page)
512 { 512 {
513 if (atomic_add_negative(-1, &page->_mapcount)) { 513 if (atomic_add_negative(-1, &page->_mapcount)) {
514 #ifdef CONFIG_DEBUG_VM 514 #ifdef CONFIG_DEBUG_VM
515 if (unlikely(page_mapcount(page) < 0)) { 515 if (unlikely(page_mapcount(page) < 0)) {
516 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); 516 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
517 printk (KERN_EMERG " page->flags = %lx\n", page->flags); 517 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
518 printk (KERN_EMERG " page->count = %x\n", page_count(page)); 518 printk (KERN_EMERG " page->count = %x\n", page_count(page));
519 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 519 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
520 } 520 }
521 #endif 521 #endif
522 BUG_ON(page_mapcount(page) < 0); 522 BUG_ON(page_mapcount(page) < 0);
523 /* 523 /*
524 * It would be tidy to reset the PageAnon mapping here, 524 * It would be tidy to reset the PageAnon mapping here,
525 * but that might overwrite a racing page_add_anon_rmap 525 * but that might overwrite a racing page_add_anon_rmap
526 * which increments mapcount after us but sets mapping 526 * which increments mapcount after us but sets mapping
527 * before us: so leave the reset to free_hot_cold_page, 527 * before us: so leave the reset to free_hot_cold_page,
528 * and remember that it's only reliable while mapped. 528 * and remember that it's only reliable while mapped.
529 * Leaving it set also helps swapoff to reinstate ptes 529 * Leaving it set also helps swapoff to reinstate ptes
530 * faster for those pages still in swapcache. 530 * faster for those pages still in swapcache.
531 */ 531 */
532 if (page_test_and_clear_dirty(page)) 532 if (page_test_and_clear_dirty(page))
533 set_page_dirty(page); 533 set_page_dirty(page);
534 __dec_page_state(nr_mapped); 534 __dec_page_state(nr_mapped);
535 } 535 }
536 } 536 }
537 537
538 /* 538 /*
539 * Subfunctions of try_to_unmap: try_to_unmap_one called 539 * Subfunctions of try_to_unmap: try_to_unmap_one called
540 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 540 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
541 */ 541 */
542 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 542 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
543 int migration) 543 int migration)
544 { 544 {
545 struct mm_struct *mm = vma->vm_mm; 545 struct mm_struct *mm = vma->vm_mm;
546 unsigned long address; 546 unsigned long address;
547 pte_t *pte; 547 pte_t *pte;
548 pte_t pteval; 548 pte_t pteval;
549 spinlock_t *ptl; 549 spinlock_t *ptl;
550 int ret = SWAP_AGAIN; 550 int ret = SWAP_AGAIN;
551 551
552 address = vma_address(page, vma); 552 address = vma_address(page, vma);
553 if (address == -EFAULT) 553 if (address == -EFAULT)
554 goto out; 554 goto out;
555 555
556 pte = page_check_address(page, mm, address, &ptl); 556 pte = page_check_address(page, mm, address, &ptl);
557 if (!pte) 557 if (!pte)
558 goto out; 558 goto out;
559 559
560 /* 560 /*
561 * If the page is mlock()d, we cannot swap it out. 561 * If the page is mlock()d, we cannot swap it out.
562 * If it's recently referenced (perhaps page_referenced 562 * If it's recently referenced (perhaps page_referenced
563 * skipped over this mm) then we should reactivate it. 563 * skipped over this mm) then we should reactivate it.
564 */ 564 */
565 if ((vma->vm_flags & VM_LOCKED) || 565 if ((vma->vm_flags & VM_LOCKED) ||
566 (ptep_clear_flush_young(vma, address, pte) 566 (ptep_clear_flush_young(vma, address, pte)
567 && !migration)) { 567 && !migration)) {
568 ret = SWAP_FAIL; 568 ret = SWAP_FAIL;
569 goto out_unmap; 569 goto out_unmap;
570 } 570 }
571 571
572 /* Nuke the page table entry. */ 572 /* Nuke the page table entry. */
573 flush_cache_page(vma, address, page_to_pfn(page)); 573 flush_cache_page(vma, address, page_to_pfn(page));
574 pteval = ptep_clear_flush(vma, address, pte); 574 pteval = ptep_clear_flush(vma, address, pte);
575 575
576 /* Move the dirty bit to the physical page now the pte is gone. */ 576 /* Move the dirty bit to the physical page now the pte is gone. */
577 if (pte_dirty(pteval)) 577 if (pte_dirty(pteval))
578 set_page_dirty(page); 578 set_page_dirty(page);
579 579
580 /* Update high watermark before we lower rss */ 580 /* Update high watermark before we lower rss */
581 update_hiwater_rss(mm); 581 update_hiwater_rss(mm);
582 582
583 if (PageAnon(page)) { 583 if (PageAnon(page)) {
584 swp_entry_t entry = { .val = page_private(page) }; 584 swp_entry_t entry = { .val = page_private(page) };
585 585
586 if (PageSwapCache(page)) { 586 if (PageSwapCache(page)) {
587 /* 587 /*
588 * Store the swap location in the pte. 588 * Store the swap location in the pte.
589 * See handle_pte_fault() ... 589 * See handle_pte_fault() ...
590 */ 590 */
591 swap_duplicate(entry); 591 swap_duplicate(entry);
592 if (list_empty(&mm->mmlist)) { 592 if (list_empty(&mm->mmlist)) {
593 spin_lock(&mmlist_lock); 593 spin_lock(&mmlist_lock);
594 if (list_empty(&mm->mmlist)) 594 if (list_empty(&mm->mmlist))
595 list_add(&mm->mmlist, &init_mm.mmlist); 595 list_add(&mm->mmlist, &init_mm.mmlist);
596 spin_unlock(&mmlist_lock); 596 spin_unlock(&mmlist_lock);
597 } 597 }
598 dec_mm_counter(mm, anon_rss); 598 dec_mm_counter(mm, anon_rss);
599 #ifdef CONFIG_MIGRATION
599 } else { 600 } else {
600 /* 601 /*
601 * Store the pfn of the page in a special migration 602 * Store the pfn of the page in a special migration
602 * pte. do_swap_page() will wait until the migration 603 * pte. do_swap_page() will wait until the migration
603 * pte is removed and then restart fault handling. 604 * pte is removed and then restart fault handling.
604 */ 605 */
605 BUG_ON(!migration); 606 BUG_ON(!migration);
606 entry = make_migration_entry(page, pte_write(pteval)); 607 entry = make_migration_entry(page, pte_write(pteval));
608 #endif
607 } 609 }
608 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 610 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
609 BUG_ON(pte_file(*pte)); 611 BUG_ON(pte_file(*pte));
610 } else 612 } else
613 #ifdef CONFIG_MIGRATION
614 if (migration) {
615 /* Establish migration entry for a file page */
616 swp_entry_t entry;
617 entry = make_migration_entry(page, pte_write(pteval));
618 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
619 } else
620 #endif
611 dec_mm_counter(mm, file_rss); 621 dec_mm_counter(mm, file_rss);
622
612 623
613 page_remove_rmap(page); 624 page_remove_rmap(page);
614 page_cache_release(page); 625 page_cache_release(page);
615 626
616 out_unmap: 627 out_unmap:
617 pte_unmap_unlock(pte, ptl); 628 pte_unmap_unlock(pte, ptl);
618 out: 629 out:
619 return ret; 630 return ret;
620 } 631 }
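The hunks above are the point of this patch: with migration set, a present pte, anonymous or file backed, is replaced by a migration swap entry instead of being dropped. For orientation, a simplified sketch of the reverse step performed by remove_migration_pte() in mm/migrate.c; rmap and rss accounting, TLB flushing details and locking are elided, and this is not a verbatim copy.

static void restore_migration_pte_sketch(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep,
                struct page *old, struct page *new)
{
        swp_entry_t entry;
        pte_t pte;

        if (pte_present(*ptep) || pte_none(*ptep) || pte_file(*ptep))
                return;                         /* not a swap-style entry */

        entry = pte_to_swp_entry(*ptep);
        if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
                return;                         /* some other swap entry */

        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
        if (is_write_migration_entry(entry))    /* was SWP_MIGRATION_WRITE */
                pte = pte_mkwrite(pte);
        set_pte_at(vma->vm_mm, addr, ptep, pte);
        /* the real code also re-adds the rmap entry and flushes the TLB */
}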
621 632
622 /* 633 /*
623 * objrmap doesn't work for nonlinear VMAs because the assumption that 634 * objrmap doesn't work for nonlinear VMAs because the assumption that
624 * offset-into-file correlates with offset-into-virtual-addresses does not hold. 635 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
625 * Consequently, given a particular page and its ->index, we cannot locate the 636 * Consequently, given a particular page and its ->index, we cannot locate the
626 * ptes which are mapping that page without an exhaustive linear search. 637 * ptes which are mapping that page without an exhaustive linear search.
627 * 638 *
628 * So what this code does is a mini "virtual scan" of each nonlinear VMA which 639 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
629 * maps the file to which the target page belongs. The ->vm_private_data field 640 * maps the file to which the target page belongs. The ->vm_private_data field
630 * holds the current cursor into that scan. Successive searches will circulate 641 * holds the current cursor into that scan. Successive searches will circulate
631 * around the vma's virtual address space. 642 * around the vma's virtual address space.
632 * 643 *
633 * So as more replacement pressure is applied to the pages in a nonlinear VMA, 644 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
634 * more scanning pressure is placed against them as well. Eventually pages 645 * more scanning pressure is placed against them as well. Eventually pages
635 * will become fully unmapped and are eligible for eviction. 646 * will become fully unmapped and are eligible for eviction.
636 * 647 *
637 * For very sparsely populated VMAs this is a little inefficient - chances are 648 * For very sparsely populated VMAs this is a little inefficient - chances are
638 * there won't be many ptes located within the scan cluster. In this case 649 * there won't be many ptes located within the scan cluster. In this case
639 * maybe we could scan further - to the end of the pte page, perhaps. 650 * maybe we could scan further - to the end of the pte page, perhaps.
640 */ 651 */
641 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 652 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
642 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 653 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
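A worked example of the two defines, with hypothetical numbers and 4 KiB pages:

/*
 * With 4 KiB pages, CLUSTER_SIZE = min(32 * 4096, PMD_SIZE) = 128 KiB on
 * typical configurations, so CLUSTER_MASK = ~0x1ffff.  A cursor of
 * 0x23000 in a vma starting at 0x40000000 therefore scans the cluster
 * beginning at (0x40000000 + 0x23000) & ~0x1ffff = 0x40020000.
 */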
643 654
644 static void try_to_unmap_cluster(unsigned long cursor, 655 static void try_to_unmap_cluster(unsigned long cursor,
645 unsigned int *mapcount, struct vm_area_struct *vma) 656 unsigned int *mapcount, struct vm_area_struct *vma)
646 { 657 {
647 struct mm_struct *mm = vma->vm_mm; 658 struct mm_struct *mm = vma->vm_mm;
648 pgd_t *pgd; 659 pgd_t *pgd;
649 pud_t *pud; 660 pud_t *pud;
650 pmd_t *pmd; 661 pmd_t *pmd;
651 pte_t *pte; 662 pte_t *pte;
652 pte_t pteval; 663 pte_t pteval;
653 spinlock_t *ptl; 664 spinlock_t *ptl;
654 struct page *page; 665 struct page *page;
655 unsigned long address; 666 unsigned long address;
656 unsigned long end; 667 unsigned long end;
657 668
658 address = (vma->vm_start + cursor) & CLUSTER_MASK; 669 address = (vma->vm_start + cursor) & CLUSTER_MASK;
659 end = address + CLUSTER_SIZE; 670 end = address + CLUSTER_SIZE;
660 if (address < vma->vm_start) 671 if (address < vma->vm_start)
661 address = vma->vm_start; 672 address = vma->vm_start;
662 if (end > vma->vm_end) 673 if (end > vma->vm_end)
663 end = vma->vm_end; 674 end = vma->vm_end;
664 675
665 pgd = pgd_offset(mm, address); 676 pgd = pgd_offset(mm, address);
666 if (!pgd_present(*pgd)) 677 if (!pgd_present(*pgd))
667 return; 678 return;
668 679
669 pud = pud_offset(pgd, address); 680 pud = pud_offset(pgd, address);
670 if (!pud_present(*pud)) 681 if (!pud_present(*pud))
671 return; 682 return;
672 683
673 pmd = pmd_offset(pud, address); 684 pmd = pmd_offset(pud, address);
674 if (!pmd_present(*pmd)) 685 if (!pmd_present(*pmd))
675 return; 686 return;
676 687
677 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 688 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
678 689
679 /* Update high watermark before we lower rss */ 690 /* Update high watermark before we lower rss */
680 update_hiwater_rss(mm); 691 update_hiwater_rss(mm);
681 692
682 for (; address < end; pte++, address += PAGE_SIZE) { 693 for (; address < end; pte++, address += PAGE_SIZE) {
683 if (!pte_present(*pte)) 694 if (!pte_present(*pte))
684 continue; 695 continue;
685 page = vm_normal_page(vma, address, *pte); 696 page = vm_normal_page(vma, address, *pte);
686 BUG_ON(!page || PageAnon(page)); 697 BUG_ON(!page || PageAnon(page));
687 698
688 if (ptep_clear_flush_young(vma, address, pte)) 699 if (ptep_clear_flush_young(vma, address, pte))
689 continue; 700 continue;
690 701
691 /* Nuke the page table entry. */ 702 /* Nuke the page table entry. */
692 flush_cache_page(vma, address, pte_pfn(*pte)); 703 flush_cache_page(vma, address, pte_pfn(*pte));
693 pteval = ptep_clear_flush(vma, address, pte); 704 pteval = ptep_clear_flush(vma, address, pte);
694 705
695 /* If nonlinear, store the file page offset in the pte. */ 706 /* If nonlinear, store the file page offset in the pte. */
696 if (page->index != linear_page_index(vma, address)) 707 if (page->index != linear_page_index(vma, address))
697 set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); 708 set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
698 709
699 /* Move the dirty bit to the physical page now the pte is gone. */ 710 /* Move the dirty bit to the physical page now the pte is gone. */
700 if (pte_dirty(pteval)) 711 if (pte_dirty(pteval))
701 set_page_dirty(page); 712 set_page_dirty(page);
702 713
703 page_remove_rmap(page); 714 page_remove_rmap(page);
704 page_cache_release(page); 715 page_cache_release(page);
705 dec_mm_counter(mm, file_rss); 716 dec_mm_counter(mm, file_rss);
706 (*mapcount)--; 717 (*mapcount)--;
707 } 718 }
708 pte_unmap_unlock(pte - 1, ptl); 719 pte_unmap_unlock(pte - 1, ptl);
709 } 720 }
710 721
711 static int try_to_unmap_anon(struct page *page, int migration) 722 static int try_to_unmap_anon(struct page *page, int migration)
712 { 723 {
713 struct anon_vma *anon_vma; 724 struct anon_vma *anon_vma;
714 struct vm_area_struct *vma; 725 struct vm_area_struct *vma;
715 int ret = SWAP_AGAIN; 726 int ret = SWAP_AGAIN;
716 727
717 anon_vma = page_lock_anon_vma(page); 728 anon_vma = page_lock_anon_vma(page);
718 if (!anon_vma) 729 if (!anon_vma)
719 return ret; 730 return ret;
720 731
721 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 732 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
722 ret = try_to_unmap_one(page, vma, migration); 733 ret = try_to_unmap_one(page, vma, migration);
723 if (ret == SWAP_FAIL || !page_mapped(page)) 734 if (ret == SWAP_FAIL || !page_mapped(page))
724 break; 735 break;
725 } 736 }
726 spin_unlock(&anon_vma->lock); 737 spin_unlock(&anon_vma->lock);
727 return ret; 738 return ret;
728 } 739 }
729 740
730 /** 741 /**
731 * try_to_unmap_file - unmap file page using the object-based rmap method 742 * try_to_unmap_file - unmap file page using the object-based rmap method
732 * @page: the page to unmap 743 * @page: the page to unmap
733 * 744 *
734 * Find all the mappings of a page using the mapping pointer and the vma chains 745 * Find all the mappings of a page using the mapping pointer and the vma chains
735 * contained in the address_space struct it points to. 746 * contained in the address_space struct it points to.
736 * 747 *
737 * This function is only called from try_to_unmap for object-based pages. 748 * This function is only called from try_to_unmap for object-based pages.
738 */ 749 */
739 static int try_to_unmap_file(struct page *page, int migration) 750 static int try_to_unmap_file(struct page *page, int migration)
740 { 751 {
741 struct address_space *mapping = page->mapping; 752 struct address_space *mapping = page->mapping;
742 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 753 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
743 struct vm_area_struct *vma; 754 struct vm_area_struct *vma;
744 struct prio_tree_iter iter; 755 struct prio_tree_iter iter;
745 int ret = SWAP_AGAIN; 756 int ret = SWAP_AGAIN;
746 unsigned long cursor; 757 unsigned long cursor;
747 unsigned long max_nl_cursor = 0; 758 unsigned long max_nl_cursor = 0;
748 unsigned long max_nl_size = 0; 759 unsigned long max_nl_size = 0;
749 unsigned int mapcount; 760 unsigned int mapcount;
750 761
751 spin_lock(&mapping->i_mmap_lock); 762 spin_lock(&mapping->i_mmap_lock);
752 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 763 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
753 ret = try_to_unmap_one(page, vma, migration); 764 ret = try_to_unmap_one(page, vma, migration);
754 if (ret == SWAP_FAIL || !page_mapped(page)) 765 if (ret == SWAP_FAIL || !page_mapped(page))
755 goto out; 766 goto out;
756 } 767 }
757 768
758 if (list_empty(&mapping->i_mmap_nonlinear)) 769 if (list_empty(&mapping->i_mmap_nonlinear))
759 goto out; 770 goto out;
760 771
761 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 772 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
762 shared.vm_set.list) { 773 shared.vm_set.list) {
763 if (vma->vm_flags & VM_LOCKED) 774 if (vma->vm_flags & VM_LOCKED)
764 continue; 775 continue;
765 cursor = (unsigned long) vma->vm_private_data; 776 cursor = (unsigned long) vma->vm_private_data;
766 if (cursor > max_nl_cursor) 777 if (cursor > max_nl_cursor)
767 max_nl_cursor = cursor; 778 max_nl_cursor = cursor;
768 cursor = vma->vm_end - vma->vm_start; 779 cursor = vma->vm_end - vma->vm_start;
769 if (cursor > max_nl_size) 780 if (cursor > max_nl_size)
770 max_nl_size = cursor; 781 max_nl_size = cursor;
771 } 782 }
772 783
773 if (max_nl_size == 0) { /* any nonlinears locked or reserved */ 784 if (max_nl_size == 0) { /* any nonlinears locked or reserved */
774 ret = SWAP_FAIL; 785 ret = SWAP_FAIL;
775 goto out; 786 goto out;
776 } 787 }
777 788
778 /* 789 /*
779 * We don't try to search for this page in the nonlinear vmas, 790 * We don't try to search for this page in the nonlinear vmas,
780 * and page_referenced wouldn't have found it anyway. Instead 791 * and page_referenced wouldn't have found it anyway. Instead
781 * just walk the nonlinear vmas trying to age and unmap some. 792 * just walk the nonlinear vmas trying to age and unmap some.
782 * The mapcount of the page we came in with is irrelevant, 793 * The mapcount of the page we came in with is irrelevant,
783 * but even so use it as a guide to how hard we should try? 794 * but even so use it as a guide to how hard we should try?
784 */ 795 */
785 mapcount = page_mapcount(page); 796 mapcount = page_mapcount(page);
786 if (!mapcount) 797 if (!mapcount)
787 goto out; 798 goto out;
788 cond_resched_lock(&mapping->i_mmap_lock); 799 cond_resched_lock(&mapping->i_mmap_lock);
789 800
790 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 801 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
791 if (max_nl_cursor == 0) 802 if (max_nl_cursor == 0)
792 max_nl_cursor = CLUSTER_SIZE; 803 max_nl_cursor = CLUSTER_SIZE;
793 804
794 do { 805 do {
795 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 806 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
796 shared.vm_set.list) { 807 shared.vm_set.list) {
797 if (vma->vm_flags & VM_LOCKED) 808 if (vma->vm_flags & VM_LOCKED)
798 continue; 809 continue;
799 cursor = (unsigned long) vma->vm_private_data; 810 cursor = (unsigned long) vma->vm_private_data;
800 while ( cursor < max_nl_cursor && 811 while ( cursor < max_nl_cursor &&
801 cursor < vma->vm_end - vma->vm_start) { 812 cursor < vma->vm_end - vma->vm_start) {
802 try_to_unmap_cluster(cursor, &mapcount, vma); 813 try_to_unmap_cluster(cursor, &mapcount, vma);
803 cursor += CLUSTER_SIZE; 814 cursor += CLUSTER_SIZE;
804 vma->vm_private_data = (void *) cursor; 815 vma->vm_private_data = (void *) cursor;
805 if ((int)mapcount <= 0) 816 if ((int)mapcount <= 0)
806 goto out; 817 goto out;
807 } 818 }
808 vma->vm_private_data = (void *) max_nl_cursor; 819 vma->vm_private_data = (void *) max_nl_cursor;
809 } 820 }
810 cond_resched_lock(&mapping->i_mmap_lock); 821 cond_resched_lock(&mapping->i_mmap_lock);
811 max_nl_cursor += CLUSTER_SIZE; 822 max_nl_cursor += CLUSTER_SIZE;
812 } while (max_nl_cursor <= max_nl_size); 823 } while (max_nl_cursor <= max_nl_size);
813 824
814 /* 825 /*
815 * Don't loop forever (perhaps all the remaining pages are 826 * Don't loop forever (perhaps all the remaining pages are
816 * in locked vmas). Reset cursor on all unreserved nonlinear 827 * in locked vmas). Reset cursor on all unreserved nonlinear
817 * vmas, now forgetting on which ones it had fallen behind. 828 * vmas, now forgetting on which ones it had fallen behind.
818 */ 829 */
819 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 830 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
820 vma->vm_private_data = NULL; 831 vma->vm_private_data = NULL;
821 out: 832 out:
822 spin_unlock(&mapping->i_mmap_lock); 833 spin_unlock(&mapping->i_mmap_lock);
823 return ret; 834 return ret;
824 } 835 }
825 836
826 /** 837 /**
827 * try_to_unmap - try to remove all page table mappings to a page 838 * try_to_unmap - try to remove all page table mappings to a page
828 * @page: the page to get unmapped 839 * @page: the page to get unmapped
829 * 840 *
830 * Tries to remove all the page table entries which are mapping this 841 * Tries to remove all the page table entries which are mapping this
831 * page, used in the pageout path. Caller must hold the page lock. 842 * page, used in the pageout path. Caller must hold the page lock.
832 * Return values are: 843 * Return values are:
833 * 844 *
834 * SWAP_SUCCESS - we succeeded in removing all mappings 845 * SWAP_SUCCESS - we succeeded in removing all mappings
835 * SWAP_AGAIN - we missed a mapping, try again later 846 * SWAP_AGAIN - we missed a mapping, try again later
836 * SWAP_FAIL - the page is unswappable 847 * SWAP_FAIL - the page is unswappable
837 */ 848 */
838 int try_to_unmap(struct page *page, int migration) 849 int try_to_unmap(struct page *page, int migration)
839 { 850 {
840 int ret; 851 int ret;
841 852
842 BUG_ON(!PageLocked(page)); 853 BUG_ON(!PageLocked(page));
843 854
844 if (PageAnon(page)) 855 if (PageAnon(page))
845 ret = try_to_unmap_anon(page, migration); 856 ret = try_to_unmap_anon(page, migration);
846 else 857 else
847 ret = try_to_unmap_file(page, migration); 858 ret = try_to_unmap_file(page, migration);
848 859
849 if (!page_mapped(page)) 860 if (!page_mapped(page))
850 ret = SWAP_SUCCESS; 861 ret = SWAP_SUCCESS;
851 return ret; 862 return ret;
852 } 863 }
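A caller-side sketch of the migration case, which is what the migration argument is for; simplified, with a made-up function name, and not part of the patch.

/* The migration path wants ptes converted to migration entries, not discarded. */
static int unmap_for_migration_sketch(struct page *page)
{
        if (try_to_unmap(page, 1) != SWAP_SUCCESS)
                return -EAGAIN;         /* some mapping remained; retry later */
        return 0;                       /* linear ptes now hold migration entries */
}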
853 864
854 865
mm/vmscan.c
1 /* 1 /*
2 * linux/mm/vmscan.c 2 * linux/mm/vmscan.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * 5 *
6 * Swap reorganised 29.12.95, Stephen Tweedie. 6 * Swap reorganised 29.12.95, Stephen Tweedie.
7 * kswapd added: 7.1.96 sct 7 * kswapd added: 7.1.96 sct
8 * Removed kswapd_ctl limits, and swap out as many pages as needed 8 * Removed kswapd_ctl limits, and swap out as many pages as needed
9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
11 * Multiqueue VM started 5.8.00, Rik van Riel. 11 * Multiqueue VM started 5.8.00, Rik van Riel.
12 */ 12 */
13 13
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/slab.h> 16 #include <linux/slab.h>
17 #include <linux/kernel_stat.h> 17 #include <linux/kernel_stat.h>
18 #include <linux/swap.h> 18 #include <linux/swap.h>
19 #include <linux/pagemap.h> 19 #include <linux/pagemap.h>
20 #include <linux/init.h> 20 #include <linux/init.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/file.h> 22 #include <linux/file.h>
23 #include <linux/writeback.h> 23 #include <linux/writeback.h>
24 #include <linux/blkdev.h> 24 #include <linux/blkdev.h>
25 #include <linux/buffer_head.h> /* for try_to_release_page(), 25 #include <linux/buffer_head.h> /* for try_to_release_page(),
26 buffer_heads_over_limit */ 26 buffer_heads_over_limit */
27 #include <linux/mm_inline.h> 27 #include <linux/mm_inline.h>
28 #include <linux/pagevec.h> 28 #include <linux/pagevec.h>
29 #include <linux/backing-dev.h> 29 #include <linux/backing-dev.h>
30 #include <linux/rmap.h> 30 #include <linux/rmap.h>
31 #include <linux/topology.h> 31 #include <linux/topology.h>
32 #include <linux/cpu.h> 32 #include <linux/cpu.h>
33 #include <linux/cpuset.h> 33 #include <linux/cpuset.h>
34 #include <linux/notifier.h> 34 #include <linux/notifier.h>
35 #include <linux/rwsem.h> 35 #include <linux/rwsem.h>
36 #include <linux/delay.h> 36 #include <linux/delay.h>
37 37
38 #include <asm/tlbflush.h> 38 #include <asm/tlbflush.h>
39 #include <asm/div64.h> 39 #include <asm/div64.h>
40 40
41 #include <linux/swapops.h> 41 #include <linux/swapops.h>
42 42
43 #include "internal.h" 43 #include "internal.h"
44 44
45 struct scan_control { 45 struct scan_control {
46 /* Incremented by the number of inactive pages that were scanned */ 46 /* Incremented by the number of inactive pages that were scanned */
47 unsigned long nr_scanned; 47 unsigned long nr_scanned;
48 48
49 unsigned long nr_mapped; /* From page_state */ 49 unsigned long nr_mapped; /* From page_state */
50 50
51 /* This context's GFP mask */ 51 /* This context's GFP mask */
52 gfp_t gfp_mask; 52 gfp_t gfp_mask;
53 53
54 int may_writepage; 54 int may_writepage;
55 55
56 /* Can pages be swapped as part of reclaim? */ 56 /* Can pages be swapped as part of reclaim? */
57 int may_swap; 57 int may_swap;
58 58
59 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 59 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
60 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 60 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
61 * In this context, it doesn't matter that we scan the 61 * In this context, it doesn't matter that we scan the
62 * whole list at once. */ 62 * whole list at once. */
63 int swap_cluster_max; 63 int swap_cluster_max;
64 64
65 int swappiness; 65 int swappiness;
66 }; 66 };
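A sketch of how a reclaim entry point fills this in, loosely modeled on try_to_free_pages(); the helper name is made up and this is not part of the patch.

static void init_scan_control_sketch(struct scan_control *sc, gfp_t gfp_mask)
{
        sc->nr_scanned = 0;
        sc->nr_mapped = read_page_state(nr_mapped);
        sc->gfp_mask = gfp_mask;
        sc->may_writepage = !laptop_mode;       /* avoid spinning up disks */
        sc->may_swap = 1;
        sc->swap_cluster_max = SWAP_CLUSTER_MAX;
        sc->swappiness = vm_swappiness;
}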
67 67
68 /* 68 /*
69 * The list of shrinker callbacks used to apply pressure to 69 * The list of shrinker callbacks used to apply pressure to
70 * ageable caches. 70 * ageable caches.
71 */ 71 */
72 struct shrinker { 72 struct shrinker {
73 shrinker_t shrinker; 73 shrinker_t shrinker;
74 struct list_head list; 74 struct list_head list;
75 int seeks; /* seeks to recreate an obj */ 75 int seeks; /* seeks to recreate an obj */
76 long nr; /* objs pending delete */ 76 long nr; /* objs pending delete */
77 }; 77 };
78 78
79 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 79 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
80 80
81 #ifdef ARCH_HAS_PREFETCH 81 #ifdef ARCH_HAS_PREFETCH
82 #define prefetch_prev_lru_page(_page, _base, _field) \ 82 #define prefetch_prev_lru_page(_page, _base, _field) \
83 do { \ 83 do { \
84 if ((_page)->lru.prev != _base) { \ 84 if ((_page)->lru.prev != _base) { \
85 struct page *prev; \ 85 struct page *prev; \
86 \ 86 \
87 prev = lru_to_page(&(_page->lru)); \ 87 prev = lru_to_page(&(_page->lru)); \
88 prefetch(&prev->_field); \ 88 prefetch(&prev->_field); \
89 } \ 89 } \
90 } while (0) 90 } while (0)
91 #else 91 #else
92 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) 92 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
93 #endif 93 #endif
94 94
95 #ifdef ARCH_HAS_PREFETCHW 95 #ifdef ARCH_HAS_PREFETCHW
96 #define prefetchw_prev_lru_page(_page, _base, _field) \ 96 #define prefetchw_prev_lru_page(_page, _base, _field) \
97 do { \ 97 do { \
98 if ((_page)->lru.prev != _base) { \ 98 if ((_page)->lru.prev != _base) { \
99 struct page *prev; \ 99 struct page *prev; \
100 \ 100 \
101 prev = lru_to_page(&(_page->lru)); \ 101 prev = lru_to_page(&(_page->lru)); \
102 prefetchw(&prev->_field); \ 102 prefetchw(&prev->_field); \
103 } \ 103 } \
104 } while (0) 104 } while (0)
105 #else 105 #else
106 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) 106 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
107 #endif 107 #endif
108 108
109 /* 109 /*
110 * From 0 .. 100. Higher means more swappy. 110 * From 0 .. 100. Higher means more swappy.
111 */ 111 */
112 int vm_swappiness = 60; 112 int vm_swappiness = 60;
113 static long total_memory; 113 static long total_memory;
114 114
115 static LIST_HEAD(shrinker_list); 115 static LIST_HEAD(shrinker_list);
116 static DECLARE_RWSEM(shrinker_rwsem); 116 static DECLARE_RWSEM(shrinker_rwsem);
117 117
118 /* 118 /*
119 * Add a shrinker callback to be called from the vm 119 * Add a shrinker callback to be called from the vm
120 */ 120 */
121 struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) 121 struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
122 { 122 {
123 struct shrinker *shrinker; 123 struct shrinker *shrinker;
124 124
125 shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); 125 shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
126 if (shrinker) { 126 if (shrinker) {
127 shrinker->shrinker = theshrinker; 127 shrinker->shrinker = theshrinker;
128 shrinker->seeks = seeks; 128 shrinker->seeks = seeks;
129 shrinker->nr = 0; 129 shrinker->nr = 0;
130 down_write(&shrinker_rwsem); 130 down_write(&shrinker_rwsem);
131 list_add_tail(&shrinker->list, &shrinker_list); 131 list_add_tail(&shrinker->list, &shrinker_list);
132 up_write(&shrinker_rwsem); 132 up_write(&shrinker_rwsem);
133 } 133 }
134 return shrinker; 134 return shrinker;
135 } 135 }
136 EXPORT_SYMBOL(set_shrinker); 136 EXPORT_SYMBOL(set_shrinker);
137 137
138 /* 138 /*
139 * Remove one 139 * Remove one
140 */ 140 */
141 void remove_shrinker(struct shrinker *shrinker) 141 void remove_shrinker(struct shrinker *shrinker)
142 { 142 {
143 down_write(&shrinker_rwsem); 143 down_write(&shrinker_rwsem);
144 list_del(&shrinker->list); 144 list_del(&shrinker->list);
145 up_write(&shrinker_rwsem); 145 up_write(&shrinker_rwsem);
146 kfree(shrinker); 146 kfree(shrinker);
147 } 147 }
148 EXPORT_SYMBOL(remove_shrinker); 148 EXPORT_SYMBOL(remove_shrinker);
149 149
150 #define SHRINK_BATCH 128 150 #define SHRINK_BATCH 128
151 /* 151 /*
152 * Call the shrink functions to age shrinkable caches 152 * Call the shrink functions to age shrinkable caches
153 * 153 *
154 * Here we assume it costs one seek to replace a lru page and that it also 154 * Here we assume it costs one seek to replace a lru page and that it also
155 * takes a seek to recreate a cache object. With this in mind we age equal 155 * takes a seek to recreate a cache object. With this in mind we age equal
156 * percentages of the lru and ageable caches. This should balance the seeks 156 * percentages of the lru and ageable caches. This should balance the seeks
157 * generated by these structures. 157 * generated by these structures.
158 * 158 *
159 * If the vm encountered mapped pages on the LRU it increases the pressure on 159 * If the vm encountered mapped pages on the LRU it increases the pressure on
160 * slab to avoid swapping. 160 * slab to avoid swapping.
161 * 161 *
162 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 162 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
163 * 163 *
164 * `lru_pages' represents the number of on-LRU pages in all the zones which 164 * `lru_pages' represents the number of on-LRU pages in all the zones which
165 * are eligible for the caller's allocation attempt. It is used for balancing 165 * are eligible for the caller's allocation attempt. It is used for balancing
166 * slab reclaim versus page reclaim. 166 * slab reclaim versus page reclaim.
167 * 167 *
168 * Returns the number of slab objects which we shrunk. 168 * Returns the number of slab objects which we shrunk.
169 */ 169 */
170 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 170 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
171 unsigned long lru_pages) 171 unsigned long lru_pages)
172 { 172 {
173 struct shrinker *shrinker; 173 struct shrinker *shrinker;
174 unsigned long ret = 0; 174 unsigned long ret = 0;
175 175
176 if (scanned == 0) 176 if (scanned == 0)
177 scanned = SWAP_CLUSTER_MAX; 177 scanned = SWAP_CLUSTER_MAX;
178 178
179 if (!down_read_trylock(&shrinker_rwsem)) 179 if (!down_read_trylock(&shrinker_rwsem))
180 return 1; /* Assume we'll be able to shrink next time */ 180 return 1; /* Assume we'll be able to shrink next time */
181 181
182 list_for_each_entry(shrinker, &shrinker_list, list) { 182 list_for_each_entry(shrinker, &shrinker_list, list) {
183 unsigned long long delta; 183 unsigned long long delta;
184 unsigned long total_scan; 184 unsigned long total_scan;
185 unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); 185 unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
186 186
187 delta = (4 * scanned) / shrinker->seeks; 187 delta = (4 * scanned) / shrinker->seeks;
188 delta *= max_pass; 188 delta *= max_pass;
189 do_div(delta, lru_pages + 1); 189 do_div(delta, lru_pages + 1);
190 shrinker->nr += delta; 190 shrinker->nr += delta;
191 if (shrinker->nr < 0) { 191 if (shrinker->nr < 0) {
192 printk(KERN_ERR "%s: nr=%ld\n", 192 printk(KERN_ERR "%s: nr=%ld\n",
193 __FUNCTION__, shrinker->nr); 193 __FUNCTION__, shrinker->nr);
194 shrinker->nr = max_pass; 194 shrinker->nr = max_pass;
195 } 195 }
196 196
197 /* 197 /*
198 * Avoid risking looping forever due to too large nr value: 198 * Avoid risking looping forever due to too large nr value:
199 * never try to free more than twice the estimated number of 199 * never try to free more than twice the estimated number of
200 * freeable entries. 200 * freeable entries.
201 */ 201 */
202 if (shrinker->nr > max_pass * 2) 202 if (shrinker->nr > max_pass * 2)
203 shrinker->nr = max_pass * 2; 203 shrinker->nr = max_pass * 2;
204 204
205 total_scan = shrinker->nr; 205 total_scan = shrinker->nr;
206 shrinker->nr = 0; 206 shrinker->nr = 0;
207 207
208 while (total_scan >= SHRINK_BATCH) { 208 while (total_scan >= SHRINK_BATCH) {
209 long this_scan = SHRINK_BATCH; 209 long this_scan = SHRINK_BATCH;
210 int shrink_ret; 210 int shrink_ret;
211 int nr_before; 211 int nr_before;
212 212
213 nr_before = (*shrinker->shrinker)(0, gfp_mask); 213 nr_before = (*shrinker->shrinker)(0, gfp_mask);
214 shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); 214 shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
215 if (shrink_ret == -1) 215 if (shrink_ret == -1)
216 break; 216 break;
217 if (shrink_ret < nr_before) 217 if (shrink_ret < nr_before)
218 ret += nr_before - shrink_ret; 218 ret += nr_before - shrink_ret;
219 mod_page_state(slabs_scanned, this_scan); 219 mod_page_state(slabs_scanned, this_scan);
220 total_scan -= this_scan; 220 total_scan -= this_scan;
221 221
222 cond_resched(); 222 cond_resched();
223 } 223 }
224 224
225 shrinker->nr += total_scan; 225 shrinker->nr += total_scan;
226 } 226 }
227 up_read(&shrinker_rwsem); 227 up_read(&shrinker_rwsem);
228 return ret; 228 return ret;
229 } 229 }
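
A minimal userspace sketch of the slab-pressure arithmetic in shrink_slab() above, under stated assumptions: the shrinker parameters and LRU counts are made up, and do_div() is replaced by plain 64-bit division so the snippet builds outside the kernel.

#include <stdio.h>

/* Hypothetical shrinker: seeks = 2 and 50000 freeable objects, under a
 * scan of 1024 LRU pages out of 200000 eligible LRU pages. */
int main(void)
{
	unsigned long scanned = 1024, lru_pages = 200000, max_pass = 50000;
	int seeks = 2;
	unsigned long long delta;

	delta = (4ULL * scanned) / seeks;	/* base pressure from the LRU scan */
	delta *= max_pass;			/* scale by the size of this cache */
	delta /= lru_pages + 1;			/* normalize by eligible LRU pages */

	/* Prints 511, i.e. roughly four SHRINK_BATCH (128) sized passes. */
	printf("objects to scan: %llu\n", delta);
	return 0;
}
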
230 230
231 /* Called without lock on whether page is mapped, so answer is unstable */ 231 /* Called without lock on whether page is mapped, so answer is unstable */
232 static inline int page_mapping_inuse(struct page *page) 232 static inline int page_mapping_inuse(struct page *page)
233 { 233 {
234 struct address_space *mapping; 234 struct address_space *mapping;
235 235
236 /* Page is in somebody's page tables. */ 236 /* Page is in somebody's page tables. */
237 if (page_mapped(page)) 237 if (page_mapped(page))
238 return 1; 238 return 1;
239 239
240 /* Be more reluctant to reclaim swapcache than pagecache */ 240 /* Be more reluctant to reclaim swapcache than pagecache */
241 if (PageSwapCache(page)) 241 if (PageSwapCache(page))
242 return 1; 242 return 1;
243 243
244 mapping = page_mapping(page); 244 mapping = page_mapping(page);
245 if (!mapping) 245 if (!mapping)
246 return 0; 246 return 0;
247 247
248 /* File is mmap'd by somebody? */ 248 /* File is mmap'd by somebody? */
249 return mapping_mapped(mapping); 249 return mapping_mapped(mapping);
250 } 250 }
251 251
252 static inline int is_page_cache_freeable(struct page *page) 252 static inline int is_page_cache_freeable(struct page *page)
253 { 253 {
254 return page_count(page) - !!PagePrivate(page) == 2; 254 return page_count(page) - !!PagePrivate(page) == 2;
255 } 255 }
256 256
257 static int may_write_to_queue(struct backing_dev_info *bdi) 257 static int may_write_to_queue(struct backing_dev_info *bdi)
258 { 258 {
259 if (current->flags & PF_SWAPWRITE) 259 if (current->flags & PF_SWAPWRITE)
260 return 1; 260 return 1;
261 if (!bdi_write_congested(bdi)) 261 if (!bdi_write_congested(bdi))
262 return 1; 262 return 1;
263 if (bdi == current->backing_dev_info) 263 if (bdi == current->backing_dev_info)
264 return 1; 264 return 1;
265 return 0; 265 return 0;
266 } 266 }
267 267
268 /* 268 /*
269 * We detected a synchronous write error writing a page out. Probably 269 * We detected a synchronous write error writing a page out. Probably
270 * -ENOSPC. We need to propagate that into the address_space for a subsequent 270 * -ENOSPC. We need to propagate that into the address_space for a subsequent
271 * fsync(), msync() or close(). 271 * fsync(), msync() or close().
272 * 272 *
273 * The tricky part is that after writepage we cannot touch the mapping: nothing 273 * The tricky part is that after writepage we cannot touch the mapping: nothing
274 * prevents it from being freed up. But we have a ref on the page and once 274 * prevents it from being freed up. But we have a ref on the page and once
275 * that page is locked, the mapping is pinned. 275 * that page is locked, the mapping is pinned.
276 * 276 *
277 * We're allowed to run sleeping lock_page() here because we know the caller has 277 * We're allowed to run sleeping lock_page() here because we know the caller has
278 * __GFP_FS. 278 * __GFP_FS.
279 */ 279 */
280 static void handle_write_error(struct address_space *mapping, 280 static void handle_write_error(struct address_space *mapping,
281 struct page *page, int error) 281 struct page *page, int error)
282 { 282 {
283 lock_page(page); 283 lock_page(page);
284 if (page_mapping(page) == mapping) { 284 if (page_mapping(page) == mapping) {
285 if (error == -ENOSPC) 285 if (error == -ENOSPC)
286 set_bit(AS_ENOSPC, &mapping->flags); 286 set_bit(AS_ENOSPC, &mapping->flags);
287 else 287 else
288 set_bit(AS_EIO, &mapping->flags); 288 set_bit(AS_EIO, &mapping->flags);
289 } 289 }
290 unlock_page(page); 290 unlock_page(page);
291 } 291 }
292 292
293 /* possible outcomes of pageout() */
294 typedef enum {
295 /* failed to write page out, page is locked */
296 PAGE_KEEP,
297 /* move page to the active list, page is locked */
298 PAGE_ACTIVATE,
299 /* page has been sent to the disk successfully, page is unlocked */
300 PAGE_SUCCESS,
301 /* page is clean and locked */
302 PAGE_CLEAN,
303 } pageout_t;
304
293 /* 305 /*
294 * pageout is called by shrink_page_list() for each dirty page. 306 * pageout is called by shrink_page_list() for each dirty page.
295 * Calls ->writepage(). 307 * Calls ->writepage().
296 */ 308 */
297 pageout_t pageout(struct page *page, struct address_space *mapping) 309 static pageout_t pageout(struct page *page, struct address_space *mapping)
298 { 310 {
299 /* 311 /*
300 * If the page is dirty, only perform writeback if that write 312 * If the page is dirty, only perform writeback if that write
301 * will be non-blocking, to prevent this allocation from being 313 * will be non-blocking, to prevent this allocation from being
302 * stalled by pagecache activity. But note that there may be 314 * stalled by pagecache activity. But note that there may be
303 * stalls if we need to run get_block(). We could test 315 * stalls if we need to run get_block(). We could test
304 * PagePrivate for that. 316 * PagePrivate for that.
305 * 317 *
306 * If this process is currently in generic_file_write() against 318 * If this process is currently in generic_file_write() against
307 * this page's queue, we can perform writeback even if that 319 * this page's queue, we can perform writeback even if that
308 * will block. 320 * will block.
309 * 321 *
310 * If the page is swapcache, write it back even if that would 322 * If the page is swapcache, write it back even if that would
311 * block, for some throttling. This happens by accident, because 323 * block, for some throttling. This happens by accident, because
312 * swap_backing_dev_info is bust: it doesn't reflect the 324 * swap_backing_dev_info is bust: it doesn't reflect the
313 * congestion state of the swapdevs. Easy to fix, if needed. 325 * congestion state of the swapdevs. Easy to fix, if needed.
314 * See swapfile.c:page_queue_congested(). 326 * See swapfile.c:page_queue_congested().
315 */ 327 */
316 if (!is_page_cache_freeable(page)) 328 if (!is_page_cache_freeable(page))
317 return PAGE_KEEP; 329 return PAGE_KEEP;
318 if (!mapping) { 330 if (!mapping) {
319 /* 331 /*
320 * Some data journaling orphaned pages can have 332 * Some data journaling orphaned pages can have
321 * page->mapping == NULL while being dirty with clean buffers. 333 * page->mapping == NULL while being dirty with clean buffers.
322 */ 334 */
323 if (PagePrivate(page)) { 335 if (PagePrivate(page)) {
324 if (try_to_free_buffers(page)) { 336 if (try_to_free_buffers(page)) {
325 ClearPageDirty(page); 337 ClearPageDirty(page);
326 printk("%s: orphaned page\n", __FUNCTION__); 338 printk("%s: orphaned page\n", __FUNCTION__);
327 return PAGE_CLEAN; 339 return PAGE_CLEAN;
328 } 340 }
329 } 341 }
330 return PAGE_KEEP; 342 return PAGE_KEEP;
331 } 343 }
332 if (mapping->a_ops->writepage == NULL) 344 if (mapping->a_ops->writepage == NULL)
333 return PAGE_ACTIVATE; 345 return PAGE_ACTIVATE;
334 if (!may_write_to_queue(mapping->backing_dev_info)) 346 if (!may_write_to_queue(mapping->backing_dev_info))
335 return PAGE_KEEP; 347 return PAGE_KEEP;
336 348
337 if (clear_page_dirty_for_io(page)) { 349 if (clear_page_dirty_for_io(page)) {
338 int res; 350 int res;
339 struct writeback_control wbc = { 351 struct writeback_control wbc = {
340 .sync_mode = WB_SYNC_NONE, 352 .sync_mode = WB_SYNC_NONE,
341 .nr_to_write = SWAP_CLUSTER_MAX, 353 .nr_to_write = SWAP_CLUSTER_MAX,
342 .range_start = 0, 354 .range_start = 0,
343 .range_end = LLONG_MAX, 355 .range_end = LLONG_MAX,
344 .nonblocking = 1, 356 .nonblocking = 1,
345 .for_reclaim = 1, 357 .for_reclaim = 1,
346 }; 358 };
347 359
348 SetPageReclaim(page); 360 SetPageReclaim(page);
349 res = mapping->a_ops->writepage(page, &wbc); 361 res = mapping->a_ops->writepage(page, &wbc);
350 if (res < 0) 362 if (res < 0)
351 handle_write_error(mapping, page, res); 363 handle_write_error(mapping, page, res);
352 if (res == AOP_WRITEPAGE_ACTIVATE) { 364 if (res == AOP_WRITEPAGE_ACTIVATE) {
353 ClearPageReclaim(page); 365 ClearPageReclaim(page);
354 return PAGE_ACTIVATE; 366 return PAGE_ACTIVATE;
355 } 367 }
356 if (!PageWriteback(page)) { 368 if (!PageWriteback(page)) {
357 /* synchronous write or broken a_ops? */ 369 /* synchronous write or broken a_ops? */
358 ClearPageReclaim(page); 370 ClearPageReclaim(page);
359 } 371 }
360 372
361 return PAGE_SUCCESS; 373 return PAGE_SUCCESS;
362 } 374 }
363 375
364 return PAGE_CLEAN; 376 return PAGE_CLEAN;
365 } 377 }
366 378
367 int remove_mapping(struct address_space *mapping, struct page *page) 379 int remove_mapping(struct address_space *mapping, struct page *page)
368 { 380 {
369 if (!mapping) 381 if (!mapping)
370 return 0; /* truncate got there first */ 382 return 0; /* truncate got there first */
371 383
372 write_lock_irq(&mapping->tree_lock); 384 write_lock_irq(&mapping->tree_lock);
373 385
374 /* 386 /*
375 * The non-racy check for busy page. It is critical to check 387 * The non-racy check for busy page. It is critical to check
376 * PageDirty _after_ making sure that the page is freeable and 388 * PageDirty _after_ making sure that the page is freeable and
377 * not in use by anybody. (pagecache + us == 2) 389 * not in use by anybody. (pagecache + us == 2)
378 */ 390 */
379 if (unlikely(page_count(page) != 2)) 391 if (unlikely(page_count(page) != 2))
380 goto cannot_free; 392 goto cannot_free;
381 smp_rmb(); 393 smp_rmb();
382 if (unlikely(PageDirty(page))) 394 if (unlikely(PageDirty(page)))
383 goto cannot_free; 395 goto cannot_free;
384 396
385 if (PageSwapCache(page)) { 397 if (PageSwapCache(page)) {
386 swp_entry_t swap = { .val = page_private(page) }; 398 swp_entry_t swap = { .val = page_private(page) };
387 __delete_from_swap_cache(page); 399 __delete_from_swap_cache(page);
388 write_unlock_irq(&mapping->tree_lock); 400 write_unlock_irq(&mapping->tree_lock);
389 swap_free(swap); 401 swap_free(swap);
390 __put_page(page); /* The pagecache ref */ 402 __put_page(page); /* The pagecache ref */
391 return 1; 403 return 1;
392 } 404 }
393 405
394 __remove_from_page_cache(page); 406 __remove_from_page_cache(page);
395 write_unlock_irq(&mapping->tree_lock); 407 write_unlock_irq(&mapping->tree_lock);
396 __put_page(page); 408 __put_page(page);
397 return 1; 409 return 1;
398 410
399 cannot_free: 411 cannot_free:
400 write_unlock_irq(&mapping->tree_lock); 412 write_unlock_irq(&mapping->tree_lock);
401 return 0; 413 return 0;
402 } 414 }
403 415
404 /* 416 /*
405 * shrink_page_list() returns the number of reclaimed pages 417 * shrink_page_list() returns the number of reclaimed pages
406 */ 418 */
407 static unsigned long shrink_page_list(struct list_head *page_list, 419 static unsigned long shrink_page_list(struct list_head *page_list,
408 struct scan_control *sc) 420 struct scan_control *sc)
409 { 421 {
410 LIST_HEAD(ret_pages); 422 LIST_HEAD(ret_pages);
411 struct pagevec freed_pvec; 423 struct pagevec freed_pvec;
412 int pgactivate = 0; 424 int pgactivate = 0;
413 unsigned long nr_reclaimed = 0; 425 unsigned long nr_reclaimed = 0;
414 426
415 cond_resched(); 427 cond_resched();
416 428
417 pagevec_init(&freed_pvec, 1); 429 pagevec_init(&freed_pvec, 1);
418 while (!list_empty(page_list)) { 430 while (!list_empty(page_list)) {
419 struct address_space *mapping; 431 struct address_space *mapping;
420 struct page *page; 432 struct page *page;
421 int may_enter_fs; 433 int may_enter_fs;
422 int referenced; 434 int referenced;
423 435
424 cond_resched(); 436 cond_resched();
425 437
426 page = lru_to_page(page_list); 438 page = lru_to_page(page_list);
427 list_del(&page->lru); 439 list_del(&page->lru);
428 440
429 if (TestSetPageLocked(page)) 441 if (TestSetPageLocked(page))
430 goto keep; 442 goto keep;
431 443
432 BUG_ON(PageActive(page)); 444 BUG_ON(PageActive(page));
433 445
434 sc->nr_scanned++; 446 sc->nr_scanned++;
435 447
436 if (!sc->may_swap && page_mapped(page)) 448 if (!sc->may_swap && page_mapped(page))
437 goto keep_locked; 449 goto keep_locked;
438 450
439 /* Double the slab pressure for mapped and swapcache pages */ 451 /* Double the slab pressure for mapped and swapcache pages */
440 if (page_mapped(page) || PageSwapCache(page)) 452 if (page_mapped(page) || PageSwapCache(page))
441 sc->nr_scanned++; 453 sc->nr_scanned++;
442 454
443 if (PageWriteback(page)) 455 if (PageWriteback(page))
444 goto keep_locked; 456 goto keep_locked;
445 457
446 referenced = page_referenced(page, 1); 458 referenced = page_referenced(page, 1);
447 /* In active use or really unfreeable? Activate it. */ 459 /* In active use or really unfreeable? Activate it. */
448 if (referenced && page_mapping_inuse(page)) 460 if (referenced && page_mapping_inuse(page))
449 goto activate_locked; 461 goto activate_locked;
450 462
451 #ifdef CONFIG_SWAP 463 #ifdef CONFIG_SWAP
452 /* 464 /*
453 * Anonymous process memory has backing store? 465 * Anonymous process memory has backing store?
454 * Try to allocate it some swap space here. 466 * Try to allocate it some swap space here.
455 */ 467 */
456 if (PageAnon(page) && !PageSwapCache(page)) 468 if (PageAnon(page) && !PageSwapCache(page))
457 if (!add_to_swap(page, GFP_ATOMIC)) 469 if (!add_to_swap(page, GFP_ATOMIC))
458 goto activate_locked; 470 goto activate_locked;
459 #endif /* CONFIG_SWAP */ 471 #endif /* CONFIG_SWAP */
460 472
461 mapping = page_mapping(page); 473 mapping = page_mapping(page);
462 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 474 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
463 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 475 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
464 476
465 /* 477 /*
466 * The page is mapped into the page tables of one or more 478 * The page is mapped into the page tables of one or more
467 * processes. Try to unmap it here. 479 * processes. Try to unmap it here.
468 */ 480 */
469 if (page_mapped(page) && mapping) { 481 if (page_mapped(page) && mapping) {
470 switch (try_to_unmap(page, 0)) { 482 switch (try_to_unmap(page, 0)) {
471 case SWAP_FAIL: 483 case SWAP_FAIL:
472 goto activate_locked; 484 goto activate_locked;
473 case SWAP_AGAIN: 485 case SWAP_AGAIN:
474 goto keep_locked; 486 goto keep_locked;
475 case SWAP_SUCCESS: 487 case SWAP_SUCCESS:
476 ; /* try to free the page below */ 488 ; /* try to free the page below */
477 } 489 }
478 } 490 }
479 491
480 if (PageDirty(page)) { 492 if (PageDirty(page)) {
481 if (referenced) 493 if (referenced)
482 goto keep_locked; 494 goto keep_locked;
483 if (!may_enter_fs) 495 if (!may_enter_fs)
484 goto keep_locked; 496 goto keep_locked;
485 if (!sc->may_writepage) 497 if (!sc->may_writepage)
486 goto keep_locked; 498 goto keep_locked;
487 499
488 /* Page is dirty, try to write it out here */ 500 /* Page is dirty, try to write it out here */
489 switch(pageout(page, mapping)) { 501 switch(pageout(page, mapping)) {
490 case PAGE_KEEP: 502 case PAGE_KEEP:
491 goto keep_locked; 503 goto keep_locked;
492 case PAGE_ACTIVATE: 504 case PAGE_ACTIVATE:
493 goto activate_locked; 505 goto activate_locked;
494 case PAGE_SUCCESS: 506 case PAGE_SUCCESS:
495 if (PageWriteback(page) || PageDirty(page)) 507 if (PageWriteback(page) || PageDirty(page))
496 goto keep; 508 goto keep;
497 /* 509 /*
498 * A synchronous write - probably a ramdisk. Go 510 * A synchronous write - probably a ramdisk. Go
499 * ahead and try to reclaim the page. 511 * ahead and try to reclaim the page.
500 */ 512 */
501 if (TestSetPageLocked(page)) 513 if (TestSetPageLocked(page))
502 goto keep; 514 goto keep;
503 if (PageDirty(page) || PageWriteback(page)) 515 if (PageDirty(page) || PageWriteback(page))
504 goto keep_locked; 516 goto keep_locked;
505 mapping = page_mapping(page); 517 mapping = page_mapping(page);
506 case PAGE_CLEAN: 518 case PAGE_CLEAN:
507 ; /* try to free the page below */ 519 ; /* try to free the page below */
508 } 520 }
509 } 521 }
510 522
511 /* 523 /*
512 * If the page has buffers, try to free the buffer mappings 524 * If the page has buffers, try to free the buffer mappings
513 * associated with this page. If we succeed we try to free 525 * associated with this page. If we succeed we try to free
514 * the page as well. 526 * the page as well.
515 * 527 *
516 * We do this even if the page is PageDirty(). 528 * We do this even if the page is PageDirty().
517 * try_to_release_page() does not perform I/O, but it is 529 * try_to_release_page() does not perform I/O, but it is
518 * possible for a page to have PageDirty set, but it is actually 530 * possible for a page to have PageDirty set, but it is actually
519 * clean (all its buffers are clean). This happens if the 531 * clean (all its buffers are clean). This happens if the
520 * buffers were written out directly, with submit_bh(). ext3 532 * buffers were written out directly, with submit_bh(). ext3
521 * will do this, as well as the blockdev mapping. 533 * will do this, as well as the blockdev mapping.
522 * try_to_release_page() will discover that cleanness and will 534 * try_to_release_page() will discover that cleanness and will
523 * drop the buffers and mark the page clean - it can be freed. 535 * drop the buffers and mark the page clean - it can be freed.
524 * 536 *
525 * Rarely, pages can have buffers and no ->mapping. These are 537 * Rarely, pages can have buffers and no ->mapping. These are
526 * the pages which were not successfully invalidated in 538 * the pages which were not successfully invalidated in
527 * truncate_complete_page(). We try to drop those buffers here 539 * truncate_complete_page(). We try to drop those buffers here
528 * and if that worked, and the page is no longer mapped into 540 * and if that worked, and the page is no longer mapped into
529 * process address space (page_count == 1) it can be freed. 541 * process address space (page_count == 1) it can be freed.
530 * Otherwise, leave the page on the LRU so it is swappable. 542 * Otherwise, leave the page on the LRU so it is swappable.
531 */ 543 */
532 if (PagePrivate(page)) { 544 if (PagePrivate(page)) {
533 if (!try_to_release_page(page, sc->gfp_mask)) 545 if (!try_to_release_page(page, sc->gfp_mask))
534 goto activate_locked; 546 goto activate_locked;
535 if (!mapping && page_count(page) == 1) 547 if (!mapping && page_count(page) == 1)
536 goto free_it; 548 goto free_it;
537 } 549 }
538 550
539 if (!remove_mapping(mapping, page)) 551 if (!remove_mapping(mapping, page))
540 goto keep_locked; 552 goto keep_locked;
541 553
542 free_it: 554 free_it:
543 unlock_page(page); 555 unlock_page(page);
544 nr_reclaimed++; 556 nr_reclaimed++;
545 if (!pagevec_add(&freed_pvec, page)) 557 if (!pagevec_add(&freed_pvec, page))
546 __pagevec_release_nonlru(&freed_pvec); 558 __pagevec_release_nonlru(&freed_pvec);
547 continue; 559 continue;
548 560
549 activate_locked: 561 activate_locked:
550 SetPageActive(page); 562 SetPageActive(page);
551 pgactivate++; 563 pgactivate++;
552 keep_locked: 564 keep_locked:
553 unlock_page(page); 565 unlock_page(page);
554 keep: 566 keep:
555 list_add(&page->lru, &ret_pages); 567 list_add(&page->lru, &ret_pages);
556 BUG_ON(PageLRU(page)); 568 BUG_ON(PageLRU(page));
557 } 569 }
558 list_splice(&ret_pages, page_list); 570 list_splice(&ret_pages, page_list);
559 if (pagevec_count(&freed_pvec)) 571 if (pagevec_count(&freed_pvec))
560 __pagevec_release_nonlru(&freed_pvec); 572 __pagevec_release_nonlru(&freed_pvec);
561 mod_page_state(pgactivate, pgactivate); 573 mod_page_state(pgactivate, pgactivate);
562 return nr_reclaimed; 574 return nr_reclaimed;
563 } 575 }
564 576
565 /* 577 /*
566 * zone->lru_lock is heavily contended. Some of the functions that 578 * zone->lru_lock is heavily contended. Some of the functions that
567 * shrink the lists perform better by taking out a batch of pages 579 * shrink the lists perform better by taking out a batch of pages
568 * and working on them outside the LRU lock. 580 * and working on them outside the LRU lock.
569 * 581 *
570 * For pagecache intensive workloads, this function is the hottest 582 * For pagecache intensive workloads, this function is the hottest
571 * spot in the kernel (apart from copy_*_user functions). 583 * spot in the kernel (apart from copy_*_user functions).
572 * 584 *
573 * Appropriate locks must be held before calling this function. 585 * Appropriate locks must be held before calling this function.
574 * 586 *
575 * @nr_to_scan: The number of pages to look through on the list. 587 * @nr_to_scan: The number of pages to look through on the list.
576 * @src: The LRU list to pull pages off. 588 * @src: The LRU list to pull pages off.
577 * @dst: The temp list to put pages on to. 589 * @dst: The temp list to put pages on to.
578 * @scanned: The number of pages that were scanned. 590 * @scanned: The number of pages that were scanned.
579 * 591 *
580 * returns how many pages were moved onto *@dst. 592 * returns how many pages were moved onto *@dst.
581 */ 593 */
582 static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 594 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
583 struct list_head *src, struct list_head *dst, 595 struct list_head *src, struct list_head *dst,
584 unsigned long *scanned) 596 unsigned long *scanned)
585 { 597 {
586 unsigned long nr_taken = 0; 598 unsigned long nr_taken = 0;
587 struct page *page; 599 struct page *page;
588 unsigned long scan; 600 unsigned long scan;
589 601
590 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 602 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
591 struct list_head *target; 603 struct list_head *target;
592 page = lru_to_page(src); 604 page = lru_to_page(src);
593 prefetchw_prev_lru_page(page, src, flags); 605 prefetchw_prev_lru_page(page, src, flags);
594 606
595 BUG_ON(!PageLRU(page)); 607 BUG_ON(!PageLRU(page));
596 608
597 list_del(&page->lru); 609 list_del(&page->lru);
598 target = src; 610 target = src;
599 if (likely(get_page_unless_zero(page))) { 611 if (likely(get_page_unless_zero(page))) {
600 /* 612 /*
601 * Be careful not to clear PageLRU until after we're 613 * Be careful not to clear PageLRU until after we're
602 * sure the page is not being freed elsewhere -- the 614 * sure the page is not being freed elsewhere -- the
603 * page release code relies on it. 615 * page release code relies on it.
604 */ 616 */
605 ClearPageLRU(page); 617 ClearPageLRU(page);
606 target = dst; 618 target = dst;
607 nr_taken++; 619 nr_taken++;
608 } /* else it is being freed elsewhere */ 620 } /* else it is being freed elsewhere */
609 621
610 list_add(&page->lru, target); 622 list_add(&page->lru, target);
611 } 623 }
612 624
613 *scanned = scan; 625 *scanned = scan;
614 return nr_taken; 626 return nr_taken;
615 } 627 }
616 628
617 /* 629 /*
618 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 630 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
619 * of reclaimed pages 631 * of reclaimed pages
620 */ 632 */
621 static unsigned long shrink_inactive_list(unsigned long max_scan, 633 static unsigned long shrink_inactive_list(unsigned long max_scan,
622 struct zone *zone, struct scan_control *sc) 634 struct zone *zone, struct scan_control *sc)
623 { 635 {
624 LIST_HEAD(page_list); 636 LIST_HEAD(page_list);
625 struct pagevec pvec; 637 struct pagevec pvec;
626 unsigned long nr_scanned = 0; 638 unsigned long nr_scanned = 0;
627 unsigned long nr_reclaimed = 0; 639 unsigned long nr_reclaimed = 0;
628 640
629 pagevec_init(&pvec, 1); 641 pagevec_init(&pvec, 1);
630 642
631 lru_add_drain(); 643 lru_add_drain();
632 spin_lock_irq(&zone->lru_lock); 644 spin_lock_irq(&zone->lru_lock);
633 do { 645 do {
634 struct page *page; 646 struct page *page;
635 unsigned long nr_taken; 647 unsigned long nr_taken;
636 unsigned long nr_scan; 648 unsigned long nr_scan;
637 unsigned long nr_freed; 649 unsigned long nr_freed;
638 650
639 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 651 nr_taken = isolate_lru_pages(sc->swap_cluster_max,
640 &zone->inactive_list, 652 &zone->inactive_list,
641 &page_list, &nr_scan); 653 &page_list, &nr_scan);
642 zone->nr_inactive -= nr_taken; 654 zone->nr_inactive -= nr_taken;
643 zone->pages_scanned += nr_scan; 655 zone->pages_scanned += nr_scan;
644 spin_unlock_irq(&zone->lru_lock); 656 spin_unlock_irq(&zone->lru_lock);
645 657
646 nr_scanned += nr_scan; 658 nr_scanned += nr_scan;
647 nr_freed = shrink_page_list(&page_list, sc); 659 nr_freed = shrink_page_list(&page_list, sc);
648 nr_reclaimed += nr_freed; 660 nr_reclaimed += nr_freed;
649 local_irq_disable(); 661 local_irq_disable();
650 if (current_is_kswapd()) { 662 if (current_is_kswapd()) {
651 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); 663 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
652 __mod_page_state(kswapd_steal, nr_freed); 664 __mod_page_state(kswapd_steal, nr_freed);
653 } else 665 } else
654 __mod_page_state_zone(zone, pgscan_direct, nr_scan); 666 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
655 __mod_page_state_zone(zone, pgsteal, nr_freed); 667 __mod_page_state_zone(zone, pgsteal, nr_freed);
656 668
657 if (nr_taken == 0) 669 if (nr_taken == 0)
658 goto done; 670 goto done;
659 671
660 spin_lock(&zone->lru_lock); 672 spin_lock(&zone->lru_lock);
661 /* 673 /*
662 * Put back any unfreeable pages. 674 * Put back any unfreeable pages.
663 */ 675 */
664 while (!list_empty(&page_list)) { 676 while (!list_empty(&page_list)) {
665 page = lru_to_page(&page_list); 677 page = lru_to_page(&page_list);
666 BUG_ON(PageLRU(page)); 678 BUG_ON(PageLRU(page));
667 SetPageLRU(page); 679 SetPageLRU(page);
668 list_del(&page->lru); 680 list_del(&page->lru);
669 if (PageActive(page)) 681 if (PageActive(page))
670 add_page_to_active_list(zone, page); 682 add_page_to_active_list(zone, page);
671 else 683 else
672 add_page_to_inactive_list(zone, page); 684 add_page_to_inactive_list(zone, page);
673 if (!pagevec_add(&pvec, page)) { 685 if (!pagevec_add(&pvec, page)) {
674 spin_unlock_irq(&zone->lru_lock); 686 spin_unlock_irq(&zone->lru_lock);
675 __pagevec_release(&pvec); 687 __pagevec_release(&pvec);
676 spin_lock_irq(&zone->lru_lock); 688 spin_lock_irq(&zone->lru_lock);
677 } 689 }
678 } 690 }
679 } while (nr_scanned < max_scan); 691 } while (nr_scanned < max_scan);
680 spin_unlock(&zone->lru_lock); 692 spin_unlock(&zone->lru_lock);
681 done: 693 done:
682 local_irq_enable(); 694 local_irq_enable();
683 pagevec_release(&pvec); 695 pagevec_release(&pvec);
684 return nr_reclaimed; 696 return nr_reclaimed;
685 } 697 }
686 698
687 /* 699 /*
688 * This moves pages from the active list to the inactive list. 700 * This moves pages from the active list to the inactive list.
689 * 701 *
690 * We move them the other way if the page is referenced by one or more 702 * We move them the other way if the page is referenced by one or more
691 * processes, from rmap. 703 * processes, from rmap.
692 * 704 *
693 * If the pages are mostly unmapped, the processing is fast and it is 705 * If the pages are mostly unmapped, the processing is fast and it is
694 * appropriate to hold zone->lru_lock across the whole operation. But if 706 * appropriate to hold zone->lru_lock across the whole operation. But if
695 * the pages are mapped, the processing is slow (page_referenced()) so we 707 * the pages are mapped, the processing is slow (page_referenced()) so we
696 * should drop zone->lru_lock around each page. It's impossible to balance 708 * should drop zone->lru_lock around each page. It's impossible to balance
697 * this, so instead we remove the pages from the LRU while processing them. 709 * this, so instead we remove the pages from the LRU while processing them.
698 * It is safe to rely on PG_active against the non-LRU pages in here because 710 * It is safe to rely on PG_active against the non-LRU pages in here because
699 * nobody will play with that bit on a non-LRU page. 711 * nobody will play with that bit on a non-LRU page.
700 * 712 *
701 * The downside is that we have to touch page->_count against each page. 713 * The downside is that we have to touch page->_count against each page.
702 * But we had to alter page->flags anyway. 714 * But we had to alter page->flags anyway.
703 */ 715 */
704 static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 716 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
705 struct scan_control *sc) 717 struct scan_control *sc)
706 { 718 {
707 unsigned long pgmoved; 719 unsigned long pgmoved;
708 int pgdeactivate = 0; 720 int pgdeactivate = 0;
709 unsigned long pgscanned; 721 unsigned long pgscanned;
710 LIST_HEAD(l_hold); /* The pages which were snipped off */ 722 LIST_HEAD(l_hold); /* The pages which were snipped off */
711 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 723 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
712 LIST_HEAD(l_active); /* Pages to go onto the active_list */ 724 LIST_HEAD(l_active); /* Pages to go onto the active_list */
713 struct page *page; 725 struct page *page;
714 struct pagevec pvec; 726 struct pagevec pvec;
715 int reclaim_mapped = 0; 727 int reclaim_mapped = 0;
716 728
717 if (sc->may_swap) { 729 if (sc->may_swap) {
718 long mapped_ratio; 730 long mapped_ratio;
719 long distress; 731 long distress;
720 long swap_tendency; 732 long swap_tendency;
721 733
722 /* 734 /*
723 * `distress' is a measure of how much trouble we're having 735 * `distress' is a measure of how much trouble we're having
724 * reclaiming pages. 0 -> no problems. 100 -> great trouble. 736 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
725 */ 737 */
726 distress = 100 >> zone->prev_priority; 738 distress = 100 >> zone->prev_priority;
727 739
728 /* 740 /*
729 * The point of this algorithm is to decide when to start 741 * The point of this algorithm is to decide when to start
730 * reclaiming mapped memory instead of just pagecache. Work out 742 * reclaiming mapped memory instead of just pagecache. Work out
731 * how much memory 743 * how much memory
732 * is mapped. 744 * is mapped.
733 */ 745 */
734 mapped_ratio = (sc->nr_mapped * 100) / total_memory; 746 mapped_ratio = (sc->nr_mapped * 100) / total_memory;
735 747
736 /* 748 /*
737 * Now decide how much we really want to unmap some pages. The 749 * Now decide how much we really want to unmap some pages. The
738 * mapped ratio is downgraded - just because there's a lot of 750 * mapped ratio is downgraded - just because there's a lot of
739 * mapped memory doesn't necessarily mean that page reclaim 751 * mapped memory doesn't necessarily mean that page reclaim
740 * isn't succeeding. 752 * isn't succeeding.
741 * 753 *
742 * The distress ratio is important - we don't want to start 754 * The distress ratio is important - we don't want to start
743 * going oom. 755 * going oom.
744 * 756 *
745 * A 100% value of vm_swappiness overrides this algorithm 757 * A 100% value of vm_swappiness overrides this algorithm
746 * altogether. 758 * altogether.
747 */ 759 */
748 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; 760 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
749 761
750 /* 762 /*
751 * Now use this metric to decide whether to start moving mapped 763 * Now use this metric to decide whether to start moving mapped
752 * memory onto the inactive list. 764 * memory onto the inactive list.
753 */ 765 */
754 if (swap_tendency >= 100) 766 if (swap_tendency >= 100)
755 reclaim_mapped = 1; 767 reclaim_mapped = 1;
756 } 768 }
757 769
758 lru_add_drain(); 770 lru_add_drain();
759 spin_lock_irq(&zone->lru_lock); 771 spin_lock_irq(&zone->lru_lock);
760 pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, 772 pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
761 &l_hold, &pgscanned); 773 &l_hold, &pgscanned);
762 zone->pages_scanned += pgscanned; 774 zone->pages_scanned += pgscanned;
763 zone->nr_active -= pgmoved; 775 zone->nr_active -= pgmoved;
764 spin_unlock_irq(&zone->lru_lock); 776 spin_unlock_irq(&zone->lru_lock);
765 777
766 while (!list_empty(&l_hold)) { 778 while (!list_empty(&l_hold)) {
767 cond_resched(); 779 cond_resched();
768 page = lru_to_page(&l_hold); 780 page = lru_to_page(&l_hold);
769 list_del(&page->lru); 781 list_del(&page->lru);
770 if (page_mapped(page)) { 782 if (page_mapped(page)) {
771 if (!reclaim_mapped || 783 if (!reclaim_mapped ||
772 (total_swap_pages == 0 && PageAnon(page)) || 784 (total_swap_pages == 0 && PageAnon(page)) ||
773 page_referenced(page, 0)) { 785 page_referenced(page, 0)) {
774 list_add(&page->lru, &l_active); 786 list_add(&page->lru, &l_active);
775 continue; 787 continue;
776 } 788 }
777 } 789 }
778 list_add(&page->lru, &l_inactive); 790 list_add(&page->lru, &l_inactive);
779 } 791 }
780 792
781 pagevec_init(&pvec, 1); 793 pagevec_init(&pvec, 1);
782 pgmoved = 0; 794 pgmoved = 0;
783 spin_lock_irq(&zone->lru_lock); 795 spin_lock_irq(&zone->lru_lock);
784 while (!list_empty(&l_inactive)) { 796 while (!list_empty(&l_inactive)) {
785 page = lru_to_page(&l_inactive); 797 page = lru_to_page(&l_inactive);
786 prefetchw_prev_lru_page(page, &l_inactive, flags); 798 prefetchw_prev_lru_page(page, &l_inactive, flags);
787 BUG_ON(PageLRU(page)); 799 BUG_ON(PageLRU(page));
788 SetPageLRU(page); 800 SetPageLRU(page);
789 BUG_ON(!PageActive(page)); 801 BUG_ON(!PageActive(page));
790 ClearPageActive(page); 802 ClearPageActive(page);
791 803
792 list_move(&page->lru, &zone->inactive_list); 804 list_move(&page->lru, &zone->inactive_list);
793 pgmoved++; 805 pgmoved++;
794 if (!pagevec_add(&pvec, page)) { 806 if (!pagevec_add(&pvec, page)) {
795 zone->nr_inactive += pgmoved; 807 zone->nr_inactive += pgmoved;
796 spin_unlock_irq(&zone->lru_lock); 808 spin_unlock_irq(&zone->lru_lock);
797 pgdeactivate += pgmoved; 809 pgdeactivate += pgmoved;
798 pgmoved = 0; 810 pgmoved = 0;
799 if (buffer_heads_over_limit) 811 if (buffer_heads_over_limit)
800 pagevec_strip(&pvec); 812 pagevec_strip(&pvec);
801 __pagevec_release(&pvec); 813 __pagevec_release(&pvec);
802 spin_lock_irq(&zone->lru_lock); 814 spin_lock_irq(&zone->lru_lock);
803 } 815 }
804 } 816 }
805 zone->nr_inactive += pgmoved; 817 zone->nr_inactive += pgmoved;
806 pgdeactivate += pgmoved; 818 pgdeactivate += pgmoved;
807 if (buffer_heads_over_limit) { 819 if (buffer_heads_over_limit) {
808 spin_unlock_irq(&zone->lru_lock); 820 spin_unlock_irq(&zone->lru_lock);
809 pagevec_strip(&pvec); 821 pagevec_strip(&pvec);
810 spin_lock_irq(&zone->lru_lock); 822 spin_lock_irq(&zone->lru_lock);
811 } 823 }
812 824
813 pgmoved = 0; 825 pgmoved = 0;
814 while (!list_empty(&l_active)) { 826 while (!list_empty(&l_active)) {
815 page = lru_to_page(&l_active); 827 page = lru_to_page(&l_active);
816 prefetchw_prev_lru_page(page, &l_active, flags); 828 prefetchw_prev_lru_page(page, &l_active, flags);
817 BUG_ON(PageLRU(page)); 829 BUG_ON(PageLRU(page));
818 SetPageLRU(page); 830 SetPageLRU(page);
819 BUG_ON(!PageActive(page)); 831 BUG_ON(!PageActive(page));
820 list_move(&page->lru, &zone->active_list); 832 list_move(&page->lru, &zone->active_list);
821 pgmoved++; 833 pgmoved++;
822 if (!pagevec_add(&pvec, page)) { 834 if (!pagevec_add(&pvec, page)) {
823 zone->nr_active += pgmoved; 835 zone->nr_active += pgmoved;
824 pgmoved = 0; 836 pgmoved = 0;
825 spin_unlock_irq(&zone->lru_lock); 837 spin_unlock_irq(&zone->lru_lock);
826 __pagevec_release(&pvec); 838 __pagevec_release(&pvec);
827 spin_lock_irq(&zone->lru_lock); 839 spin_lock_irq(&zone->lru_lock);
828 } 840 }
829 } 841 }
830 zone->nr_active += pgmoved; 842 zone->nr_active += pgmoved;
831 spin_unlock(&zone->lru_lock); 843 spin_unlock(&zone->lru_lock);
832 844
833 __mod_page_state_zone(zone, pgrefill, pgscanned); 845 __mod_page_state_zone(zone, pgrefill, pgscanned);
834 __mod_page_state(pgdeactivate, pgdeactivate); 846 __mod_page_state(pgdeactivate, pgdeactivate);
835 local_irq_enable(); 847 local_irq_enable();
836 848
837 pagevec_release(&pvec); 849 pagevec_release(&pvec);
838 } 850 }
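
A back-of-the-envelope sketch of the swap_tendency heuristic in shrink_active_list() above, assuming hypothetical inputs: prev_priority of 6, a quarter of memory mapped, and the default swappiness of 60.

#include <stdio.h>

int main(void)
{
	long prev_priority = 6;				/* hypothetical zone history */
	long nr_mapped = 25000, total_memory = 100000;	/* i.e. 25% of memory mapped */
	long swappiness = 60;				/* the default vm_swappiness */

	long distress = 100 >> prev_priority;			/* 100 >> 6 == 1     */
	long mapped_ratio = (nr_mapped * 100) / total_memory;	/* == 25             */
	long swap_tendency = mapped_ratio / 2 + distress + swappiness;

	/* 12 + 1 + 60 == 73 < 100, so mapped pages stay on the active list. */
	printf("swap_tendency = %ld, reclaim_mapped = %d\n",
	       swap_tendency, swap_tendency >= 100 ? 1 : 0);
	return 0;
}
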
839 851
840 /* 852 /*
841 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 853 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
842 */ 854 */
843 static unsigned long shrink_zone(int priority, struct zone *zone, 855 static unsigned long shrink_zone(int priority, struct zone *zone,
844 struct scan_control *sc) 856 struct scan_control *sc)
845 { 857 {
846 unsigned long nr_active; 858 unsigned long nr_active;
847 unsigned long nr_inactive; 859 unsigned long nr_inactive;
848 unsigned long nr_to_scan; 860 unsigned long nr_to_scan;
849 unsigned long nr_reclaimed = 0; 861 unsigned long nr_reclaimed = 0;
850 862
851 atomic_inc(&zone->reclaim_in_progress); 863 atomic_inc(&zone->reclaim_in_progress);
852 864
853 /* 865 /*
854 * Add one to `nr_to_scan' just to make sure that the kernel will 866 * Add one to `nr_to_scan' just to make sure that the kernel will
855 * slowly sift through the active list. 867 * slowly sift through the active list.
856 */ 868 */
857 zone->nr_scan_active += (zone->nr_active >> priority) + 1; 869 zone->nr_scan_active += (zone->nr_active >> priority) + 1;
858 nr_active = zone->nr_scan_active; 870 nr_active = zone->nr_scan_active;
859 if (nr_active >= sc->swap_cluster_max) 871 if (nr_active >= sc->swap_cluster_max)
860 zone->nr_scan_active = 0; 872 zone->nr_scan_active = 0;
861 else 873 else
862 nr_active = 0; 874 nr_active = 0;
863 875
864 zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; 876 zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
865 nr_inactive = zone->nr_scan_inactive; 877 nr_inactive = zone->nr_scan_inactive;
866 if (nr_inactive >= sc->swap_cluster_max) 878 if (nr_inactive >= sc->swap_cluster_max)
867 zone->nr_scan_inactive = 0; 879 zone->nr_scan_inactive = 0;
868 else 880 else
869 nr_inactive = 0; 881 nr_inactive = 0;
870 882
871 while (nr_active || nr_inactive) { 883 while (nr_active || nr_inactive) {
872 if (nr_active) { 884 if (nr_active) {
873 nr_to_scan = min(nr_active, 885 nr_to_scan = min(nr_active,
874 (unsigned long)sc->swap_cluster_max); 886 (unsigned long)sc->swap_cluster_max);
875 nr_active -= nr_to_scan; 887 nr_active -= nr_to_scan;
876 shrink_active_list(nr_to_scan, zone, sc); 888 shrink_active_list(nr_to_scan, zone, sc);
877 } 889 }
878 890
879 if (nr_inactive) { 891 if (nr_inactive) {
880 nr_to_scan = min(nr_inactive, 892 nr_to_scan = min(nr_inactive,
881 (unsigned long)sc->swap_cluster_max); 893 (unsigned long)sc->swap_cluster_max);
882 nr_inactive -= nr_to_scan; 894 nr_inactive -= nr_to_scan;
883 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, 895 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
884 sc); 896 sc);
885 } 897 }
886 } 898 }
887 899
888 throttle_vm_writeout(); 900 throttle_vm_writeout();
889 901
890 atomic_dec(&zone->reclaim_in_progress); 902 atomic_dec(&zone->reclaim_in_progress);
891 return nr_reclaimed; 903 return nr_reclaimed;
892 } 904 }
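
The ">> priority" batching in shrink_zone() above only submits work once at least SWAP_CLUSTER_MAX pages have accumulated; a small sketch, assuming a hypothetical zone of 50000 inactive pages scanned repeatedly at DEF_PRIORITY (12), shows the deferral.

#include <stdio.h>

/* Hypothetical zone with 50000 inactive pages, scanned at DEF_PRIORITY (12). */
int main(void)
{
	unsigned long nr_inactive = 50000, nr_scan_inactive = 0;
	unsigned long swap_cluster_max = 32;	/* SWAP_CLUSTER_MAX */
	int priority = 12, pass;

	for (pass = 1; pass <= 4; pass++) {
		/* Each pass adds (nr_inactive >> priority) + 1 == 13 to the count. */
		nr_scan_inactive += (nr_inactive >> priority) + 1;
		if (nr_scan_inactive >= swap_cluster_max) {
			printf("pass %d: scan %lu pages\n", pass, nr_scan_inactive);
			nr_scan_inactive = 0;
		} else {
			printf("pass %d: defer (%lu accumulated)\n", pass,
			       nr_scan_inactive);
		}
	}
	return 0;
}
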
893 905
894 /* 906 /*
895 * This is the direct reclaim path, for page-allocating processes. We only 907 * This is the direct reclaim path, for page-allocating processes. We only
896 * try to reclaim pages from zones which will satisfy the caller's allocation 908 * try to reclaim pages from zones which will satisfy the caller's allocation
897 * request. 909 * request.
898 * 910 *
899 * We reclaim from a zone even if that zone is over pages_high. Because: 911 * We reclaim from a zone even if that zone is over pages_high. Because:
900 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 912 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
901 * allocation or 913 * allocation or
902 * b) The zones may be over pages_high but they must go *over* pages_high to 914 * b) The zones may be over pages_high but they must go *over* pages_high to
903 * satisfy the `incremental min' zone defense algorithm. 915 * satisfy the `incremental min' zone defense algorithm.
904 * 916 *
905 * Returns the number of reclaimed pages. 917 * Returns the number of reclaimed pages.
906 * 918 *
907 * If a zone is deemed to be full of pinned pages then just give it a light 919 * If a zone is deemed to be full of pinned pages then just give it a light
908 * scan and then give up on it. 920 * scan and then give up on it.
909 */ 921 */
910 static unsigned long shrink_zones(int priority, struct zone **zones, 922 static unsigned long shrink_zones(int priority, struct zone **zones,
911 struct scan_control *sc) 923 struct scan_control *sc)
912 { 924 {
913 unsigned long nr_reclaimed = 0; 925 unsigned long nr_reclaimed = 0;
914 int i; 926 int i;
915 927
916 for (i = 0; zones[i] != NULL; i++) { 928 for (i = 0; zones[i] != NULL; i++) {
917 struct zone *zone = zones[i]; 929 struct zone *zone = zones[i];
918 930
919 if (!populated_zone(zone)) 931 if (!populated_zone(zone))
920 continue; 932 continue;
921 933
922 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 934 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
923 continue; 935 continue;
924 936
925 zone->temp_priority = priority; 937 zone->temp_priority = priority;
926 if (zone->prev_priority > priority) 938 if (zone->prev_priority > priority)
927 zone->prev_priority = priority; 939 zone->prev_priority = priority;
928 940
929 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 941 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
930 continue; /* Let kswapd poll it */ 942 continue; /* Let kswapd poll it */
931 943
932 nr_reclaimed += shrink_zone(priority, zone, sc); 944 nr_reclaimed += shrink_zone(priority, zone, sc);
933 } 945 }
934 return nr_reclaimed; 946 return nr_reclaimed;
935 } 947 }
936 948
937 /* 949 /*
938 * This is the main entry point to direct page reclaim. 950 * This is the main entry point to direct page reclaim.
939 * 951 *
940 * If a full scan of the inactive list fails to free enough memory then we 952 * If a full scan of the inactive list fails to free enough memory then we
941 * are "out of memory" and something needs to be killed. 953 * are "out of memory" and something needs to be killed.
942 * 954 *
943 * If the caller is !__GFP_FS then the probability of a failure is reasonably 955 * If the caller is !__GFP_FS then the probability of a failure is reasonably
944 * high - the zone may be full of dirty or under-writeback pages, which this 956 * high - the zone may be full of dirty or under-writeback pages, which this
945 * caller can't do much about. We kick pdflush and take explicit naps in the 957 * caller can't do much about. We kick pdflush and take explicit naps in the
946 * hope that some of these pages can be written. But if the allocating task 958 * hope that some of these pages can be written. But if the allocating task
947 * holds filesystem locks which prevent writeout this might not work, and the 959 * holds filesystem locks which prevent writeout this might not work, and the
948 * allocation attempt will fail. 960 * allocation attempt will fail.
949 */ 961 */
950 unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) 962 unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
951 { 963 {
952 int priority; 964 int priority;
953 int ret = 0; 965 int ret = 0;
954 unsigned long total_scanned = 0; 966 unsigned long total_scanned = 0;
955 unsigned long nr_reclaimed = 0; 967 unsigned long nr_reclaimed = 0;
956 struct reclaim_state *reclaim_state = current->reclaim_state; 968 struct reclaim_state *reclaim_state = current->reclaim_state;
957 unsigned long lru_pages = 0; 969 unsigned long lru_pages = 0;
958 int i; 970 int i;
959 struct scan_control sc = { 971 struct scan_control sc = {
960 .gfp_mask = gfp_mask, 972 .gfp_mask = gfp_mask,
961 .may_writepage = !laptop_mode, 973 .may_writepage = !laptop_mode,
962 .swap_cluster_max = SWAP_CLUSTER_MAX, 974 .swap_cluster_max = SWAP_CLUSTER_MAX,
963 .may_swap = 1, 975 .may_swap = 1,
964 .swappiness = vm_swappiness, 976 .swappiness = vm_swappiness,
965 }; 977 };
966 978
967 inc_page_state(allocstall); 979 inc_page_state(allocstall);
968 980
969 for (i = 0; zones[i] != NULL; i++) { 981 for (i = 0; zones[i] != NULL; i++) {
970 struct zone *zone = zones[i]; 982 struct zone *zone = zones[i];
971 983
972 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 984 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
973 continue; 985 continue;
974 986
975 zone->temp_priority = DEF_PRIORITY; 987 zone->temp_priority = DEF_PRIORITY;
976 lru_pages += zone->nr_active + zone->nr_inactive; 988 lru_pages += zone->nr_active + zone->nr_inactive;
977 } 989 }
978 990
979 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 991 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
980 sc.nr_mapped = read_page_state(nr_mapped); 992 sc.nr_mapped = read_page_state(nr_mapped);
981 sc.nr_scanned = 0; 993 sc.nr_scanned = 0;
982 if (!priority) 994 if (!priority)
983 disable_swap_token(); 995 disable_swap_token();
984 nr_reclaimed += shrink_zones(priority, zones, &sc); 996 nr_reclaimed += shrink_zones(priority, zones, &sc);
985 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 997 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
986 if (reclaim_state) { 998 if (reclaim_state) {
987 nr_reclaimed += reclaim_state->reclaimed_slab; 999 nr_reclaimed += reclaim_state->reclaimed_slab;
988 reclaim_state->reclaimed_slab = 0; 1000 reclaim_state->reclaimed_slab = 0;
989 } 1001 }
990 total_scanned += sc.nr_scanned; 1002 total_scanned += sc.nr_scanned;
991 if (nr_reclaimed >= sc.swap_cluster_max) { 1003 if (nr_reclaimed >= sc.swap_cluster_max) {
992 ret = 1; 1004 ret = 1;
993 goto out; 1005 goto out;
994 } 1006 }
995 1007
996 /* 1008 /*
997 * Try to write back as many pages as we just scanned. This 1009 * Try to write back as many pages as we just scanned. This
998 * tends to cause slow streaming writers to write data to the 1010 * tends to cause slow streaming writers to write data to the
999 * disk smoothly, at the dirtying rate, which is nice. But 1011 * disk smoothly, at the dirtying rate, which is nice. But
1000 * that's undesirable in laptop mode, where we *want* lumpy 1012 * that's undesirable in laptop mode, where we *want* lumpy
1001 * writeout. So in laptop mode, write out the whole world. 1013 * writeout. So in laptop mode, write out the whole world.
1002 */ 1014 */
1003 if (total_scanned > sc.swap_cluster_max + 1015 if (total_scanned > sc.swap_cluster_max +
1004 sc.swap_cluster_max / 2) { 1016 sc.swap_cluster_max / 2) {
1005 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1017 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1006 sc.may_writepage = 1; 1018 sc.may_writepage = 1;
1007 } 1019 }
1008 1020
1009 /* Take a nap, wait for some writeback to complete */ 1021 /* Take a nap, wait for some writeback to complete */
1010 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1022 if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
1011 blk_congestion_wait(WRITE, HZ/10); 1023 blk_congestion_wait(WRITE, HZ/10);
1012 } 1024 }
1013 out: 1025 out:
1014 for (i = 0; zones[i] != 0; i++) { 1026 for (i = 0; zones[i] != 0; i++) {
1015 struct zone *zone = zones[i]; 1027 struct zone *zone = zones[i];
1016 1028
1017 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1029 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1018 continue; 1030 continue;
1019 1031
1020 zone->prev_priority = zone->temp_priority; 1032 zone->prev_priority = zone->temp_priority;
1021 } 1033 }
1022 return ret; 1034 return ret;
1023 } 1035 }
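
The writeback kick in the priority loop above fires once roughly one and a half clusters' worth of pages have been scanned. A small sketch, assuming SWAP_CLUSTER_MAX is 32 and using fake_wakeup_pdflush() as a stand-in for the real wakeup_pdflush(), shows the laptop-mode special case where a request of 0 pages means "write everything".

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

/* Stand-in for wakeup_pdflush(): just report what would be requested. */
static void fake_wakeup_pdflush(unsigned long nr_pages)
{
	if (nr_pages == 0)
		printf("laptop mode: write out the whole world\n");
	else
		printf("ask pdflush to write back ~%lu pages\n", nr_pages);
}

int main(void)
{
	unsigned long total_scanned = 50;	/* pages scanned so far this call */
	int laptop_mode = 0;			/* assume we are not on battery   */

	/* 50 > 32 + 16, so reclaim asks for background writeback help. */
	if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX / 2)
		fake_wakeup_pdflush(laptop_mode ? 0 : total_scanned);
	return 0;
}
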
1024 1036
1025 /* 1037 /*
1026 * For kswapd, balance_pgdat() will work across all this node's zones until 1038 * For kswapd, balance_pgdat() will work across all this node's zones until
1027 * they are all at pages_high. 1039 * they are all at pages_high.
1028 * 1040 *
1029 * Returns the number of pages which were actually freed. 1041 * Returns the number of pages which were actually freed.
1030 * 1042 *
1031 * There is special handling here for zones which are full of pinned pages. 1043 * There is special handling here for zones which are full of pinned pages.
1032 * This can happen if the pages are all mlocked, or if they are all used by 1044 * This can happen if the pages are all mlocked, or if they are all used by
1033 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. 1045 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
1034 * What we do is to detect the case where all pages in the zone have been 1046 * What we do is to detect the case where all pages in the zone have been
1035 * scanned twice and there has been zero successful reclaim. Mark the zone as 1047 * scanned twice and there has been zero successful reclaim. Mark the zone as
1036 * dead and from now on, only perform a short scan. Basically we're polling 1048 * dead and from now on, only perform a short scan. Basically we're polling
1037 * the zone for when the problem goes away. 1049 * the zone for when the problem goes away.
1038 * 1050 *
1039 * kswapd scans the zones in the highmem->normal->dma direction. It skips 1051 * kswapd scans the zones in the highmem->normal->dma direction. It skips
1040 * zones which have free_pages > pages_high, but once a zone is found to have 1052 * zones which have free_pages > pages_high, but once a zone is found to have
1041 * free_pages <= pages_high, we scan that zone and the lower zones regardless 1053 * free_pages <= pages_high, we scan that zone and the lower zones regardless
1042 * of the number of free pages in the lower zones. This interoperates with 1054 * of the number of free pages in the lower zones. This interoperates with
1043 * the page allocator fallback scheme to ensure that aging of pages is balanced 1055 * the page allocator fallback scheme to ensure that aging of pages is balanced
1044 * across the zones. 1056 * across the zones.
1045 */ 1057 */
1046 static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 1058 static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1047 { 1059 {
1048 int all_zones_ok; 1060 int all_zones_ok;
1049 int priority; 1061 int priority;
1050 int i; 1062 int i;
1051 unsigned long total_scanned; 1063 unsigned long total_scanned;
1052 unsigned long nr_reclaimed; 1064 unsigned long nr_reclaimed;
1053 struct reclaim_state *reclaim_state = current->reclaim_state; 1065 struct reclaim_state *reclaim_state = current->reclaim_state;
1054 struct scan_control sc = { 1066 struct scan_control sc = {
1055 .gfp_mask = GFP_KERNEL, 1067 .gfp_mask = GFP_KERNEL,
1056 .may_swap = 1, 1068 .may_swap = 1,
1057 .swap_cluster_max = SWAP_CLUSTER_MAX, 1069 .swap_cluster_max = SWAP_CLUSTER_MAX,
1058 .swappiness = vm_swappiness, 1070 .swappiness = vm_swappiness,
1059 }; 1071 };
1060 1072
1061 loop_again: 1073 loop_again:
1062 total_scanned = 0; 1074 total_scanned = 0;
1063 nr_reclaimed = 0; 1075 nr_reclaimed = 0;
1064 sc.may_writepage = !laptop_mode; 1076 sc.may_writepage = !laptop_mode;
1065 sc.nr_mapped = read_page_state(nr_mapped); 1077 sc.nr_mapped = read_page_state(nr_mapped);
1066 1078
1067 inc_page_state(pageoutrun); 1079 inc_page_state(pageoutrun);
1068 1080
1069 for (i = 0; i < pgdat->nr_zones; i++) { 1081 for (i = 0; i < pgdat->nr_zones; i++) {
1070 struct zone *zone = pgdat->node_zones + i; 1082 struct zone *zone = pgdat->node_zones + i;
1071 1083
1072 zone->temp_priority = DEF_PRIORITY; 1084 zone->temp_priority = DEF_PRIORITY;
1073 } 1085 }
1074 1086
1075 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1087 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1076 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1088 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
1077 unsigned long lru_pages = 0; 1089 unsigned long lru_pages = 0;
1078 1090
1079 /* The swap token gets in the way of swapout... */ 1091 /* The swap token gets in the way of swapout... */
1080 if (!priority) 1092 if (!priority)
1081 disable_swap_token(); 1093 disable_swap_token();
1082 1094
1083 all_zones_ok = 1; 1095 all_zones_ok = 1;
1084 1096
1085 /* 1097 /*
1086 * Scan in the highmem->dma direction for the highest 1098 * Scan in the highmem->dma direction for the highest
1087 * zone which needs scanning 1099 * zone which needs scanning
1088 */ 1100 */
1089 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1101 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1090 struct zone *zone = pgdat->node_zones + i; 1102 struct zone *zone = pgdat->node_zones + i;
1091 1103
1092 if (!populated_zone(zone)) 1104 if (!populated_zone(zone))
1093 continue; 1105 continue;
1094 1106
1095 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1107 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1096 continue; 1108 continue;
1097 1109
1098 if (!zone_watermark_ok(zone, order, zone->pages_high, 1110 if (!zone_watermark_ok(zone, order, zone->pages_high,
1099 0, 0)) { 1111 0, 0)) {
1100 end_zone = i; 1112 end_zone = i;
1101 goto scan; 1113 goto scan;
1102 } 1114 }
1103 } 1115 }
1104 goto out; 1116 goto out;
1105 scan: 1117 scan:
1106 for (i = 0; i <= end_zone; i++) { 1118 for (i = 0; i <= end_zone; i++) {
1107 struct zone *zone = pgdat->node_zones + i; 1119 struct zone *zone = pgdat->node_zones + i;
1108 1120
1109 lru_pages += zone->nr_active + zone->nr_inactive; 1121 lru_pages += zone->nr_active + zone->nr_inactive;
1110 } 1122 }
1111 1123
1112 /* 1124 /*
1113 * Now scan the zone in the dma->highmem direction, stopping 1125 * Now scan the zone in the dma->highmem direction, stopping
1114 * at the last zone which needs scanning. 1126 * at the last zone which needs scanning.
1115 * 1127 *
1116 * We do this because the page allocator works in the opposite 1128 * We do this because the page allocator works in the opposite
1117 * direction. This prevents the page allocator from allocating 1129 * direction. This prevents the page allocator from allocating
1118 * pages behind kswapd's direction of progress, which would 1130 * pages behind kswapd's direction of progress, which would
1119 * cause too much scanning of the lower zones. 1131 * cause too much scanning of the lower zones.
1120 */ 1132 */
1121 for (i = 0; i <= end_zone; i++) { 1133 for (i = 0; i <= end_zone; i++) {
1122 struct zone *zone = pgdat->node_zones + i; 1134 struct zone *zone = pgdat->node_zones + i;
1123 int nr_slab; 1135 int nr_slab;
1124 1136
1125 if (!populated_zone(zone)) 1137 if (!populated_zone(zone))
1126 continue; 1138 continue;
1127 1139
1128 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1140 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1129 continue; 1141 continue;
1130 1142
1131 if (!zone_watermark_ok(zone, order, zone->pages_high, 1143 if (!zone_watermark_ok(zone, order, zone->pages_high,
1132 end_zone, 0)) 1144 end_zone, 0))
1133 all_zones_ok = 0; 1145 all_zones_ok = 0;
1134 zone->temp_priority = priority; 1146 zone->temp_priority = priority;
1135 if (zone->prev_priority > priority) 1147 if (zone->prev_priority > priority)
1136 zone->prev_priority = priority; 1148 zone->prev_priority = priority;
1137 sc.nr_scanned = 0; 1149 sc.nr_scanned = 0;
1138 nr_reclaimed += shrink_zone(priority, zone, &sc); 1150 nr_reclaimed += shrink_zone(priority, zone, &sc);
1139 reclaim_state->reclaimed_slab = 0; 1151 reclaim_state->reclaimed_slab = 0;
1140 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1152 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1141 lru_pages); 1153 lru_pages);
1142 nr_reclaimed += reclaim_state->reclaimed_slab; 1154 nr_reclaimed += reclaim_state->reclaimed_slab;
1143 total_scanned += sc.nr_scanned; 1155 total_scanned += sc.nr_scanned;
1144 if (zone->all_unreclaimable) 1156 if (zone->all_unreclaimable)
1145 continue; 1157 continue;
1146 if (nr_slab == 0 && zone->pages_scanned >= 1158 if (nr_slab == 0 && zone->pages_scanned >=
1147 (zone->nr_active + zone->nr_inactive) * 4) 1159 (zone->nr_active + zone->nr_inactive) * 4)
1148 zone->all_unreclaimable = 1; 1160 zone->all_unreclaimable = 1;
1149 /* 1161 /*
1150 * If we've done a decent amount of scanning and 1162 * If we've done a decent amount of scanning and
1151 * the reclaim ratio is low, start doing writepage 1163 * the reclaim ratio is low, start doing writepage
1152 * even in laptop mode 1164 * even in laptop mode
1153 */ 1165 */
1154 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1166 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1155 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1167 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1156 sc.may_writepage = 1; 1168 sc.may_writepage = 1;
1157 } 1169 }
1158 if (all_zones_ok) 1170 if (all_zones_ok)
1159 break; /* kswapd: all done */ 1171 break; /* kswapd: all done */
1160 /* 1172 /*
1161 * OK, kswapd is getting into trouble. Take a nap, then take 1173 * OK, kswapd is getting into trouble. Take a nap, then take
1162 * another pass across the zones. 1174 * another pass across the zones.
1163 */ 1175 */
1164 if (total_scanned && priority < DEF_PRIORITY - 2) 1176 if (total_scanned && priority < DEF_PRIORITY - 2)
1165 blk_congestion_wait(WRITE, HZ/10); 1177 blk_congestion_wait(WRITE, HZ/10);
1166 1178
1167 /* 1179 /*
1168 * We do this so kswapd doesn't build up large priorities for 1180 * We do this so kswapd doesn't build up large priorities for
1169 * example when it is freeing in parallel with allocators. It 1181 * example when it is freeing in parallel with allocators. It
1170 * matches the direct reclaim path behaviour in terms of impact 1182 * matches the direct reclaim path behaviour in terms of impact
1171 * on zone->*_priority. 1183 * on zone->*_priority.
1172 */ 1184 */
1173 if (nr_reclaimed >= SWAP_CLUSTER_MAX) 1185 if (nr_reclaimed >= SWAP_CLUSTER_MAX)
1174 break; 1186 break;
1175 } 1187 }
1176 out: 1188 out:
1177 for (i = 0; i < pgdat->nr_zones; i++) { 1189 for (i = 0; i < pgdat->nr_zones; i++) {
1178 struct zone *zone = pgdat->node_zones + i; 1190 struct zone *zone = pgdat->node_zones + i;
1179 1191
1180 zone->prev_priority = zone->temp_priority; 1192 zone->prev_priority = zone->temp_priority;
1181 } 1193 }
1182 if (!all_zones_ok) { 1194 if (!all_zones_ok) {
1183 cond_resched(); 1195 cond_resched();
1184 goto loop_again; 1196 goto loop_again;
1185 } 1197 }
1186 1198
1187 return nr_reclaimed; 1199 return nr_reclaimed;
1188 } 1200 }
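
balance_pgdat() first walks the zones from highmem down toward DMA to find the
highest zone that is below its high watermark, then reclaims from DMA back up
to that zone, because the page allocator falls back in the opposite direction
and would otherwise keep allocating behind kswapd. A rough user-space model of
those two scan directions; struct toy_zone, the simplified
free_pages <= pages_high test and the sample numbers are invented for
illustration.

#include <stdio.h>

struct toy_zone {
        const char *name;
        unsigned long free_pages;
        unsigned long pages_high;
};

int main(void)
{
        struct toy_zone zones[] = {
                { "DMA",     800, 100 },
                { "Normal",  150, 400 },   /* below its high watermark */
                { "HighMem", 900, 200 },
        };
        int nr_zones = (int)(sizeof(zones) / sizeof(zones[0]));
        int i, end_zone = -1;

        /* Pass 1: highest zone (top down) that is short on free pages. */
        for (i = nr_zones - 1; i >= 0; i--) {
                if (zones[i].free_pages <= zones[i].pages_high) {
                        end_zone = i;
                        break;
                }
        }
        if (end_zone < 0) {
                printf("all zones ok, nothing to do\n");
                return 0;
        }

        /* Pass 2: reclaim bottom up so the allocator cannot race ahead. */
        for (i = 0; i <= end_zone; i++)
                printf("would shrink zone %s\n", zones[i].name);
        return 0;
}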
1189 1201
1190 /* 1202 /*
1191 * The background pageout daemon, started as a kernel thread 1203 * The background pageout daemon, started as a kernel thread
1192 * from the init process. 1204 * from the init process.
1193 * 1205 *
1194 * This basically trickles out pages so that we have _some_ 1206 * This basically trickles out pages so that we have _some_
1195 * free memory available even if there is no other activity 1207 * free memory available even if there is no other activity
1196 * that frees anything up. This is needed for things like routing 1208 * that frees anything up. This is needed for things like routing
1197 * etc, where we otherwise might have all activity going on in 1209 * etc, where we otherwise might have all activity going on in
1198 * asynchronous contexts that cannot page things out. 1210 * asynchronous contexts that cannot page things out.
1199 * 1211 *
1200 * If there are applications that are active memory-allocators 1212 * If there are applications that are active memory-allocators
1201 * (most normal use), this basically shouldn't matter. 1213 * (most normal use), this basically shouldn't matter.
1202 */ 1214 */
1203 static int kswapd(void *p) 1215 static int kswapd(void *p)
1204 { 1216 {
1205 unsigned long order; 1217 unsigned long order;
1206 pg_data_t *pgdat = (pg_data_t*)p; 1218 pg_data_t *pgdat = (pg_data_t*)p;
1207 struct task_struct *tsk = current; 1219 struct task_struct *tsk = current;
1208 DEFINE_WAIT(wait); 1220 DEFINE_WAIT(wait);
1209 struct reclaim_state reclaim_state = { 1221 struct reclaim_state reclaim_state = {
1210 .reclaimed_slab = 0, 1222 .reclaimed_slab = 0,
1211 }; 1223 };
1212 cpumask_t cpumask; 1224 cpumask_t cpumask;
1213 1225
1214 daemonize("kswapd%d", pgdat->node_id); 1226 daemonize("kswapd%d", pgdat->node_id);
1215 cpumask = node_to_cpumask(pgdat->node_id); 1227 cpumask = node_to_cpumask(pgdat->node_id);
1216 if (!cpus_empty(cpumask)) 1228 if (!cpus_empty(cpumask))
1217 set_cpus_allowed(tsk, cpumask); 1229 set_cpus_allowed(tsk, cpumask);
1218 current->reclaim_state = &reclaim_state; 1230 current->reclaim_state = &reclaim_state;
1219 1231
1220 /* 1232 /*
1221 * Tell the memory management that we're a "memory allocator", 1233 * Tell the memory management that we're a "memory allocator",
1222 * and that if we need more memory we should get access to it 1234 * and that if we need more memory we should get access to it
1223 * regardless (see "__alloc_pages()"). "kswapd" should 1235 * regardless (see "__alloc_pages()"). "kswapd" should
1224 * never get caught in the normal page freeing logic. 1236 * never get caught in the normal page freeing logic.
1225 * 1237 *
1226 * (Kswapd normally doesn't need memory anyway, but sometimes 1238 * (Kswapd normally doesn't need memory anyway, but sometimes
1227 * you need a small amount of memory in order to be able to 1239 * you need a small amount of memory in order to be able to
1228 * page out something else, and this flag essentially protects 1240 * page out something else, and this flag essentially protects
1229 * us from recursively trying to free more memory as we're 1241 * us from recursively trying to free more memory as we're
1230 * trying to free the first piece of memory in the first place). 1242 * trying to free the first piece of memory in the first place).
1231 */ 1243 */
1232 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 1244 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1233 1245
1234 order = 0; 1246 order = 0;
1235 for ( ; ; ) { 1247 for ( ; ; ) {
1236 unsigned long new_order; 1248 unsigned long new_order;
1237 1249
1238 try_to_freeze(); 1250 try_to_freeze();
1239 1251
1240 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 1252 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
1241 new_order = pgdat->kswapd_max_order; 1253 new_order = pgdat->kswapd_max_order;
1242 pgdat->kswapd_max_order = 0; 1254 pgdat->kswapd_max_order = 0;
1243 if (order < new_order) { 1255 if (order < new_order) {
1244 /* 1256 /*
1245 * Don't sleep if someone wants a larger 'order' 1257 * Don't sleep if someone wants a larger 'order'
1246 * allocation 1258 * allocation
1247 */ 1259 */
1248 order = new_order; 1260 order = new_order;
1249 } else { 1261 } else {
1250 schedule(); 1262 schedule();
1251 order = pgdat->kswapd_max_order; 1263 order = pgdat->kswapd_max_order;
1252 } 1264 }
1253 finish_wait(&pgdat->kswapd_wait, &wait); 1265 finish_wait(&pgdat->kswapd_wait, &wait);
1254 1266
1255 balance_pgdat(pgdat, order); 1267 balance_pgdat(pgdat, order);
1256 } 1268 }
1257 return 0; 1269 return 0;
1258 } 1270 }
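
The wait loop in kswapd() only goes back to sleep when nobody has asked for a
larger allocation order since the last balancing run; otherwise it keeps going
at the bigger order immediately. A toy model of just that decision, driven by
a made-up sequence of requested orders.

#include <stdio.h>

int main(void)
{
        unsigned long order = 0, max_order_requests[] = { 0, 3, 1, 0 };
        int i;

        for (i = 0; i < 4; i++) {
                unsigned long new_order = max_order_requests[i];

                if (order < new_order) {
                        /* Larger request arrived while working: skip the sleep. */
                        order = new_order;
                        printf("request %d: balance at order %lu without sleeping\n",
                               i, order);
                } else {
                        /* Nothing bigger pending: sleep, then pick up the request. */
                        order = new_order;
                        printf("request %d: sleep, then balance at order %lu\n",
                               i, order);
                }
        }
        return 0;
}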
1259 1271
1260 /* 1272 /*
1261 * A zone is low on free memory, so wake its kswapd task to service it. 1273 * A zone is low on free memory, so wake its kswapd task to service it.
1262 */ 1274 */
1263 void wakeup_kswapd(struct zone *zone, int order) 1275 void wakeup_kswapd(struct zone *zone, int order)
1264 { 1276 {
1265 pg_data_t *pgdat; 1277 pg_data_t *pgdat;
1266 1278
1267 if (!populated_zone(zone)) 1279 if (!populated_zone(zone))
1268 return; 1280 return;
1269 1281
1270 pgdat = zone->zone_pgdat; 1282 pgdat = zone->zone_pgdat;
1271 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) 1283 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
1272 return; 1284 return;
1273 if (pgdat->kswapd_max_order < order) 1285 if (pgdat->kswapd_max_order < order)
1274 pgdat->kswapd_max_order = order; 1286 pgdat->kswapd_max_order = order;
1275 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1287 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1276 return; 1288 return;
1277 if (!waitqueue_active(&pgdat->kswapd_wait)) 1289 if (!waitqueue_active(&pgdat->kswapd_wait))
1278 return; 1290 return;
1279 wake_up_interruptible(&pgdat->kswapd_wait); 1291 wake_up_interruptible(&pgdat->kswapd_wait);
1280 } 1292 }
1281 1293
1282 #ifdef CONFIG_PM 1294 #ifdef CONFIG_PM
1283 /* 1295 /*
1284 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 1296 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
1285 * from LRU lists system-wide, for given pass and priority, and returns the 1297 * from LRU lists system-wide, for given pass and priority, and returns the
1286 * number of reclaimed pages 1298 * number of reclaimed pages
1287 * 1299 *
1288 * For pass > 3 we also try to shrink the LRU lists that contain a few pages 1300 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
1289 */ 1301 */
1290 static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, 1302 static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
1291 int prio, struct scan_control *sc) 1303 int prio, struct scan_control *sc)
1292 { 1304 {
1293 struct zone *zone; 1305 struct zone *zone;
1294 unsigned long nr_to_scan, ret = 0; 1306 unsigned long nr_to_scan, ret = 0;
1295 1307
1296 for_each_zone(zone) { 1308 for_each_zone(zone) {
1297 1309
1298 if (!populated_zone(zone)) 1310 if (!populated_zone(zone))
1299 continue; 1311 continue;
1300 1312
1301 if (zone->all_unreclaimable && prio != DEF_PRIORITY) 1313 if (zone->all_unreclaimable && prio != DEF_PRIORITY)
1302 continue; 1314 continue;
1303 1315
1304 /* For pass = 0 we don't shrink the active list */ 1316 /* For pass = 0 we don't shrink the active list */
1305 if (pass > 0) { 1317 if (pass > 0) {
1306 zone->nr_scan_active += (zone->nr_active >> prio) + 1; 1318 zone->nr_scan_active += (zone->nr_active >> prio) + 1;
1307 if (zone->nr_scan_active >= nr_pages || pass > 3) { 1319 if (zone->nr_scan_active >= nr_pages || pass > 3) {
1308 zone->nr_scan_active = 0; 1320 zone->nr_scan_active = 0;
1309 nr_to_scan = min(nr_pages, zone->nr_active); 1321 nr_to_scan = min(nr_pages, zone->nr_active);
1310 shrink_active_list(nr_to_scan, zone, sc); 1322 shrink_active_list(nr_to_scan, zone, sc);
1311 } 1323 }
1312 } 1324 }
1313 1325
1314 zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1; 1326 zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
1315 if (zone->nr_scan_inactive >= nr_pages || pass > 3) { 1327 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
1316 zone->nr_scan_inactive = 0; 1328 zone->nr_scan_inactive = 0;
1317 nr_to_scan = min(nr_pages, zone->nr_inactive); 1329 nr_to_scan = min(nr_pages, zone->nr_inactive);
1318 ret += shrink_inactive_list(nr_to_scan, zone, sc); 1330 ret += shrink_inactive_list(nr_to_scan, zone, sc);
1319 if (ret >= nr_pages) 1331 if (ret >= nr_pages)
1320 return ret; 1332 return ret;
1321 } 1333 }
1322 } 1334 }
1323 1335
1324 return ret; 1336 return ret;
1325 } 1337 }
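
shrink_all_zones() batches its work: each call adds (list size >> prio) + 1 to
a per-zone scan counter and only scans once that counter reaches nr_pages, or
unconditionally for pass > 3. The sketch below, with invented list sizes,
shows how the counter builds up as prio drops from DEF_PRIORITY toward zero;
the + 1 guarantees forward progress even when the shifted term rounds to
nothing.

#include <stdio.h>

int main(void)
{
        unsigned long nr_inactive = 10000, nr_pages = 1000;
        unsigned long nr_scan = 0;
        int prio, calls = 0;

        for (prio = 12; prio >= 0; prio--) {    /* DEF_PRIORITY down to 0 */
                nr_scan += (nr_inactive >> prio) + 1;
                calls++;
                if (nr_scan >= nr_pages) {
                        printf("scan triggered at prio %d after %d calls "
                               "(accumulated %lu)\n", prio, calls, nr_scan);
                        nr_scan = 0;
                }
        }
        return 0;
}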
1326 1338
1327 /* 1339 /*
1328 * Try to free `nr_pages' of memory, system-wide, and return the number of 1340 * Try to free `nr_pages' of memory, system-wide, and return the number of
1329 * freed pages. 1341 * freed pages.
1330 * 1342 *
1331 * Rather than trying to age LRUs the aim is to preserve the overall 1343 * Rather than trying to age LRUs the aim is to preserve the overall
1332 * LRU order by reclaiming preferentially 1344 * LRU order by reclaiming preferentially
1333 * inactive > active > active referenced > active mapped 1345 * inactive > active > active referenced > active mapped
1334 */ 1346 */
1335 unsigned long shrink_all_memory(unsigned long nr_pages) 1347 unsigned long shrink_all_memory(unsigned long nr_pages)
1336 { 1348 {
1337 unsigned long lru_pages, nr_slab; 1349 unsigned long lru_pages, nr_slab;
1338 unsigned long ret = 0; 1350 unsigned long ret = 0;
1339 int pass; 1351 int pass;
1340 struct reclaim_state reclaim_state; 1352 struct reclaim_state reclaim_state;
1341 struct zone *zone; 1353 struct zone *zone;
1342 struct scan_control sc = { 1354 struct scan_control sc = {
1343 .gfp_mask = GFP_KERNEL, 1355 .gfp_mask = GFP_KERNEL,
1344 .may_swap = 0, 1356 .may_swap = 0,
1345 .swap_cluster_max = nr_pages, 1357 .swap_cluster_max = nr_pages,
1346 .may_writepage = 1, 1358 .may_writepage = 1,
1347 .swappiness = vm_swappiness, 1359 .swappiness = vm_swappiness,
1348 }; 1360 };
1349 1361
1350 current->reclaim_state = &reclaim_state; 1362 current->reclaim_state = &reclaim_state;
1351 1363
1352 lru_pages = 0; 1364 lru_pages = 0;
1353 for_each_zone(zone) 1365 for_each_zone(zone)
1354 lru_pages += zone->nr_active + zone->nr_inactive; 1366 lru_pages += zone->nr_active + zone->nr_inactive;
1355 1367
1356 nr_slab = read_page_state(nr_slab); 1368 nr_slab = read_page_state(nr_slab);
1357 /* If slab caches are huge, it's better to hit them first */ 1369 /* If slab caches are huge, it's better to hit them first */
1358 while (nr_slab >= lru_pages) { 1370 while (nr_slab >= lru_pages) {
1359 reclaim_state.reclaimed_slab = 0; 1371 reclaim_state.reclaimed_slab = 0;
1360 shrink_slab(nr_pages, sc.gfp_mask, lru_pages); 1372 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
1361 if (!reclaim_state.reclaimed_slab) 1373 if (!reclaim_state.reclaimed_slab)
1362 break; 1374 break;
1363 1375
1364 ret += reclaim_state.reclaimed_slab; 1376 ret += reclaim_state.reclaimed_slab;
1365 if (ret >= nr_pages) 1377 if (ret >= nr_pages)
1366 goto out; 1378 goto out;
1367 1379
1368 nr_slab -= reclaim_state.reclaimed_slab; 1380 nr_slab -= reclaim_state.reclaimed_slab;
1369 } 1381 }
1370 1382
1371 /* 1383 /*
1372 * We try to shrink LRUs in 5 passes: 1384 * We try to shrink LRUs in 5 passes:
1373 * 0 = Reclaim from inactive_list only 1385 * 0 = Reclaim from inactive_list only
1374 * 1 = Reclaim from active list but don't reclaim mapped 1386 * 1 = Reclaim from active list but don't reclaim mapped
1375 * 2 = 2nd pass of type 1 1387 * 2 = 2nd pass of type 1
1376 * 3 = Reclaim mapped (normal reclaim) 1388 * 3 = Reclaim mapped (normal reclaim)
1377 * 4 = 2nd pass of type 3 1389 * 4 = 2nd pass of type 3
1378 */ 1390 */
1379 for (pass = 0; pass < 5; pass++) { 1391 for (pass = 0; pass < 5; pass++) {
1380 int prio; 1392 int prio;
1381 1393
1382 /* Needed for shrinking slab caches later on */ 1394 /* Needed for shrinking slab caches later on */
1383 if (!lru_pages) 1395 if (!lru_pages)
1384 for_each_zone(zone) { 1396 for_each_zone(zone) {
1385 lru_pages += zone->nr_active; 1397 lru_pages += zone->nr_active;
1386 lru_pages += zone->nr_inactive; 1398 lru_pages += zone->nr_inactive;
1387 } 1399 }
1388 1400
1389 /* Force reclaiming mapped pages in the passes #3 and #4 */ 1401 /* Force reclaiming mapped pages in the passes #3 and #4 */
1390 if (pass > 2) { 1402 if (pass > 2) {
1391 sc.may_swap = 1; 1403 sc.may_swap = 1;
1392 sc.swappiness = 100; 1404 sc.swappiness = 100;
1393 } 1405 }
1394 1406
1395 for (prio = DEF_PRIORITY; prio >= 0; prio--) { 1407 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
1396 unsigned long nr_to_scan = nr_pages - ret; 1408 unsigned long nr_to_scan = nr_pages - ret;
1397 1409
1398 sc.nr_mapped = read_page_state(nr_mapped); 1410 sc.nr_mapped = read_page_state(nr_mapped);
1399 sc.nr_scanned = 0; 1411 sc.nr_scanned = 0;
1400 1412
1401 ret += shrink_all_zones(nr_to_scan, pass, prio, &sc); 1413 ret += shrink_all_zones(nr_to_scan, pass, prio, &sc);

1402 if (ret >= nr_pages) 1414 if (ret >= nr_pages)
1403 goto out; 1415 goto out;
1404 1416
1405 reclaim_state.reclaimed_slab = 0; 1417 reclaim_state.reclaimed_slab = 0;
1406 shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); 1418 shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
1407 ret += reclaim_state.reclaimed_slab; 1419 ret += reclaim_state.reclaimed_slab;
1408 if (ret >= nr_pages) 1420 if (ret >= nr_pages)
1409 goto out; 1421 goto out;
1410 1422
1411 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 1423 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
1412 blk_congestion_wait(WRITE, HZ / 10); 1424 blk_congestion_wait(WRITE, HZ / 10);
1413 } 1425 }
1414 1426
1415 lru_pages = 0; 1427 lru_pages = 0;
1416 } 1428 }
1417 1429
1418 /* 1430 /*
1419 * If ret = 0, we could not shrink LRUs, but there may be something 1431 * If ret = 0, we could not shrink LRUs, but there may be something
1420 * in slab caches 1432 * in slab caches
1421 */ 1433 */
1422 if (!ret) 1434 if (!ret)
1423 do { 1435 do {
1424 reclaim_state.reclaimed_slab = 0; 1436 reclaim_state.reclaimed_slab = 0;
1425 shrink_slab(nr_pages, sc.gfp_mask, lru_pages); 1437 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
1426 ret += reclaim_state.reclaimed_slab; 1438 ret += reclaim_state.reclaimed_slab;
1427 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 1439 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1428 1440
1429 out: 1441 out:
1430 current->reclaim_state = NULL; 1442 current->reclaim_state = NULL;
1431 1443
1432 return ret; 1444 return ret;
1433 } 1445 }
1434 #endif 1446 #endif
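
shrink_all_memory() escalates over its five passes: pass 0 touches only the
inactive lists, passes 1 and 2 add the active lists but leave mapped pages
alone, and passes 3 and 4 set may_swap and swappiness = 100 so mapped pages
are reclaimed too, with pass 4 additionally ignoring the scan counters. A
compact model that just prints the per-pass switches; the variable names are
invented.

#include <stdio.h>

int main(void)
{
        int pass;

        for (pass = 0; pass < 5; pass++) {
                int shrink_active = pass > 0;    /* touch the active lists      */
                int reclaim_mapped = pass > 2;   /* may_swap, swappiness = 100  */
                int force_full_scan = pass > 3;  /* ignore the scan counters    */

                printf("pass %d: active=%d mapped=%d force=%d\n",
                       pass, shrink_active, reclaim_mapped, force_full_scan);
        }
        return 0;
}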
1435 1447
1436 #ifdef CONFIG_HOTPLUG_CPU 1448 #ifdef CONFIG_HOTPLUG_CPU
1437 /* It's optimal to keep kswapds on the same CPUs as their memory, but 1449 /* It's optimal to keep kswapds on the same CPUs as their memory, but
1438 not required for correctness. So if the last cpu in a node goes 1450 not required for correctness. So if the last cpu in a node goes
1439 away, we get changed to run anywhere: as the first one comes back, 1451 away, we get changed to run anywhere: as the first one comes back,
1440 restore their cpu bindings. */ 1452 restore their cpu bindings. */
1441 static int cpu_callback(struct notifier_block *nfb, 1453 static int cpu_callback(struct notifier_block *nfb,
1442 unsigned long action, void *hcpu) 1454 unsigned long action, void *hcpu)
1443 { 1455 {
1444 pg_data_t *pgdat; 1456 pg_data_t *pgdat;
1445 cpumask_t mask; 1457 cpumask_t mask;
1446 1458
1447 if (action == CPU_ONLINE) { 1459 if (action == CPU_ONLINE) {
1448 for_each_online_pgdat(pgdat) { 1460 for_each_online_pgdat(pgdat) {
1449 mask = node_to_cpumask(pgdat->node_id); 1461 mask = node_to_cpumask(pgdat->node_id);
1450 if (any_online_cpu(mask) != NR_CPUS) 1462 if (any_online_cpu(mask) != NR_CPUS)
1451 /* One of our CPUs online: restore mask */ 1463 /* One of our CPUs online: restore mask */
1452 set_cpus_allowed(pgdat->kswapd, mask); 1464 set_cpus_allowed(pgdat->kswapd, mask);
1453 } 1465 }
1454 } 1466 }
1455 return NOTIFY_OK; 1467 return NOTIFY_OK;
1456 } 1468 }
1457 #endif /* CONFIG_HOTPLUG_CPU */ 1469 #endif /* CONFIG_HOTPLUG_CPU */
1458 1470
1459 static int __init kswapd_init(void) 1471 static int __init kswapd_init(void)
1460 { 1472 {
1461 pg_data_t *pgdat; 1473 pg_data_t *pgdat;
1462 1474
1463 swap_setup(); 1475 swap_setup();
1464 for_each_online_pgdat(pgdat) { 1476 for_each_online_pgdat(pgdat) {
1465 pid_t pid; 1477 pid_t pid;
1466 1478
1467 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); 1479 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
1468 BUG_ON(pid < 0); 1480 BUG_ON(pid < 0);
1469 read_lock(&tasklist_lock); 1481 read_lock(&tasklist_lock);
1470 pgdat->kswapd = find_task_by_pid(pid); 1482 pgdat->kswapd = find_task_by_pid(pid);
1471 read_unlock(&tasklist_lock); 1483 read_unlock(&tasklist_lock);
1472 } 1484 }
1473 total_memory = nr_free_pagecache_pages(); 1485 total_memory = nr_free_pagecache_pages();
1474 hotcpu_notifier(cpu_callback, 0); 1486 hotcpu_notifier(cpu_callback, 0);
1475 return 0; 1487 return 0;
1476 } 1488 }
1477 1489
1478 module_init(kswapd_init) 1490 module_init(kswapd_init)
1479 1491
1480 #ifdef CONFIG_NUMA 1492 #ifdef CONFIG_NUMA
1481 /* 1493 /*
1482 * Zone reclaim mode 1494 * Zone reclaim mode
1483 * 1495 *
1484 * If non-zero call zone_reclaim when the number of free pages falls below 1496 * If non-zero call zone_reclaim when the number of free pages falls below
1485 * the watermarks. 1497 * the watermarks.
1486 * 1498 *
1487 * In the future we may add flags to the mode. However, the page allocator 1499 * In the future we may add flags to the mode. However, the page allocator
1488 * should only have to check that zone_reclaim_mode != 0 before calling 1500 * should only have to check that zone_reclaim_mode != 0 before calling
1489 * zone_reclaim(). 1501 * zone_reclaim().
1490 */ 1502 */
1491 int zone_reclaim_mode __read_mostly; 1503 int zone_reclaim_mode __read_mostly;
1492 1504
1493 #define RECLAIM_OFF 0 1505 #define RECLAIM_OFF 0
1494 #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1506 #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
1495 #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1507 #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1496 #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1508 #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1497 #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ 1509 #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
1498 1510
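
These mode bits map almost directly onto the scan_control that __zone_reclaim()
builds below: RECLAIM_WRITE enables may_writepage, RECLAIM_SWAP enables
may_swap, and RECLAIM_SLAB adds a shrink_slab() call when the zone scan alone
did not free enough pages. A self-contained sketch of that decoding;
parse_mode() and the sample modes are illustrative only.

#include <stdio.h>

#define RECLAIM_ZONE  (1 << 0)
#define RECLAIM_WRITE (1 << 1)
#define RECLAIM_SWAP  (1 << 2)
#define RECLAIM_SLAB  (1 << 3)

static void parse_mode(unsigned int mode)
{
        printf("mode %#x: may_writepage=%d may_swap=%d shrink_slab=%d\n",
               mode, !!(mode & RECLAIM_WRITE), !!(mode & RECLAIM_SWAP),
               !!(mode & RECLAIM_SLAB));
}

int main(void)
{
        parse_mode(RECLAIM_ZONE);
        parse_mode(RECLAIM_ZONE | RECLAIM_WRITE | RECLAIM_SWAP);
        return 0;
}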
1499 /* 1511 /*
1500 * Minimum time between zone reclaim scans 1512 * Minimum time between zone reclaim scans
1501 */ 1513 */
1502 int zone_reclaim_interval __read_mostly = 30*HZ; 1514 int zone_reclaim_interval __read_mostly = 30*HZ;
1503 1515
1504 /* 1516 /*
1505 * Priority for ZONE_RECLAIM. This determines the fraction of pages 1517 * Priority for ZONE_RECLAIM. This determines the fraction of pages
1506 * of a node considered for each zone_reclaim. 4 scans 1/16th of 1518 * of a node considered for each zone_reclaim. 4 scans 1/16th of
1507 * a zone. 1519 * a zone.
1508 */ 1520 */
1509 #define ZONE_RECLAIM_PRIORITY 4 1521 #define ZONE_RECLAIM_PRIORITY 4
1510 1522
1511 /* 1523 /*
1512 * Try to free up some pages from this zone through reclaim. 1524 * Try to free up some pages from this zone through reclaim.
1513 */ 1525 */
1514 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1526 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1515 { 1527 {
1516 /* Minimum pages needed in order to stay on node */ 1528 /* Minimum pages needed in order to stay on node */
1517 const unsigned long nr_pages = 1 << order; 1529 const unsigned long nr_pages = 1 << order;
1518 struct task_struct *p = current; 1530 struct task_struct *p = current;
1519 struct reclaim_state reclaim_state; 1531 struct reclaim_state reclaim_state;
1520 int priority; 1532 int priority;
1521 unsigned long nr_reclaimed = 0; 1533 unsigned long nr_reclaimed = 0;
1522 struct scan_control sc = { 1534 struct scan_control sc = {
1523 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 1535 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
1524 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 1536 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
1525 .nr_mapped = read_page_state(nr_mapped), 1537 .nr_mapped = read_page_state(nr_mapped),
1526 .swap_cluster_max = max_t(unsigned long, nr_pages, 1538 .swap_cluster_max = max_t(unsigned long, nr_pages,
1527 SWAP_CLUSTER_MAX), 1539 SWAP_CLUSTER_MAX),
1528 .gfp_mask = gfp_mask, 1540 .gfp_mask = gfp_mask,
1529 .swappiness = vm_swappiness, 1541 .swappiness = vm_swappiness,
1530 }; 1542 };
1531 1543
1532 disable_swap_token(); 1544 disable_swap_token();
1533 cond_resched(); 1545 cond_resched();
1534 /* 1546 /*
1535 * We need to be able to allocate from the reserves for RECLAIM_SWAP 1547 * We need to be able to allocate from the reserves for RECLAIM_SWAP
1536 * and we also need to be able to write out pages for RECLAIM_WRITE 1548 * and we also need to be able to write out pages for RECLAIM_WRITE
1537 * and RECLAIM_SWAP. 1549 * and RECLAIM_SWAP.
1538 */ 1550 */
1539 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 1551 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
1540 reclaim_state.reclaimed_slab = 0; 1552 reclaim_state.reclaimed_slab = 0;
1541 p->reclaim_state = &reclaim_state; 1553 p->reclaim_state = &reclaim_state;
1542 1554
1543 /* 1555 /*
1544 * Free memory by calling shrink zone with increasing priorities 1556 * Free memory by calling shrink zone with increasing priorities
1545 * until we have enough memory freed. 1557 * until we have enough memory freed.
1546 */ 1558 */
1547 priority = ZONE_RECLAIM_PRIORITY; 1559 priority = ZONE_RECLAIM_PRIORITY;
1548 do { 1560 do {
1549 nr_reclaimed += shrink_zone(priority, zone, &sc); 1561 nr_reclaimed += shrink_zone(priority, zone, &sc);
1550 priority--; 1562 priority--;
1551 } while (priority >= 0 && nr_reclaimed < nr_pages); 1563 } while (priority >= 0 && nr_reclaimed < nr_pages);
1552 1564
1553 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { 1565 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1554 /* 1566 /*
1555 * shrink_slab() does not currently allow us to determine how 1567 * shrink_slab() does not currently allow us to determine how
1556 * many pages were freed in this zone. So we just shake the slab 1568 * many pages were freed in this zone. So we just shake the slab
1557 * a bit and then go off node for this particular allocation 1569 * a bit and then go off node for this particular allocation
1558 * despite possibly having freed enough memory to allocate in 1570 * despite possibly having freed enough memory to allocate in
1559 * this zone. If we freed local memory then the next 1571 * this zone. If we freed local memory then the next
1560 * allocations will be local again. 1572 * allocations will be local again.
1561 * 1573 *
1562 * shrink_slab will free memory on all zones and may take 1574 * shrink_slab will free memory on all zones and may take
1563 * a long time. 1575 * a long time.
1564 */ 1576 */
1565 shrink_slab(sc.nr_scanned, gfp_mask, order); 1577 shrink_slab(sc.nr_scanned, gfp_mask, order);
1566 } 1578 }
1567 1579
1568 p->reclaim_state = NULL; 1580 p->reclaim_state = NULL;
1569 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 1581 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
1570 1582
1571 if (nr_reclaimed == 0) { 1583 if (nr_reclaimed == 0) {
1572 /* 1584 /*
1573 * We were unable to reclaim enough pages to stay on node. We 1585 * We were unable to reclaim enough pages to stay on node. We
1574 * now allow off node accesses for a certain time period before 1586 * now allow off node accesses for a certain time period before
1575 * trying again to reclaim pages from the local zone. 1587 * trying again to reclaim pages from the local zone.
1576 */ 1588 */
1577 zone->last_unsuccessful_zone_reclaim = jiffies; 1589 zone->last_unsuccessful_zone_reclaim = jiffies;
1578 } 1590 }
1579 1591
1580 return nr_reclaimed >= nr_pages; 1592 return nr_reclaimed >= nr_pages;
1581 } 1593 }
1582 1594
1583 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1595 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1584 { 1596 {
1585 cpumask_t mask; 1597 cpumask_t mask;
1586 int node_id; 1598 int node_id;
1587 1599
1588 /* 1600 /*
1589 * Do not reclaim if there was a recent unsuccessful attempt at zone 1601 * Do not reclaim if there was a recent unsuccessful attempt at zone
1590 * reclaim. In that case we let allocations go off node for the 1602 * reclaim. In that case we let allocations go off node for the
1591 * zone_reclaim_interval. Otherwise we would scan for each off-node 1603 * zone_reclaim_interval. Otherwise we would scan for each off-node
1592 * page allocation. 1604 * page allocation.
1593 */ 1605 */
1594 if (time_before(jiffies, 1606 if (time_before(jiffies,
1595 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) 1607 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
1596 return 0; 1608 return 0;
1597 1609
1598 /* 1610 /*
1599 * Avoid concurrent zone reclaims, do not reclaim in a zone that does 1611 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
1600 * not have reclaimable pages and if we should not delay the allocation 1612 * not have reclaimable pages and if we should not delay the allocation
1601 * then do not scan. 1613 * then do not scan.
1602 */ 1614 */
1603 if (!(gfp_mask & __GFP_WAIT) || 1615 if (!(gfp_mask & __GFP_WAIT) ||
1604 zone->all_unreclaimable || 1616 zone->all_unreclaimable ||
1605 atomic_read(&zone->reclaim_in_progress) > 0 || 1617 atomic_read(&zone->reclaim_in_progress) > 0 ||
1606 (current->flags & PF_MEMALLOC)) 1618 (current->flags & PF_MEMALLOC))
1607 return 0; 1619 return 0;
1608 1620
1609 /* 1621 /*
1610 * Only run zone reclaim on the local zone or on zones that do not 1622 * Only run zone reclaim on the local zone or on zones that do not
1611 * have associated processors. This will favor the local processor 1623 * have associated processors. This will favor the local processor
1612 * over remote processors and spread off node memory allocations 1624 * over remote processors and spread off node memory allocations
1613 * as wide as possible. 1625 * as wide as possible.
1614 */ 1626 */
1615 node_id = zone->zone_pgdat->node_id; 1627 node_id = zone->zone_pgdat->node_id;
1616 mask = node_to_cpumask(node_id); 1628 mask = node_to_cpumask(node_id);
1617 if (!cpus_empty(mask) && node_id != numa_node_id()) 1629 if (!cpus_empty(mask) && node_id != numa_node_id())
1618 return 0; 1630 return 0;
1619 return __zone_reclaim(zone, gfp_mask, order); 1631 return __zone_reclaim(zone, gfp_mask, order);
1620 } 1632 }
1621 #endif 1633 #endif
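
A reclaim attempt that frees nothing stamps last_unsuccessful_zone_reclaim, and
zone_reclaim() then refuses to scan again for zone_reclaim_interval (30*HZ),
letting allocations go off node instead of rescanning on every miss. A small
sketch of that back-off, assuming only a jiffies-like tick counter; TOY_HZ and
the helper names are invented, and toy_time_before() mirrors the
signed-difference comparison that time_before() relies on.

#include <stdio.h>

#define TOY_HZ 1000UL
static unsigned long reclaim_interval = 30 * TOY_HZ;    /* 30 "seconds" */
static unsigned long last_failed_reclaim;

static int toy_time_before(unsigned long a, unsigned long b)
{
        return (long)(a - b) < 0;       /* safe across counter wraparound */
}

static int may_try_zone_reclaim(unsigned long now)
{
        return !toy_time_before(now, last_failed_reclaim + reclaim_interval);
}

int main(void)
{
        last_failed_reclaim = 5000;
        printf("at 10000: %d\n", may_try_zone_reclaim(10000));  /* still backing off */
        printf("at 40000: %d\n", may_try_zone_reclaim(40000));  /* interval elapsed  */
        return 0;
}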
1622 1634