Commit 04e62a29bf157ce1edd168f2b71b533c80d13628
Committed by Linus Torvalds
1 parent 442c9137de
[PATCH] More page migration: use migration entries for file pages
This implements the use of migration entries to preserve ptes of file backed pages during migration. Processes can therefore be migrated back and forth without losing their connection to pagecache pages.

Note that we implement the migration entries only for linear mappings. Nonlinear mappings still require the unmapping of the ptes for migration.

And another writepage() ugliness shows up. writepage() can drop the page lock. Therefore we have to remove migration ptes before calling writepages() in order to avoid having migration entries point to unlocked pages.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
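For context (not part of the commit itself): a migration entry is simply a swap entry whose type is one of the two slots reserved beyond MAX_SWAPFILES, so it can sit in a pte while the page is being moved and later be turned back into a present pte by remove_migration_ptes(). The sketch below is a minimal, userspace-only illustration of that encoding, assuming the 5/27 type/offset split described in the swap.h comment; the helper names mirror the kernel's swapops.h helpers but this is an illustration, not the kernel implementation.

/*
 * Minimal userspace sketch of the migration-entry encoding used by this
 * patch series.  The 5/27 type/offset split and the SWP_MIGRATION_* slots
 * follow include/linux/swap.h; the helpers mirror those in swapops.h but
 * are simplified for illustration.
 */
#include <stdio.h>

#define MAX_SWAPFILES_SHIFT	5
#define MAX_SWAPFILES		((1UL << MAX_SWAPFILES_SHIFT) - 2)
#define SWP_MIGRATION_READ	MAX_SWAPFILES		/* type 30 */
#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + 1)	/* type 31 */
#define SWP_OFFSET_BITS		27			/* assumed 32-bit pgoff_t layout */

typedef struct { unsigned long val; } swp_entry_t;

/* Pack a (type, offset) pair into one unsigned long, the way a pte would. */
static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
	swp_entry_t e = { (type << SWP_OFFSET_BITS) | offset };
	return e;
}

static unsigned long swp_type(swp_entry_t e)
{
	return e.val >> SWP_OFFSET_BITS;
}

static unsigned long swp_offset(swp_entry_t e)
{
	return e.val & ((1UL << SWP_OFFSET_BITS) - 1);
}

/* A migration entry is any swap entry using one of the two reserved types. */
static int is_migration_entry(swp_entry_t e)
{
	return swp_type(e) == SWP_MIGRATION_READ ||
	       swp_type(e) == SWP_MIGRATION_WRITE;
}

static int is_write_migration_entry(swp_entry_t e)
{
	return swp_type(e) == SWP_MIGRATION_WRITE;
}

int main(void)
{
	/* "Page at pfn 0x1234 is under migration; the old pte was writable." */
	swp_entry_t e = swp_entry(SWP_MIGRATION_WRITE, 0x1234);

	printf("migration entry: %d  writable: %d  pfn: 0x%lx\n",
	       is_migration_entry(e), is_write_migration_entry(e),
	       swp_offset(e));
	return 0;
}

In the diff below, remove_migration_pte() performs the reverse step: it checks is_migration_entry() and is_write_migration_entry() and rebuilds a present pte pointing at the new page, adding it to the anon or file rmap as appropriate.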
Showing 4 changed files with 124 additions and 43 deletions Inline Diff
include/linux/swap.h
1 | #ifndef _LINUX_SWAP_H | 1 | #ifndef _LINUX_SWAP_H |
2 | #define _LINUX_SWAP_H | 2 | #define _LINUX_SWAP_H |
3 | 3 | ||
4 | #include <linux/spinlock.h> | 4 | #include <linux/spinlock.h> |
5 | #include <linux/linkage.h> | 5 | #include <linux/linkage.h> |
6 | #include <linux/mmzone.h> | 6 | #include <linux/mmzone.h> |
7 | #include <linux/list.h> | 7 | #include <linux/list.h> |
8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
9 | 9 | ||
10 | #include <asm/atomic.h> | 10 | #include <asm/atomic.h> |
11 | #include <asm/page.h> | 11 | #include <asm/page.h> |
12 | 12 | ||
13 | #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ | 13 | #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ |
14 | #define SWAP_FLAG_PRIO_MASK 0x7fff | 14 | #define SWAP_FLAG_PRIO_MASK 0x7fff |
15 | #define SWAP_FLAG_PRIO_SHIFT 0 | 15 | #define SWAP_FLAG_PRIO_SHIFT 0 |
16 | 16 | ||
17 | static inline int current_is_kswapd(void) | 17 | static inline int current_is_kswapd(void) |
18 | { | 18 | { |
19 | return current->flags & PF_KSWAPD; | 19 | return current->flags & PF_KSWAPD; |
20 | } | 20 | } |
21 | 21 | ||
22 | /* | 22 | /* |
23 | * MAX_SWAPFILES defines the maximum number of swaptypes: things which can | 23 | * MAX_SWAPFILES defines the maximum number of swaptypes: things which can |
24 | * be swapped to. The swap type and the offset into that swap type are | 24 | * be swapped to. The swap type and the offset into that swap type are |
25 | * encoded into pte's and into pgoff_t's in the swapcache. Using five bits | 25 | * encoded into pte's and into pgoff_t's in the swapcache. Using five bits |
26 | * for the type means that the maximum number of swapcache pages is 27 bits | 26 | * for the type means that the maximum number of swapcache pages is 27 bits |
27 | * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs | 27 | * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs |
28 | * the type/offset into the pte as 5/27 as well. | 28 | * the type/offset into the pte as 5/27 as well. |
29 | */ | 29 | */ |
30 | #define MAX_SWAPFILES_SHIFT 5 | 30 | #define MAX_SWAPFILES_SHIFT 5 |
31 | #ifndef CONFIG_MIGRATION | 31 | #ifndef CONFIG_MIGRATION |
32 | #define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) | 32 | #define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) |
33 | #else | 33 | #else |
34 | /* Use last two entries for page migration swap entries */ | 34 | /* Use last two entries for page migration swap entries */ |
35 | #define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2) | 35 | #define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2) |
36 | #define SWP_MIGRATION_READ MAX_SWAPFILES | 36 | #define SWP_MIGRATION_READ MAX_SWAPFILES |
37 | #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1) | 37 | #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1) |
38 | #endif | 38 | #endif |
39 | 39 | ||
40 | /* | 40 | /* |
41 | * Magic header for a swap area. The first part of the union is | 41 | * Magic header for a swap area. The first part of the union is |
42 | * what the swap magic looks like for the old (limited to 128MB) | 42 | * what the swap magic looks like for the old (limited to 128MB) |
43 | * swap area format, the second part of the union adds - in the | 43 | * swap area format, the second part of the union adds - in the |
44 | * old reserved area - some extra information. Note that the first | 44 | * old reserved area - some extra information. Note that the first |
45 | * kilobyte is reserved for boot loader or disk label stuff... | 45 | * kilobyte is reserved for boot loader or disk label stuff... |
46 | * | 46 | * |
47 | * Having the magic at the end of the PAGE_SIZE makes detecting swap | 47 | * Having the magic at the end of the PAGE_SIZE makes detecting swap |
48 | * areas somewhat tricky on machines that support multiple page sizes. | 48 | * areas somewhat tricky on machines that support multiple page sizes. |
49 | * For 2.5 we'll probably want to move the magic to just beyond the | 49 | * For 2.5 we'll probably want to move the magic to just beyond the |
50 | * bootbits... | 50 | * bootbits... |
51 | */ | 51 | */ |
52 | union swap_header { | 52 | union swap_header { |
53 | struct { | 53 | struct { |
54 | char reserved[PAGE_SIZE - 10]; | 54 | char reserved[PAGE_SIZE - 10]; |
55 | char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ | 55 | char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ |
56 | } magic; | 56 | } magic; |
57 | struct { | 57 | struct { |
58 | char bootbits[1024]; /* Space for disklabel etc. */ | 58 | char bootbits[1024]; /* Space for disklabel etc. */ |
59 | __u32 version; | 59 | __u32 version; |
60 | __u32 last_page; | 60 | __u32 last_page; |
61 | __u32 nr_badpages; | 61 | __u32 nr_badpages; |
62 | unsigned char sws_uuid[16]; | 62 | unsigned char sws_uuid[16]; |
63 | unsigned char sws_volume[16]; | 63 | unsigned char sws_volume[16]; |
64 | __u32 padding[117]; | 64 | __u32 padding[117]; |
65 | __u32 badpages[1]; | 65 | __u32 badpages[1]; |
66 | } info; | 66 | } info; |
67 | }; | 67 | }; |
68 | 68 | ||
69 | /* A swap entry has to fit into a "unsigned long", as | 69 | /* A swap entry has to fit into a "unsigned long", as |
70 | * the entry is hidden in the "index" field of the | 70 | * the entry is hidden in the "index" field of the |
71 | * swapper address space. | 71 | * swapper address space. |
72 | */ | 72 | */ |
73 | typedef struct { | 73 | typedef struct { |
74 | unsigned long val; | 74 | unsigned long val; |
75 | } swp_entry_t; | 75 | } swp_entry_t; |
76 | 76 | ||
77 | /* | 77 | /* |
78 | * current->reclaim_state points to one of these when a task is running | 78 | * current->reclaim_state points to one of these when a task is running |
79 | * memory reclaim | 79 | * memory reclaim |
80 | */ | 80 | */ |
81 | struct reclaim_state { | 81 | struct reclaim_state { |
82 | unsigned long reclaimed_slab; | 82 | unsigned long reclaimed_slab; |
83 | }; | 83 | }; |
84 | 84 | ||
85 | #ifdef __KERNEL__ | 85 | #ifdef __KERNEL__ |
86 | 86 | ||
87 | struct address_space; | 87 | struct address_space; |
88 | struct sysinfo; | 88 | struct sysinfo; |
89 | struct writeback_control; | 89 | struct writeback_control; |
90 | struct zone; | 90 | struct zone; |
91 | 91 | ||
92 | /* | 92 | /* |
93 | * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of | 93 | * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of |
94 | * disk blocks. A list of swap extents maps the entire swapfile. (Where the | 94 | * disk blocks. A list of swap extents maps the entire swapfile. (Where the |
95 | * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart | 95 | * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart |
96 | * from setup, they're handled identically. | 96 | * from setup, they're handled identically. |
97 | * | 97 | * |
98 | * We always assume that blocks are of size PAGE_SIZE. | 98 | * We always assume that blocks are of size PAGE_SIZE. |
99 | */ | 99 | */ |
100 | struct swap_extent { | 100 | struct swap_extent { |
101 | struct list_head list; | 101 | struct list_head list; |
102 | pgoff_t start_page; | 102 | pgoff_t start_page; |
103 | pgoff_t nr_pages; | 103 | pgoff_t nr_pages; |
104 | sector_t start_block; | 104 | sector_t start_block; |
105 | }; | 105 | }; |
106 | 106 | ||
107 | /* | 107 | /* |
108 | * Max bad pages in the new format.. | 108 | * Max bad pages in the new format.. |
109 | */ | 109 | */ |
110 | #define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x) | 110 | #define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x) |
111 | #define MAX_SWAP_BADPAGES \ | 111 | #define MAX_SWAP_BADPAGES \ |
112 | ((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int)) | 112 | ((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int)) |
113 | 113 | ||
114 | enum { | 114 | enum { |
115 | SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ | 115 | SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ |
116 | SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ | 116 | SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ |
117 | SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), | 117 | SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), |
118 | /* add others here before... */ | 118 | /* add others here before... */ |
119 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ | 119 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ |
120 | }; | 120 | }; |
121 | 121 | ||
122 | #define SWAP_CLUSTER_MAX 32 | 122 | #define SWAP_CLUSTER_MAX 32 |
123 | 123 | ||
124 | #define SWAP_MAP_MAX 0x7fff | 124 | #define SWAP_MAP_MAX 0x7fff |
125 | #define SWAP_MAP_BAD 0x8000 | 125 | #define SWAP_MAP_BAD 0x8000 |
126 | 126 | ||
127 | /* | 127 | /* |
128 | * The in-memory structure used to track swap areas. | 128 | * The in-memory structure used to track swap areas. |
129 | */ | 129 | */ |
130 | struct swap_info_struct { | 130 | struct swap_info_struct { |
131 | unsigned int flags; | 131 | unsigned int flags; |
132 | int prio; /* swap priority */ | 132 | int prio; /* swap priority */ |
133 | struct file *swap_file; | 133 | struct file *swap_file; |
134 | struct block_device *bdev; | 134 | struct block_device *bdev; |
135 | struct list_head extent_list; | 135 | struct list_head extent_list; |
136 | struct swap_extent *curr_swap_extent; | 136 | struct swap_extent *curr_swap_extent; |
137 | unsigned old_block_size; | 137 | unsigned old_block_size; |
138 | unsigned short * swap_map; | 138 | unsigned short * swap_map; |
139 | unsigned int lowest_bit; | 139 | unsigned int lowest_bit; |
140 | unsigned int highest_bit; | 140 | unsigned int highest_bit; |
141 | unsigned int cluster_next; | 141 | unsigned int cluster_next; |
142 | unsigned int cluster_nr; | 142 | unsigned int cluster_nr; |
143 | unsigned int pages; | 143 | unsigned int pages; |
144 | unsigned int max; | 144 | unsigned int max; |
145 | unsigned int inuse_pages; | 145 | unsigned int inuse_pages; |
146 | int next; /* next entry on swap list */ | 146 | int next; /* next entry on swap list */ |
147 | }; | 147 | }; |
148 | 148 | ||
149 | struct swap_list_t { | 149 | struct swap_list_t { |
150 | int head; /* head of priority-ordered swapfile list */ | 150 | int head; /* head of priority-ordered swapfile list */ |
151 | int next; /* swapfile to be used next */ | 151 | int next; /* swapfile to be used next */ |
152 | }; | 152 | }; |
153 | 153 | ||
154 | /* Swap 50% full? Release swapcache more aggressively.. */ | 154 | /* Swap 50% full? Release swapcache more aggressively.. */ |
155 | #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) | 155 | #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) |
156 | 156 | ||
157 | /* linux/mm/oom_kill.c */ | 157 | /* linux/mm/oom_kill.c */ |
158 | extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); | 158 | extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); |
159 | 159 | ||
160 | /* linux/mm/memory.c */ | 160 | /* linux/mm/memory.c */ |
161 | extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); | 161 | extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); |
162 | 162 | ||
163 | /* linux/mm/page_alloc.c */ | 163 | /* linux/mm/page_alloc.c */ |
164 | extern unsigned long totalram_pages; | 164 | extern unsigned long totalram_pages; |
165 | extern unsigned long totalhigh_pages; | 165 | extern unsigned long totalhigh_pages; |
166 | extern unsigned long totalreserve_pages; | 166 | extern unsigned long totalreserve_pages; |
167 | extern long nr_swap_pages; | 167 | extern long nr_swap_pages; |
168 | extern unsigned int nr_free_pages(void); | 168 | extern unsigned int nr_free_pages(void); |
169 | extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat); | 169 | extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat); |
170 | extern unsigned int nr_free_buffer_pages(void); | 170 | extern unsigned int nr_free_buffer_pages(void); |
171 | extern unsigned int nr_free_pagecache_pages(void); | 171 | extern unsigned int nr_free_pagecache_pages(void); |
172 | 172 | ||
173 | /* linux/mm/swap.c */ | 173 | /* linux/mm/swap.c */ |
174 | extern void FASTCALL(lru_cache_add(struct page *)); | 174 | extern void FASTCALL(lru_cache_add(struct page *)); |
175 | extern void FASTCALL(lru_cache_add_active(struct page *)); | 175 | extern void FASTCALL(lru_cache_add_active(struct page *)); |
176 | extern void FASTCALL(activate_page(struct page *)); | 176 | extern void FASTCALL(activate_page(struct page *)); |
177 | extern void FASTCALL(mark_page_accessed(struct page *)); | 177 | extern void FASTCALL(mark_page_accessed(struct page *)); |
178 | extern void lru_add_drain(void); | 178 | extern void lru_add_drain(void); |
179 | extern int lru_add_drain_all(void); | 179 | extern int lru_add_drain_all(void); |
180 | extern int rotate_reclaimable_page(struct page *page); | 180 | extern int rotate_reclaimable_page(struct page *page); |
181 | extern void swap_setup(void); | 181 | extern void swap_setup(void); |
182 | 182 | ||
183 | /* linux/mm/vmscan.c */ | 183 | /* linux/mm/vmscan.c */ |
184 | extern unsigned long try_to_free_pages(struct zone **, gfp_t); | 184 | extern unsigned long try_to_free_pages(struct zone **, gfp_t); |
185 | extern unsigned long shrink_all_memory(unsigned long nr_pages); | 185 | extern unsigned long shrink_all_memory(unsigned long nr_pages); |
186 | extern int vm_swappiness; | 186 | extern int vm_swappiness; |
187 | extern int remove_mapping(struct address_space *mapping, struct page *page); | 187 | extern int remove_mapping(struct address_space *mapping, struct page *page); |
188 | 188 | ||
189 | /* possible outcome of pageout() */ | ||
190 | typedef enum { | ||
191 | /* failed to write page out, page is locked */ | ||
192 | PAGE_KEEP, | ||
193 | /* move page to the active list, page is locked */ | ||
194 | PAGE_ACTIVATE, | ||
195 | /* page has been sent to the disk successfully, page is unlocked */ | ||
196 | PAGE_SUCCESS, | ||
197 | /* page is clean and locked */ | ||
198 | PAGE_CLEAN, | ||
199 | } pageout_t; | ||
200 | |||
201 | extern pageout_t pageout(struct page *page, struct address_space *mapping); | ||
202 | |||
203 | #ifdef CONFIG_NUMA | 189 | #ifdef CONFIG_NUMA |
204 | extern int zone_reclaim_mode; | 190 | extern int zone_reclaim_mode; |
205 | extern int zone_reclaim_interval; | 191 | extern int zone_reclaim_interval; |
206 | extern int zone_reclaim(struct zone *, gfp_t, unsigned int); | 192 | extern int zone_reclaim(struct zone *, gfp_t, unsigned int); |
207 | #else | 193 | #else |
208 | #define zone_reclaim_mode 0 | 194 | #define zone_reclaim_mode 0 |
209 | static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) | 195 | static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) |
210 | { | 196 | { |
211 | return 0; | 197 | return 0; |
212 | } | 198 | } |
213 | #endif | 199 | #endif |
214 | 200 | ||
215 | #ifdef CONFIG_MMU | 201 | #ifdef CONFIG_MMU |
216 | /* linux/mm/shmem.c */ | 202 | /* linux/mm/shmem.c */ |
217 | extern int shmem_unuse(swp_entry_t entry, struct page *page); | 203 | extern int shmem_unuse(swp_entry_t entry, struct page *page); |
218 | #endif /* CONFIG_MMU */ | 204 | #endif /* CONFIG_MMU */ |
219 | 205 | ||
220 | extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); | 206 | extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); |
221 | 207 | ||
222 | #ifdef CONFIG_SWAP | 208 | #ifdef CONFIG_SWAP |
223 | /* linux/mm/page_io.c */ | 209 | /* linux/mm/page_io.c */ |
224 | extern int swap_readpage(struct file *, struct page *); | 210 | extern int swap_readpage(struct file *, struct page *); |
225 | extern int swap_writepage(struct page *page, struct writeback_control *wbc); | 211 | extern int swap_writepage(struct page *page, struct writeback_control *wbc); |
226 | extern int rw_swap_page_sync(int, swp_entry_t, struct page *); | 212 | extern int rw_swap_page_sync(int, swp_entry_t, struct page *); |
227 | 213 | ||
228 | /* linux/mm/swap_state.c */ | 214 | /* linux/mm/swap_state.c */ |
229 | extern struct address_space swapper_space; | 215 | extern struct address_space swapper_space; |
230 | #define total_swapcache_pages swapper_space.nrpages | 216 | #define total_swapcache_pages swapper_space.nrpages |
231 | extern void show_swap_cache_info(void); | 217 | extern void show_swap_cache_info(void); |
232 | extern int add_to_swap(struct page *, gfp_t); | 218 | extern int add_to_swap(struct page *, gfp_t); |
233 | extern void __delete_from_swap_cache(struct page *); | 219 | extern void __delete_from_swap_cache(struct page *); |
234 | extern void delete_from_swap_cache(struct page *); | 220 | extern void delete_from_swap_cache(struct page *); |
235 | extern int move_to_swap_cache(struct page *, swp_entry_t); | 221 | extern int move_to_swap_cache(struct page *, swp_entry_t); |
236 | extern int move_from_swap_cache(struct page *, unsigned long, | 222 | extern int move_from_swap_cache(struct page *, unsigned long, |
237 | struct address_space *); | 223 | struct address_space *); |
238 | extern void free_page_and_swap_cache(struct page *); | 224 | extern void free_page_and_swap_cache(struct page *); |
239 | extern void free_pages_and_swap_cache(struct page **, int); | 225 | extern void free_pages_and_swap_cache(struct page **, int); |
240 | extern struct page * lookup_swap_cache(swp_entry_t); | 226 | extern struct page * lookup_swap_cache(swp_entry_t); |
241 | extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, | 227 | extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, |
242 | unsigned long addr); | 228 | unsigned long addr); |
243 | /* linux/mm/swapfile.c */ | 229 | /* linux/mm/swapfile.c */ |
244 | extern long total_swap_pages; | 230 | extern long total_swap_pages; |
245 | extern unsigned int nr_swapfiles; | 231 | extern unsigned int nr_swapfiles; |
246 | extern void si_swapinfo(struct sysinfo *); | 232 | extern void si_swapinfo(struct sysinfo *); |
247 | extern swp_entry_t get_swap_page(void); | 233 | extern swp_entry_t get_swap_page(void); |
248 | extern swp_entry_t get_swap_page_of_type(int); | 234 | extern swp_entry_t get_swap_page_of_type(int); |
249 | extern int swap_duplicate(swp_entry_t); | 235 | extern int swap_duplicate(swp_entry_t); |
250 | extern int valid_swaphandles(swp_entry_t, unsigned long *); | 236 | extern int valid_swaphandles(swp_entry_t, unsigned long *); |
251 | extern void swap_free(swp_entry_t); | 237 | extern void swap_free(swp_entry_t); |
252 | extern void free_swap_and_cache(swp_entry_t); | 238 | extern void free_swap_and_cache(swp_entry_t); |
253 | extern int swap_type_of(dev_t); | 239 | extern int swap_type_of(dev_t); |
254 | extern unsigned int count_swap_pages(int, int); | 240 | extern unsigned int count_swap_pages(int, int); |
255 | extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); | 241 | extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); |
256 | extern struct swap_info_struct *get_swap_info_struct(unsigned); | 242 | extern struct swap_info_struct *get_swap_info_struct(unsigned); |
257 | extern int can_share_swap_page(struct page *); | 243 | extern int can_share_swap_page(struct page *); |
258 | extern int remove_exclusive_swap_page(struct page *); | 244 | extern int remove_exclusive_swap_page(struct page *); |
259 | struct backing_dev_info; | 245 | struct backing_dev_info; |
260 | 246 | ||
261 | extern spinlock_t swap_lock; | 247 | extern spinlock_t swap_lock; |
262 | extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page); | ||
263 | 248 | ||
264 | /* linux/mm/thrash.c */ | 249 | /* linux/mm/thrash.c */ |
265 | extern struct mm_struct * swap_token_mm; | 250 | extern struct mm_struct * swap_token_mm; |
266 | extern unsigned long swap_token_default_timeout; | 251 | extern unsigned long swap_token_default_timeout; |
267 | extern void grab_swap_token(void); | 252 | extern void grab_swap_token(void); |
268 | extern void __put_swap_token(struct mm_struct *); | 253 | extern void __put_swap_token(struct mm_struct *); |
269 | 254 | ||
270 | static inline int has_swap_token(struct mm_struct *mm) | 255 | static inline int has_swap_token(struct mm_struct *mm) |
271 | { | 256 | { |
272 | return (mm == swap_token_mm); | 257 | return (mm == swap_token_mm); |
273 | } | 258 | } |
274 | 259 | ||
275 | static inline void put_swap_token(struct mm_struct *mm) | 260 | static inline void put_swap_token(struct mm_struct *mm) |
276 | { | 261 | { |
277 | if (has_swap_token(mm)) | 262 | if (has_swap_token(mm)) |
278 | __put_swap_token(mm); | 263 | __put_swap_token(mm); |
279 | } | 264 | } |
280 | 265 | ||
281 | static inline void disable_swap_token(void) | 266 | static inline void disable_swap_token(void) |
282 | { | 267 | { |
283 | put_swap_token(swap_token_mm); | 268 | put_swap_token(swap_token_mm); |
284 | } | 269 | } |
285 | 270 | ||
286 | #else /* CONFIG_SWAP */ | 271 | #else /* CONFIG_SWAP */ |
287 | 272 | ||
288 | #define total_swap_pages 0 | 273 | #define total_swap_pages 0 |
289 | #define total_swapcache_pages 0UL | 274 | #define total_swapcache_pages 0UL |
290 | 275 | ||
291 | #define si_swapinfo(val) \ | 276 | #define si_swapinfo(val) \ |
292 | do { (val)->freeswap = (val)->totalswap = 0; } while (0) | 277 | do { (val)->freeswap = (val)->totalswap = 0; } while (0) |
293 | /* only sparc can not include linux/pagemap.h in this file | 278 | /* only sparc can not include linux/pagemap.h in this file |
294 | * so leave page_cache_release and release_pages undeclared... */ | 279 | * so leave page_cache_release and release_pages undeclared... */ |
295 | #define free_page_and_swap_cache(page) \ | 280 | #define free_page_and_swap_cache(page) \ |
296 | page_cache_release(page) | 281 | page_cache_release(page) |
297 | #define free_pages_and_swap_cache(pages, nr) \ | 282 | #define free_pages_and_swap_cache(pages, nr) \ |
298 | release_pages((pages), (nr), 0); | 283 | release_pages((pages), (nr), 0); |
299 | 284 | ||
300 | #define show_swap_cache_info() /*NOTHING*/ | 285 | #define show_swap_cache_info() /*NOTHING*/ |
301 | #define free_swap_and_cache(swp) /*NOTHING*/ | 286 | #define free_swap_and_cache(swp) /*NOTHING*/ |
302 | #define swap_duplicate(swp) /*NOTHING*/ | 287 | #define swap_duplicate(swp) /*NOTHING*/ |
303 | #define swap_free(swp) /*NOTHING*/ | 288 | #define swap_free(swp) /*NOTHING*/ |
304 | #define read_swap_cache_async(swp,vma,addr) NULL | 289 | #define read_swap_cache_async(swp,vma,addr) NULL |
305 | #define lookup_swap_cache(swp) NULL | 290 | #define lookup_swap_cache(swp) NULL |
306 | #define valid_swaphandles(swp, off) 0 | 291 | #define valid_swaphandles(swp, off) 0 |
307 | #define can_share_swap_page(p) (page_mapcount(p) == 1) | 292 | #define can_share_swap_page(p) (page_mapcount(p) == 1) |
308 | #define move_to_swap_cache(p, swp) 1 | 293 | #define move_to_swap_cache(p, swp) 1 |
309 | #define move_from_swap_cache(p, i, m) 1 | 294 | #define move_from_swap_cache(p, i, m) 1 |
310 | #define __delete_from_swap_cache(p) /*NOTHING*/ | 295 | #define __delete_from_swap_cache(p) /*NOTHING*/ |
311 | #define delete_from_swap_cache(p) /*NOTHING*/ | 296 | #define delete_from_swap_cache(p) /*NOTHING*/ |
312 | #define swap_token_default_timeout 0 | 297 | #define swap_token_default_timeout 0 |
313 | 298 | ||
314 | static inline int remove_exclusive_swap_page(struct page *p) | 299 | static inline int remove_exclusive_swap_page(struct page *p) |
315 | { | 300 | { |
316 | return 0; | 301 | return 0; |
317 | } | 302 | } |
318 | 303 | ||
319 | static inline swp_entry_t get_swap_page(void) | 304 | static inline swp_entry_t get_swap_page(void) |
320 | { | 305 | { |
321 | swp_entry_t entry; | 306 | swp_entry_t entry; |
322 | entry.val = 0; | 307 | entry.val = 0; |
323 | return entry; | 308 | return entry; |
324 | } | 309 | } |
325 | 310 | ||
326 | /* linux/mm/thrash.c */ | 311 | /* linux/mm/thrash.c */ |
327 | #define put_swap_token(x) do { } while(0) | 312 | #define put_swap_token(x) do { } while(0) |
328 | #define grab_swap_token() do { } while(0) | 313 | #define grab_swap_token() do { } while(0) |
329 | #define has_swap_token(x) 0 | 314 | #define has_swap_token(x) 0 |
330 | #define disable_swap_token() do { } while(0) | 315 | #define disable_swap_token() do { } while(0) |
331 | 316 | ||
332 | #endif /* CONFIG_SWAP */ | 317 | #endif /* CONFIG_SWAP */ |
333 | #endif /* __KERNEL__*/ | 318 | #endif /* __KERNEL__*/ |
334 | #endif /* _LINUX_SWAP_H */ | 319 | #endif /* _LINUX_SWAP_H */ |
335 | 320 |
mm/migrate.c
1 | /* | 1 | /* |
2 | * Memory Migration functionality - linux/mm/migration.c | 2 | * Memory Migration functionality - linux/mm/migration.c |
3 | * | 3 | * |
4 | * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter | 4 | * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter |
5 | * | 5 | * |
6 | * Page migration was first developed in the context of the memory hotplug | 6 | * Page migration was first developed in the context of the memory hotplug |
7 | * project. The main authors of the migration code are: | 7 | * project. The main authors of the migration code are: |
8 | * | 8 | * |
9 | * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> | 9 | * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> |
10 | * Hirokazu Takahashi <taka@valinux.co.jp> | 10 | * Hirokazu Takahashi <taka@valinux.co.jp> |
11 | * Dave Hansen <haveblue@us.ibm.com> | 11 | * Dave Hansen <haveblue@us.ibm.com> |
12 | * Christoph Lameter <clameter@sgi.com> | 12 | * Christoph Lameter <clameter@sgi.com> |
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <linux/migrate.h> | 15 | #include <linux/migrate.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/swapops.h> | 18 | #include <linux/swapops.h> |
19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
20 | #include <linux/buffer_head.h> | 20 | #include <linux/buffer_head.h> |
21 | #include <linux/mm_inline.h> | 21 | #include <linux/mm_inline.h> |
22 | #include <linux/pagevec.h> | 22 | #include <linux/pagevec.h> |
23 | #include <linux/rmap.h> | 23 | #include <linux/rmap.h> |
24 | #include <linux/topology.h> | 24 | #include <linux/topology.h> |
25 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
26 | #include <linux/cpuset.h> | 26 | #include <linux/cpuset.h> |
27 | #include <linux/writeback.h> | ||
27 | 28 | ||
28 | #include "internal.h" | 29 | #include "internal.h" |
29 | 30 | ||
30 | /* The maximum number of pages to take off the LRU for migration */ | 31 | /* The maximum number of pages to take off the LRU for migration */ |
31 | #define MIGRATE_CHUNK_SIZE 256 | 32 | #define MIGRATE_CHUNK_SIZE 256 |
32 | 33 | ||
33 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 34 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
34 | 35 | ||
35 | /* | 36 | /* |
36 | * Isolate one page from the LRU lists. If successful put it onto | 37 | * Isolate one page from the LRU lists. If successful put it onto |
37 | * the indicated list with elevated page count. | 38 | * the indicated list with elevated page count. |
38 | * | 39 | * |
39 | * Result: | 40 | * Result: |
40 | * -EBUSY: page not on LRU list | 41 | * -EBUSY: page not on LRU list |
41 | * 0: page removed from LRU list and added to the specified list. | 42 | * 0: page removed from LRU list and added to the specified list. |
42 | */ | 43 | */ |
43 | int isolate_lru_page(struct page *page, struct list_head *pagelist) | 44 | int isolate_lru_page(struct page *page, struct list_head *pagelist) |
44 | { | 45 | { |
45 | int ret = -EBUSY; | 46 | int ret = -EBUSY; |
46 | 47 | ||
47 | if (PageLRU(page)) { | 48 | if (PageLRU(page)) { |
48 | struct zone *zone = page_zone(page); | 49 | struct zone *zone = page_zone(page); |
49 | 50 | ||
50 | spin_lock_irq(&zone->lru_lock); | 51 | spin_lock_irq(&zone->lru_lock); |
51 | if (PageLRU(page)) { | 52 | if (PageLRU(page)) { |
52 | ret = 0; | 53 | ret = 0; |
53 | get_page(page); | 54 | get_page(page); |
54 | ClearPageLRU(page); | 55 | ClearPageLRU(page); |
55 | if (PageActive(page)) | 56 | if (PageActive(page)) |
56 | del_page_from_active_list(zone, page); | 57 | del_page_from_active_list(zone, page); |
57 | else | 58 | else |
58 | del_page_from_inactive_list(zone, page); | 59 | del_page_from_inactive_list(zone, page); |
59 | list_add_tail(&page->lru, pagelist); | 60 | list_add_tail(&page->lru, pagelist); |
60 | } | 61 | } |
61 | spin_unlock_irq(&zone->lru_lock); | 62 | spin_unlock_irq(&zone->lru_lock); |
62 | } | 63 | } |
63 | return ret; | 64 | return ret; |
64 | } | 65 | } |
65 | 66 | ||
66 | /* | 67 | /* |
67 | * migrate_prep() needs to be called after we have compiled the list of pages | 68 | * migrate_prep() needs to be called after we have compiled the list of pages |
68 | * to be migrated using isolate_lru_page() but before we begin a series of calls | 69 | * to be migrated using isolate_lru_page() but before we begin a series of calls |
69 | * to migrate_pages(). | 70 | * to migrate_pages(). |
70 | */ | 71 | */ |
71 | int migrate_prep(void) | 72 | int migrate_prep(void) |
72 | { | 73 | { |
73 | /* | 74 | /* |
74 | * Clear the LRU lists so pages can be isolated. | 75 | * Clear the LRU lists so pages can be isolated. |
75 | * Note that pages may be moved off the LRU after we have | 76 | * Note that pages may be moved off the LRU after we have |
76 | * drained them. Those pages will fail to migrate like other | 77 | * drained them. Those pages will fail to migrate like other |
77 | * pages that may be busy. | 78 | * pages that may be busy. |
78 | */ | 79 | */ |
79 | lru_add_drain_all(); | 80 | lru_add_drain_all(); |
80 | 81 | ||
81 | return 0; | 82 | return 0; |
82 | } | 83 | } |
83 | 84 | ||
84 | static inline void move_to_lru(struct page *page) | 85 | static inline void move_to_lru(struct page *page) |
85 | { | 86 | { |
86 | list_del(&page->lru); | 87 | list_del(&page->lru); |
87 | if (PageActive(page)) { | 88 | if (PageActive(page)) { |
88 | /* | 89 | /* |
89 | * lru_cache_add_active checks that | 90 | * lru_cache_add_active checks that |
90 | * the PG_active bit is off. | 91 | * the PG_active bit is off. |
91 | */ | 92 | */ |
92 | ClearPageActive(page); | 93 | ClearPageActive(page); |
93 | lru_cache_add_active(page); | 94 | lru_cache_add_active(page); |
94 | } else { | 95 | } else { |
95 | lru_cache_add(page); | 96 | lru_cache_add(page); |
96 | } | 97 | } |
97 | put_page(page); | 98 | put_page(page); |
98 | } | 99 | } |
99 | 100 | ||
100 | /* | 101 | /* |
101 | * Add isolated pages on the list back to the LRU. | 102 | * Add isolated pages on the list back to the LRU. |
102 | * | 103 | * |
103 | * returns the number of pages put back. | 104 | * returns the number of pages put back. |
104 | */ | 105 | */ |
105 | int putback_lru_pages(struct list_head *l) | 106 | int putback_lru_pages(struct list_head *l) |
106 | { | 107 | { |
107 | struct page *page; | 108 | struct page *page; |
108 | struct page *page2; | 109 | struct page *page2; |
109 | int count = 0; | 110 | int count = 0; |
110 | 111 | ||
111 | list_for_each_entry_safe(page, page2, l, lru) { | 112 | list_for_each_entry_safe(page, page2, l, lru) { |
112 | move_to_lru(page); | 113 | move_to_lru(page); |
113 | count++; | 114 | count++; |
114 | } | 115 | } |
115 | return count; | 116 | return count; |
116 | } | 117 | } |
117 | 118 | ||
118 | static inline int is_swap_pte(pte_t pte) | 119 | static inline int is_swap_pte(pte_t pte) |
119 | { | 120 | { |
120 | return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); | 121 | return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); |
121 | } | 122 | } |
122 | 123 | ||
123 | /* | 124 | /* |
124 | * Restore a potential migration pte to a working pte entry | 125 | * Restore a potential migration pte to a working pte entry |
125 | */ | 126 | */ |
126 | static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, | 127 | static void remove_migration_pte(struct vm_area_struct *vma, |
127 | struct page *old, struct page *new) | 128 | struct page *old, struct page *new) |
128 | { | 129 | { |
129 | struct mm_struct *mm = vma->vm_mm; | 130 | struct mm_struct *mm = vma->vm_mm; |
130 | swp_entry_t entry; | 131 | swp_entry_t entry; |
131 | pgd_t *pgd; | 132 | pgd_t *pgd; |
132 | pud_t *pud; | 133 | pud_t *pud; |
133 | pmd_t *pmd; | 134 | pmd_t *pmd; |
134 | pte_t *ptep, pte; | 135 | pte_t *ptep, pte; |
135 | spinlock_t *ptl; | 136 | spinlock_t *ptl; |
137 | unsigned long addr = page_address_in_vma(new, vma); | ||
136 | 138 | ||
139 | if (addr == -EFAULT) | ||
140 | return; | ||
141 | |||
137 | pgd = pgd_offset(mm, addr); | 142 | pgd = pgd_offset(mm, addr); |
138 | if (!pgd_present(*pgd)) | 143 | if (!pgd_present(*pgd)) |
139 | return; | 144 | return; |
140 | 145 | ||
141 | pud = pud_offset(pgd, addr); | 146 | pud = pud_offset(pgd, addr); |
142 | if (!pud_present(*pud)) | 147 | if (!pud_present(*pud)) |
143 | return; | 148 | return; |
144 | 149 | ||
145 | pmd = pmd_offset(pud, addr); | 150 | pmd = pmd_offset(pud, addr); |
146 | if (!pmd_present(*pmd)) | 151 | if (!pmd_present(*pmd)) |
147 | return; | 152 | return; |
148 | 153 | ||
149 | ptep = pte_offset_map(pmd, addr); | 154 | ptep = pte_offset_map(pmd, addr); |
150 | 155 | ||
151 | if (!is_swap_pte(*ptep)) { | 156 | if (!is_swap_pte(*ptep)) { |
152 | pte_unmap(ptep); | 157 | pte_unmap(ptep); |
153 | return; | 158 | return; |
154 | } | 159 | } |
155 | 160 | ||
156 | ptl = pte_lockptr(mm, pmd); | 161 | ptl = pte_lockptr(mm, pmd); |
157 | spin_lock(ptl); | 162 | spin_lock(ptl); |
158 | pte = *ptep; | 163 | pte = *ptep; |
159 | if (!is_swap_pte(pte)) | 164 | if (!is_swap_pte(pte)) |
160 | goto out; | 165 | goto out; |
161 | 166 | ||
162 | entry = pte_to_swp_entry(pte); | 167 | entry = pte_to_swp_entry(pte); |
163 | 168 | ||
164 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) | 169 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) |
165 | goto out; | 170 | goto out; |
166 | 171 | ||
167 | get_page(new); | 172 | get_page(new); |
168 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 173 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
169 | if (is_write_migration_entry(entry)) | 174 | if (is_write_migration_entry(entry)) |
170 | pte = pte_mkwrite(pte); | 175 | pte = pte_mkwrite(pte); |
171 | set_pte_at(mm, addr, ptep, pte); | 176 | set_pte_at(mm, addr, ptep, pte); |
172 | page_add_anon_rmap(new, vma, addr); | 177 | |
178 | if (PageAnon(new)) | ||
179 | page_add_anon_rmap(new, vma, addr); | ||
180 | else | ||
181 | page_add_file_rmap(new); | ||
182 | |||
183 | /* No need to invalidate - it was non-present before */ | ||
184 | update_mmu_cache(vma, addr, pte); | ||
185 | lazy_mmu_prot_update(pte); | ||
186 | |||
173 | out: | 187 | out: |
174 | pte_unmap_unlock(ptep, ptl); | 188 | pte_unmap_unlock(ptep, ptl); |
175 | } | 189 | } |
176 | 190 | ||
177 | /* | 191 | /* |
178 | * Get rid of all migration entries and replace them by | 192 | * Note that remove_file_migration_ptes will only work on regular mappings, |
179 | * references to the indicated page. | 193 | * Nonlinear mappings do not use migration entries. |
180 | * | 194 | */ |
195 | static void remove_file_migration_ptes(struct page *old, struct page *new) | ||
196 | { | ||
197 | struct vm_area_struct *vma; | ||
198 | struct address_space *mapping = page_mapping(new); | ||
199 | struct prio_tree_iter iter; | ||
200 | pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
201 | |||
202 | if (!mapping) | ||
203 | return; | ||
204 | |||
205 | spin_lock(&mapping->i_mmap_lock); | ||
206 | |||
207 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) | ||
208 | remove_migration_pte(vma, old, new); | ||
209 | |||
210 | spin_unlock(&mapping->i_mmap_lock); | ||
211 | } | ||
212 | |||
213 | /* | ||
181 | * Must hold mmap_sem lock on at least one of the vmas containing | 214 | * Must hold mmap_sem lock on at least one of the vmas containing |
182 | * the page so that the anon_vma cannot vanish. | 215 | * the page so that the anon_vma cannot vanish. |
183 | */ | 216 | */ |
184 | static void remove_migration_ptes(struct page *old, struct page *new) | 217 | static void remove_anon_migration_ptes(struct page *old, struct page *new) |
185 | { | 218 | { |
186 | struct anon_vma *anon_vma; | 219 | struct anon_vma *anon_vma; |
187 | struct vm_area_struct *vma; | 220 | struct vm_area_struct *vma; |
188 | unsigned long mapping; | 221 | unsigned long mapping; |
189 | 222 | ||
190 | mapping = (unsigned long)new->mapping; | 223 | mapping = (unsigned long)new->mapping; |
191 | 224 | ||
192 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) | 225 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) |
193 | return; | 226 | return; |
194 | 227 | ||
195 | /* | 228 | /* |
196 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. | 229 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. |
197 | */ | 230 | */ |
198 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); | 231 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); |
199 | spin_lock(&anon_vma->lock); | 232 | spin_lock(&anon_vma->lock); |
200 | 233 | ||
201 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | 234 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) |
202 | remove_migration_pte(vma, page_address_in_vma(new, vma), | 235 | remove_migration_pte(vma, old, new); |
203 | old, new); | ||
204 | 236 | ||
205 | spin_unlock(&anon_vma->lock); | 237 | spin_unlock(&anon_vma->lock); |
206 | } | 238 | } |
207 | 239 | ||
208 | /* | 240 | /* |
241 | * Get rid of all migration entries and replace them by | ||
242 | * references to the indicated page. | ||
243 | */ | ||
244 | static void remove_migration_ptes(struct page *old, struct page *new) | ||
245 | { | ||
246 | if (PageAnon(new)) | ||
247 | remove_anon_migration_ptes(old, new); | ||
248 | else | ||
249 | remove_file_migration_ptes(old, new); | ||
250 | } | ||
251 | |||
252 | /* | ||
209 | * Something used the pte of a page under migration. We need to | 253 | * Something used the pte of a page under migration. We need to |
210 | * get to the page and wait until migration is finished. | 254 | * get to the page and wait until migration is finished. |
211 | * When we return from this function the fault will be retried. | 255 | * When we return from this function the fault will be retried. |
212 | * | 256 | * |
213 | * This function is called from do_swap_page(). | 257 | * This function is called from do_swap_page(). |
214 | */ | 258 | */ |
215 | void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | 259 | void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, |
216 | unsigned long address) | 260 | unsigned long address) |
217 | { | 261 | { |
218 | pte_t *ptep, pte; | 262 | pte_t *ptep, pte; |
219 | spinlock_t *ptl; | 263 | spinlock_t *ptl; |
220 | swp_entry_t entry; | 264 | swp_entry_t entry; |
221 | struct page *page; | 265 | struct page *page; |
222 | 266 | ||
223 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | 267 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); |
224 | pte = *ptep; | 268 | pte = *ptep; |
225 | if (!is_swap_pte(pte)) | 269 | if (!is_swap_pte(pte)) |
226 | goto out; | 270 | goto out; |
227 | 271 | ||
228 | entry = pte_to_swp_entry(pte); | 272 | entry = pte_to_swp_entry(pte); |
229 | if (!is_migration_entry(entry)) | 273 | if (!is_migration_entry(entry)) |
230 | goto out; | 274 | goto out; |
231 | 275 | ||
232 | page = migration_entry_to_page(entry); | 276 | page = migration_entry_to_page(entry); |
233 | 277 | ||
234 | get_page(page); | 278 | get_page(page); |
235 | pte_unmap_unlock(ptep, ptl); | 279 | pte_unmap_unlock(ptep, ptl); |
236 | wait_on_page_locked(page); | 280 | wait_on_page_locked(page); |
237 | put_page(page); | 281 | put_page(page); |
238 | return; | 282 | return; |
239 | out: | 283 | out: |
240 | pte_unmap_unlock(ptep, ptl); | 284 | pte_unmap_unlock(ptep, ptl); |
241 | } | 285 | } |
242 | 286 | ||
243 | /* | 287 | /* |
244 | * Replace the page in the mapping. | 288 | * Replace the page in the mapping. |
245 | * | 289 | * |
246 | * The number of remaining references must be: | 290 | * The number of remaining references must be: |
247 | * 1 for anonymous pages without a mapping | 291 | * 1 for anonymous pages without a mapping |
248 | * 2 for pages with a mapping | 292 | * 2 for pages with a mapping |
249 | * 3 for pages with a mapping and PagePrivate set. | 293 | * 3 for pages with a mapping and PagePrivate set. |
250 | */ | 294 | */ |
251 | static int migrate_page_move_mapping(struct address_space *mapping, | 295 | static int migrate_page_move_mapping(struct address_space *mapping, |
252 | struct page *newpage, struct page *page) | 296 | struct page *newpage, struct page *page) |
253 | { | 297 | { |
254 | struct page **radix_pointer; | 298 | struct page **radix_pointer; |
255 | 299 | ||
256 | if (!mapping) { | 300 | if (!mapping) { |
257 | /* Anonymous page */ | 301 | /* Anonymous page */ |
258 | if (page_count(page) != 1) | 302 | if (page_count(page) != 1) |
259 | return -EAGAIN; | 303 | return -EAGAIN; |
260 | return 0; | 304 | return 0; |
261 | } | 305 | } |
262 | 306 | ||
263 | write_lock_irq(&mapping->tree_lock); | 307 | write_lock_irq(&mapping->tree_lock); |
264 | 308 | ||
265 | radix_pointer = (struct page **)radix_tree_lookup_slot( | 309 | radix_pointer = (struct page **)radix_tree_lookup_slot( |
266 | &mapping->page_tree, | 310 | &mapping->page_tree, |
267 | page_index(page)); | 311 | page_index(page)); |
268 | 312 | ||
269 | if (page_count(page) != 2 + !!PagePrivate(page) || | 313 | if (page_count(page) != 2 + !!PagePrivate(page) || |
270 | *radix_pointer != page) { | 314 | *radix_pointer != page) { |
271 | write_unlock_irq(&mapping->tree_lock); | 315 | write_unlock_irq(&mapping->tree_lock); |
272 | return -EAGAIN; | 316 | return -EAGAIN; |
273 | } | 317 | } |
274 | 318 | ||
275 | /* | 319 | /* |
276 | * Now we know that no one else is looking at the page. | 320 | * Now we know that no one else is looking at the page. |
277 | */ | 321 | */ |
278 | get_page(newpage); | 322 | get_page(newpage); |
279 | #ifdef CONFIG_SWAP | 323 | #ifdef CONFIG_SWAP |
280 | if (PageSwapCache(page)) { | 324 | if (PageSwapCache(page)) { |
281 | SetPageSwapCache(newpage); | 325 | SetPageSwapCache(newpage); |
282 | set_page_private(newpage, page_private(page)); | 326 | set_page_private(newpage, page_private(page)); |
283 | } | 327 | } |
284 | #endif | 328 | #endif |
285 | 329 | ||
286 | *radix_pointer = newpage; | 330 | *radix_pointer = newpage; |
287 | __put_page(page); | 331 | __put_page(page); |
288 | write_unlock_irq(&mapping->tree_lock); | 332 | write_unlock_irq(&mapping->tree_lock); |
289 | 333 | ||
290 | return 0; | 334 | return 0; |
291 | } | 335 | } |
292 | 336 | ||
293 | /* | 337 | /* |
294 | * Copy the page to its new location | 338 | * Copy the page to its new location |
295 | */ | 339 | */ |
296 | static void migrate_page_copy(struct page *newpage, struct page *page) | 340 | static void migrate_page_copy(struct page *newpage, struct page *page) |
297 | { | 341 | { |
298 | copy_highpage(newpage, page); | 342 | copy_highpage(newpage, page); |
299 | 343 | ||
300 | if (PageError(page)) | 344 | if (PageError(page)) |
301 | SetPageError(newpage); | 345 | SetPageError(newpage); |
302 | if (PageReferenced(page)) | 346 | if (PageReferenced(page)) |
303 | SetPageReferenced(newpage); | 347 | SetPageReferenced(newpage); |
304 | if (PageUptodate(page)) | 348 | if (PageUptodate(page)) |
305 | SetPageUptodate(newpage); | 349 | SetPageUptodate(newpage); |
306 | if (PageActive(page)) | 350 | if (PageActive(page)) |
307 | SetPageActive(newpage); | 351 | SetPageActive(newpage); |
308 | if (PageChecked(page)) | 352 | if (PageChecked(page)) |
309 | SetPageChecked(newpage); | 353 | SetPageChecked(newpage); |
310 | if (PageMappedToDisk(page)) | 354 | if (PageMappedToDisk(page)) |
311 | SetPageMappedToDisk(newpage); | 355 | SetPageMappedToDisk(newpage); |
312 | 356 | ||
313 | if (PageDirty(page)) { | 357 | if (PageDirty(page)) { |
314 | clear_page_dirty_for_io(page); | 358 | clear_page_dirty_for_io(page); |
315 | set_page_dirty(newpage); | 359 | set_page_dirty(newpage); |
316 | } | 360 | } |
317 | 361 | ||
318 | #ifdef CONFIG_SWAP | 362 | #ifdef CONFIG_SWAP |
319 | ClearPageSwapCache(page); | 363 | ClearPageSwapCache(page); |
320 | #endif | 364 | #endif |
321 | ClearPageActive(page); | 365 | ClearPageActive(page); |
322 | ClearPagePrivate(page); | 366 | ClearPagePrivate(page); |
323 | set_page_private(page, 0); | 367 | set_page_private(page, 0); |
324 | page->mapping = NULL; | 368 | page->mapping = NULL; |
325 | 369 | ||
326 | /* | 370 | /* |
327 | * If any waiters have accumulated on the new page then | 371 | * If any waiters have accumulated on the new page then |
328 | * wake them up. | 372 | * wake them up. |
329 | */ | 373 | */ |
330 | if (PageWriteback(newpage)) | 374 | if (PageWriteback(newpage)) |
331 | end_page_writeback(newpage); | 375 | end_page_writeback(newpage); |
332 | } | 376 | } |
333 | 377 | ||
334 | /************************************************************ | 378 | /************************************************************ |
335 | * Migration functions | 379 | * Migration functions |
336 | ***********************************************************/ | 380 | ***********************************************************/ |
337 | 381 | ||
338 | /* Always fail migration. Used for mappings that are not movable */ | 382 | /* Always fail migration. Used for mappings that are not movable */ |
339 | int fail_migrate_page(struct address_space *mapping, | 383 | int fail_migrate_page(struct address_space *mapping, |
340 | struct page *newpage, struct page *page) | 384 | struct page *newpage, struct page *page) |
341 | { | 385 | { |
342 | return -EIO; | 386 | return -EIO; |
343 | } | 387 | } |
344 | EXPORT_SYMBOL(fail_migrate_page); | 388 | EXPORT_SYMBOL(fail_migrate_page); |
345 | 389 | ||
346 | /* | 390 | /* |
347 | * Common logic to directly migrate a single page suitable for | 391 | * Common logic to directly migrate a single page suitable for |
348 | * pages that do not use PagePrivate. | 392 | * pages that do not use PagePrivate. |
349 | * | 393 | * |
350 | * Pages are locked upon entry and exit. | 394 | * Pages are locked upon entry and exit. |
351 | */ | 395 | */ |
352 | int migrate_page(struct address_space *mapping, | 396 | int migrate_page(struct address_space *mapping, |
353 | struct page *newpage, struct page *page) | 397 | struct page *newpage, struct page *page) |
354 | { | 398 | { |
355 | int rc; | 399 | int rc; |
356 | 400 | ||
357 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | 401 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ |
358 | 402 | ||
359 | rc = migrate_page_move_mapping(mapping, newpage, page); | 403 | rc = migrate_page_move_mapping(mapping, newpage, page); |
360 | 404 | ||
361 | if (rc) | 405 | if (rc) |
362 | return rc; | 406 | return rc; |
363 | 407 | ||
364 | migrate_page_copy(newpage, page); | 408 | migrate_page_copy(newpage, page); |
365 | return 0; | 409 | return 0; |
366 | } | 410 | } |
367 | EXPORT_SYMBOL(migrate_page); | 411 | EXPORT_SYMBOL(migrate_page); |
368 | 412 | ||
369 | /* | 413 | /* |
370 | * Migration function for pages with buffers. This function can only be used | 414 | * Migration function for pages with buffers. This function can only be used |
371 | * if the underlying filesystem guarantees that no other references to "page" | 415 | * if the underlying filesystem guarantees that no other references to "page" |
372 | * exist. | 416 | * exist. |
373 | */ | 417 | */ |
374 | int buffer_migrate_page(struct address_space *mapping, | 418 | int buffer_migrate_page(struct address_space *mapping, |
375 | struct page *newpage, struct page *page) | 419 | struct page *newpage, struct page *page) |
376 | { | 420 | { |
377 | struct buffer_head *bh, *head; | 421 | struct buffer_head *bh, *head; |
378 | int rc; | 422 | int rc; |
379 | 423 | ||
380 | if (!page_has_buffers(page)) | 424 | if (!page_has_buffers(page)) |
381 | return migrate_page(mapping, newpage, page); | 425 | return migrate_page(mapping, newpage, page); |
382 | 426 | ||
383 | head = page_buffers(page); | 427 | head = page_buffers(page); |
384 | 428 | ||
385 | rc = migrate_page_move_mapping(mapping, newpage, page); | 429 | rc = migrate_page_move_mapping(mapping, newpage, page); |
386 | 430 | ||
387 | if (rc) | 431 | if (rc) |
388 | return rc; | 432 | return rc; |
389 | 433 | ||
390 | bh = head; | 434 | bh = head; |
391 | do { | 435 | do { |
392 | get_bh(bh); | 436 | get_bh(bh); |
393 | lock_buffer(bh); | 437 | lock_buffer(bh); |
394 | bh = bh->b_this_page; | 438 | bh = bh->b_this_page; |
395 | 439 | ||
396 | } while (bh != head); | 440 | } while (bh != head); |
397 | 441 | ||
398 | ClearPagePrivate(page); | 442 | ClearPagePrivate(page); |
399 | set_page_private(newpage, page_private(page)); | 443 | set_page_private(newpage, page_private(page)); |
400 | set_page_private(page, 0); | 444 | set_page_private(page, 0); |
401 | put_page(page); | 445 | put_page(page); |
402 | get_page(newpage); | 446 | get_page(newpage); |
403 | 447 | ||
404 | bh = head; | 448 | bh = head; |
405 | do { | 449 | do { |
406 | set_bh_page(bh, newpage, bh_offset(bh)); | 450 | set_bh_page(bh, newpage, bh_offset(bh)); |
407 | bh = bh->b_this_page; | 451 | bh = bh->b_this_page; |
408 | 452 | ||
409 | } while (bh != head); | 453 | } while (bh != head); |
410 | 454 | ||
411 | SetPagePrivate(newpage); | 455 | SetPagePrivate(newpage); |
412 | 456 | ||
413 | migrate_page_copy(newpage, page); | 457 | migrate_page_copy(newpage, page); |
414 | 458 | ||
415 | bh = head; | 459 | bh = head; |
416 | do { | 460 | do { |
417 | unlock_buffer(bh); | 461 | unlock_buffer(bh); |
418 | put_bh(bh); | 462 | put_bh(bh); |
419 | bh = bh->b_this_page; | 463 | bh = bh->b_this_page; |
420 | 464 | ||
421 | } while (bh != head); | 465 | } while (bh != head); |
422 | 466 | ||
423 | return 0; | 467 | return 0; |
424 | } | 468 | } |
425 | EXPORT_SYMBOL(buffer_migrate_page); | 469 | EXPORT_SYMBOL(buffer_migrate_page); |
426 | 470 | ||
427 | static int fallback_migrate_page(struct address_space *mapping, | 471 | /* |
428 | struct page *newpage, struct page *page) | 472 | * Writeback a page to clean the dirty state |
473 | */ | ||
474 | static int writeout(struct address_space *mapping, struct page *page) | ||
429 | { | 475 | { |
476 | struct writeback_control wbc = { | ||
477 | .sync_mode = WB_SYNC_NONE, | ||
478 | .nr_to_write = 1, | ||
479 | .range_start = 0, | ||
480 | .range_end = LLONG_MAX, | ||
481 | .nonblocking = 1, | ||
482 | .for_reclaim = 1 | ||
483 | }; | ||
484 | int rc; | ||
485 | |||
486 | if (!mapping->a_ops->writepage) | ||
487 | /* No write method for the address space */ | ||
488 | return -EINVAL; | ||
489 | |||
490 | if (!clear_page_dirty_for_io(page)) | ||
491 | /* Someone else already triggered a write */ | ||
492 | return -EAGAIN; | ||
493 | |||
430 | /* | 494 | /* |
431 | * Default handling if a filesystem does not provide | 495 | * A dirty page may imply that the underlying filesystem has |
432 | * a migration function. We can only migrate clean | 496 | * the page on some queue. So the page must be clean for |
433 | * pages so try to write out any dirty pages first. | 497 | * migration. Writeout may mean we loose the lock and the |
498 | * page state is no longer what we checked for earlier. | ||
499 | * At this point we know that the migration attempt cannot | ||
500 | * be successful. | ||
434 | */ | 501 | */ |
435 | if (PageDirty(page)) { | 502 | remove_migration_ptes(page, page); |
436 | switch (pageout(page, mapping)) { | ||
437 | case PAGE_KEEP: | ||
438 | case PAGE_ACTIVATE: | ||
439 | return -EAGAIN; | ||
440 | 503 | ||
441 | case PAGE_SUCCESS: | 504 | rc = mapping->a_ops->writepage(page, &wbc); |
442 | /* Relock since we lost the lock */ | 505 | if (rc < 0) |
443 | lock_page(page); | 506 | /* I/O Error writing */ |
444 | /* Must retry since page state may have changed */ | 507 | return -EIO; |
445 | return -EAGAIN; | ||
446 | 508 | ||
447 | case PAGE_CLEAN: | 509 | if (rc != AOP_WRITEPAGE_ACTIVATE) |
448 | ; /* try to migrate the page below */ | 510 | /* unlocked. Relock */ |
449 | } | 511 | lock_page(page); |
450 | } | 512 | |
513 | return -EAGAIN; | ||
514 | } | ||
515 | |||
516 | /* | ||
517 | * Default handling if a filesystem does not provide a migration function. | ||
518 | */ | ||
519 | static int fallback_migrate_page(struct address_space *mapping, | ||
520 | struct page *newpage, struct page *page) | ||
521 | { | ||
522 | if (PageDirty(page)) | ||
523 | return writeout(mapping, page); | ||
451 | 524 | ||
452 | /* | 525 | /* |
453 | * Buffers may be managed in a filesystem specific way. | 526 | * Buffers may be managed in a filesystem specific way. |
454 | * We must have no buffers or drop them. | 527 | * We must have no buffers or drop them. |
455 | */ | 528 | */ |
456 | if (page_has_buffers(page) && | 529 | if (page_has_buffers(page) && |
457 | !try_to_release_page(page, GFP_KERNEL)) | 530 | !try_to_release_page(page, GFP_KERNEL)) |
458 | return -EAGAIN; | 531 | return -EAGAIN; |
459 | 532 | ||
460 | return migrate_page(mapping, newpage, page); | 533 | return migrate_page(mapping, newpage, page); |
461 | } | 534 | } |
462 | 535 | ||
463 | /* | 536 | /* |
464 | * migrate_pages | 537 | * migrate_pages |
465 | * | 538 | * |
466 | * Two lists are passed to this function. The first list | 539 | * Two lists are passed to this function. The first list |
467 | * contains the pages isolated from the LRU to be migrated. | 540 | * contains the pages isolated from the LRU to be migrated. |
468 | * The second list contains new pages that the pages isolated | 541 | * The second list contains new pages that the pages isolated |
469 | * can be moved to. | 542 | * can be moved to. |
470 | * | 543 | * |
471 | * The function returns after 10 attempts or if no pages | 544 | * The function returns after 10 attempts or if no pages |
472 | * are movable anymore because to has become empty | 545 | * are movable anymore because to has become empty |
473 | * or no retryable pages exist anymore. | 546 | * or no retryable pages exist anymore. |
474 | * | 547 | * |
475 | * Return: Number of pages not migrated when "to" ran empty. | 548 | * Return: Number of pages not migrated when "to" ran empty. |
476 | */ | 549 | */ |
477 | int migrate_pages(struct list_head *from, struct list_head *to, | 550 | int migrate_pages(struct list_head *from, struct list_head *to, |
478 | struct list_head *moved, struct list_head *failed) | 551 | struct list_head *moved, struct list_head *failed) |
479 | { | 552 | { |
480 | int retry; | 553 | int retry; |
481 | int nr_failed = 0; | 554 | int nr_failed = 0; |
482 | int pass = 0; | 555 | int pass = 0; |
483 | struct page *page; | 556 | struct page *page; |
484 | struct page *page2; | 557 | struct page *page2; |
485 | int swapwrite = current->flags & PF_SWAPWRITE; | 558 | int swapwrite = current->flags & PF_SWAPWRITE; |
486 | int rc; | 559 | int rc; |
487 | 560 | ||
488 | if (!swapwrite) | 561 | if (!swapwrite) |
489 | current->flags |= PF_SWAPWRITE; | 562 | current->flags |= PF_SWAPWRITE; |
490 | 563 | ||
491 | redo: | 564 | redo: |
492 | retry = 0; | 565 | retry = 0; |
493 | 566 | ||
494 | list_for_each_entry_safe(page, page2, from, lru) { | 567 | list_for_each_entry_safe(page, page2, from, lru) { |
495 | struct page *newpage = NULL; | 568 | struct page *newpage = NULL; |
496 | struct address_space *mapping; | 569 | struct address_space *mapping; |
497 | 570 | ||
498 | cond_resched(); | 571 | cond_resched(); |
499 | 572 | ||
500 | rc = 0; | 573 | rc = 0; |
501 | if (page_count(page) == 1) | 574 | if (page_count(page) == 1) |
502 | /* page was freed from under us. So we are done. */ | 575 | /* page was freed from under us. So we are done. */ |
503 | goto next; | 576 | goto next; |
504 | 577 | ||
505 | if (to && list_empty(to)) | 578 | if (to && list_empty(to)) |
506 | break; | 579 | break; |
507 | 580 | ||
508 | /* | 581 | /* |
509 | * Skip locked pages during the first two passes to give the | 582 | * Skip locked pages during the first two passes to give the |
510 | * functions holding the lock time to release the page. Later we | 583 | * functions holding the lock time to release the page. Later we |
511 | * use lock_page() to have a higher chance of acquiring the | 584 | * use lock_page() to have a higher chance of acquiring the |
512 | * lock. | 585 | * lock. |
513 | */ | 586 | */ |
514 | rc = -EAGAIN; | 587 | rc = -EAGAIN; |
515 | if (pass > 2) | 588 | if (pass > 2) |
516 | lock_page(page); | 589 | lock_page(page); |
517 | else | 590 | else |
518 | if (TestSetPageLocked(page)) | 591 | if (TestSetPageLocked(page)) |
519 | goto next; | 592 | goto next; |
520 | 593 | ||
521 | /* | 594 | /* |
522 | * Only wait on writeback if we have already done a pass where | 595 | * Only wait on writeback if we have already done a pass where |
523 | * we may have triggered writeouts for lots of pages. | 596 | * we may have triggered writeouts for lots of pages. |
524 | */ | 597 | */ |
525 | if (pass > 0) | 598 | if (pass > 0) |
526 | wait_on_page_writeback(page); | 599 | wait_on_page_writeback(page); |
527 | else | 600 | else |
528 | if (PageWriteback(page)) | 601 | if (PageWriteback(page)) |
529 | goto unlock_page; | 602 | goto unlock_page; |
530 | 603 | ||
531 | /* | 604 | /* |
532 | * Establish migration ptes or remove ptes | 605 | * Establish migration ptes or remove ptes |
533 | */ | 606 | */ |
534 | rc = -EPERM; | 607 | rc = -EPERM; |
535 | if (try_to_unmap(page, 1) == SWAP_FAIL) | 608 | if (try_to_unmap(page, 1) == SWAP_FAIL) |
536 | /* A vma has VM_LOCKED set -> permanent failure */ | 609 | /* A vma has VM_LOCKED set -> permanent failure */ |
537 | goto unlock_page; | 610 | goto unlock_page; |
538 | 611 | ||
539 | rc = -EAGAIN; | 612 | rc = -EAGAIN; |
540 | if (page_mapped(page)) | 613 | if (page_mapped(page)) |
541 | goto unlock_page; | 614 | goto unlock_page; |
542 | 615 | ||
543 | newpage = lru_to_page(to); | 616 | newpage = lru_to_page(to); |
544 | lock_page(newpage); | 617 | lock_page(newpage); |
545 | /* Prepare mapping for the new page.*/ | 618 | /* Prepare mapping for the new page.*/ |
546 | newpage->index = page->index; | 619 | newpage->index = page->index; |
547 | newpage->mapping = page->mapping; | 620 | newpage->mapping = page->mapping; |
548 | 621 | ||
549 | /* | 622 | /* |
550 | * Pages are properly locked and writeback is complete. | 623 | * Pages are properly locked and writeback is complete. |
551 | * Try to migrate the page. | 624 | * Try to migrate the page. |
552 | */ | 625 | */ |
553 | mapping = page_mapping(page); | 626 | mapping = page_mapping(page); |
554 | if (!mapping) | 627 | if (!mapping) |
555 | rc = migrate_page(mapping, newpage, page); | 628 | rc = migrate_page(mapping, newpage, page); |
556 | 629 | ||
557 | else if (mapping->a_ops->migratepage) | 630 | else if (mapping->a_ops->migratepage) |
558 | /* | 631 | /* |
559 | * Most pages have a mapping and most filesystems | 632 | * Most pages have a mapping and most filesystems |
560 | * should provide a migration function. Anonymous | 633 | * should provide a migration function. Anonymous |
561 | * pages are part of swap space which also has its | 634 | * pages are part of swap space which also has its |
562 | * own migration function. This is the most common | 635 | * own migration function. This is the most common |
563 | * path for page migration. | 636 | * path for page migration. |
564 | */ | 637 | */ |
565 | rc = mapping->a_ops->migratepage(mapping, | 638 | rc = mapping->a_ops->migratepage(mapping, |
566 | newpage, page); | 639 | newpage, page); |
567 | else | 640 | else |
568 | rc = fallback_migrate_page(mapping, newpage, page); | 641 | rc = fallback_migrate_page(mapping, newpage, page); |
569 | 642 | ||
570 | if (!rc) | 643 | if (!rc) |
571 | remove_migration_ptes(page, newpage); | 644 | remove_migration_ptes(page, newpage); |
572 | 645 | ||
573 | unlock_page(newpage); | 646 | unlock_page(newpage); |
574 | 647 | ||
575 | unlock_page: | 648 | unlock_page: |
576 | if (rc) | 649 | if (rc) |
577 | remove_migration_ptes(page, page); | 650 | remove_migration_ptes(page, page); |
578 | 651 | ||
579 | unlock_page(page); | 652 | unlock_page(page); |
580 | 653 | ||
581 | next: | 654 | next: |
582 | if (rc) { | 655 | if (rc) { |
583 | if (newpage) | 656 | if (newpage) |
584 | newpage->mapping = NULL; | 657 | newpage->mapping = NULL; |
585 | 658 | ||
586 | if (rc == -EAGAIN) | 659 | if (rc == -EAGAIN) |
587 | retry++; | 660 | retry++; |
588 | else { | 661 | else { |
589 | /* Permanent failure */ | 662 | /* Permanent failure */ |
590 | list_move(&page->lru, failed); | 663 | list_move(&page->lru, failed); |
591 | nr_failed++; | 664 | nr_failed++; |
592 | } | 665 | } |
593 | } else { | 666 | } else { |
594 | if (newpage) { | 667 | if (newpage) { |
595 | /* Successful migration. Return page to LRU */ | 668 | /* Successful migration. Return page to LRU */ |
596 | move_to_lru(newpage); | 669 | move_to_lru(newpage); |
597 | } | 670 | } |
598 | list_move(&page->lru, moved); | 671 | list_move(&page->lru, moved); |
599 | } | 672 | } |
600 | } | 673 | } |
601 | if (retry && pass++ < 10) | 674 | if (retry && pass++ < 10) |
602 | goto redo; | 675 | goto redo; |
603 | 676 | ||
604 | if (!swapwrite) | 677 | if (!swapwrite) |
605 | current->flags &= ~PF_SWAPWRITE; | 678 | current->flags &= ~PF_SWAPWRITE; |
606 | 679 | ||
607 | return nr_failed + retry; | 680 | return nr_failed + retry; |
608 | } | 681 | } |
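Illustrative sketch (not part of the patch; the two helper names are hypothetical): a minimal caller of migrate_pages() supplies all four lists and puts the surviving source pages back on the LRU afterwards, much as migrate_pages_to() does below.

        LIST_HEAD(from);        /* pages isolated from the LRU */
        LIST_HEAD(to);          /* freshly allocated target pages */
        LIST_HEAD(moved);       /* out: source pages successfully replaced */
        LIST_HEAD(failed);      /* out: permanent failures */
        int left;

        isolate_source_pages(&from);    /* hypothetical helper */
        alloc_target_pages(&to);        /* hypothetical helper */

        left = migrate_pages(&from, &to, &moved, &failed);
        /* left: pages not migrated because "to" ran empty or retries ran out */
        putback_lru_pages(&moved);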
609 | 682 | ||
610 | /* | 683 | /* |
611 | * Migrate the list 'pagelist' of pages to a certain destination. | 684 | * Migrate the list 'pagelist' of pages to a certain destination. |
612 | * | 685 | * |
613 | * Specify destination with either non-NULL vma or dest_node >= 0 | 686 | * Specify destination with either non-NULL vma or dest_node >= 0 |
614 | * Return the number of pages not migrated or error code | 687 | * Return the number of pages not migrated or error code |
615 | */ | 688 | */ |
616 | int migrate_pages_to(struct list_head *pagelist, | 689 | int migrate_pages_to(struct list_head *pagelist, |
617 | struct vm_area_struct *vma, int dest) | 690 | struct vm_area_struct *vma, int dest) |
618 | { | 691 | { |
619 | LIST_HEAD(newlist); | 692 | LIST_HEAD(newlist); |
620 | LIST_HEAD(moved); | 693 | LIST_HEAD(moved); |
621 | LIST_HEAD(failed); | 694 | LIST_HEAD(failed); |
622 | int err = 0; | 695 | int err = 0; |
623 | unsigned long offset = 0; | 696 | unsigned long offset = 0; |
624 | int nr_pages; | 697 | int nr_pages; |
625 | struct page *page; | 698 | struct page *page; |
626 | struct list_head *p; | 699 | struct list_head *p; |
627 | 700 | ||
628 | redo: | 701 | redo: |
629 | nr_pages = 0; | 702 | nr_pages = 0; |
630 | list_for_each(p, pagelist) { | 703 | list_for_each(p, pagelist) { |
631 | if (vma) { | 704 | if (vma) { |
632 | /* | 705 | /* |
633 | * The address passed to alloc_page_vma is used to | 706 | * The address passed to alloc_page_vma is used to |
634 | * generate the proper interleave behavior. We fake | 707 | * generate the proper interleave behavior. We fake |
635 | * the address here by an increasing offset in order | 708 | * the address here by an increasing offset in order |
636 | * to get the proper distribution of pages. | 709 | * to get the proper distribution of pages. |
637 | * | 710 | * |
638 | * No decision has been made as to which page | 711 | * No decision has been made as to which page |
639 | * a certain old page is moved to so we cannot | 712 | * a certain old page is moved to so we cannot |
640 | * specify the correct address. | 713 | * specify the correct address. |
641 | */ | 714 | */ |
642 | page = alloc_page_vma(GFP_HIGHUSER, vma, | 715 | page = alloc_page_vma(GFP_HIGHUSER, vma, |
643 | offset + vma->vm_start); | 716 | offset + vma->vm_start); |
644 | offset += PAGE_SIZE; | 717 | offset += PAGE_SIZE; |
645 | } | 718 | } |
646 | else | 719 | else |
647 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | 720 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); |
648 | 721 | ||
649 | if (!page) { | 722 | if (!page) { |
650 | err = -ENOMEM; | 723 | err = -ENOMEM; |
651 | goto out; | 724 | goto out; |
652 | } | 725 | } |
653 | list_add_tail(&page->lru, &newlist); | 726 | list_add_tail(&page->lru, &newlist); |
654 | nr_pages++; | 727 | nr_pages++; |
655 | if (nr_pages > MIGRATE_CHUNK_SIZE) | 728 | if (nr_pages > MIGRATE_CHUNK_SIZE) |
656 | break; | 729 | break; |
657 | } | 730 | } |
658 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | 731 | err = migrate_pages(pagelist, &newlist, &moved, &failed); |
659 | 732 | ||
660 | putback_lru_pages(&moved); /* Call release pages instead ?? */ | 733 | putback_lru_pages(&moved); /* Call release pages instead ?? */ |
661 | 734 | ||
662 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) | 735 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) |
663 | goto redo; | 736 | goto redo; |
664 | out: | 737 | out: |
665 | /* Return leftover allocated pages */ | 738 | /* Return leftover allocated pages */ |
666 | while (!list_empty(&newlist)) { | 739 | while (!list_empty(&newlist)) { |
667 | page = list_entry(newlist.next, struct page, lru); | 740 | page = list_entry(newlist.next, struct page, lru); |
668 | list_del(&page->lru); | 741 | list_del(&page->lru); |
669 | __free_page(page); | 742 | __free_page(page); |
670 | } | 743 | } |
671 | list_splice(&failed, pagelist); | 744 | list_splice(&failed, pagelist); |
672 | if (err < 0) | 745 | if (err < 0) |
673 | return err; | 746 | return err; |
674 | 747 | ||
675 | /* Calculate number of leftover pages */ | 748 | /* Calculate number of leftover pages */ |
mm/rmap.c
1 | /* | 1 | /* |
2 | * mm/rmap.c - physical to virtual reverse mappings | 2 | * mm/rmap.c - physical to virtual reverse mappings |
3 | * | 3 | * |
4 | * Copyright 2001, Rik van Riel <riel@conectiva.com.br> | 4 | * Copyright 2001, Rik van Riel <riel@conectiva.com.br> |
5 | * Released under the General Public License (GPL). | 5 | * Released under the General Public License (GPL). |
6 | * | 6 | * |
7 | * Simple, low overhead reverse mapping scheme. | 7 | * Simple, low overhead reverse mapping scheme. |
8 | * Please try to keep this thing as modular as possible. | 8 | * Please try to keep this thing as modular as possible. |
9 | * | 9 | * |
10 | * Provides methods for unmapping each kind of mapped page: | 10 | * Provides methods for unmapping each kind of mapped page: |
11 | * the anon methods track anonymous pages, and | 11 | * the anon methods track anonymous pages, and |
12 | * the file methods track pages belonging to an inode. | 12 | * the file methods track pages belonging to an inode. |
13 | * | 13 | * |
14 | * Original design by Rik van Riel <riel@conectiva.com.br> 2001 | 14 | * Original design by Rik van Riel <riel@conectiva.com.br> 2001 |
15 | * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 | 15 | * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 |
16 | * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 | 16 | * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 |
17 | * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 | 17 | * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 |
18 | */ | 18 | */ |
19 | 19 | ||
20 | /* | 20 | /* |
21 | * Lock ordering in mm: | 21 | * Lock ordering in mm: |
22 | * | 22 | * |
23 | * inode->i_mutex (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
24 | * inode->i_alloc_sem | 24 | * inode->i_alloc_sem |
25 | * | 25 | * |
26 | * When a page fault occurs in writing from user to file, down_read | 26 | * When a page fault occurs in writing from user to file, down_read |
27 | * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within | 27 | * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within |
28 | * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never | 28 | * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never |
29 | * taken together; in truncation, i_mutex is taken outermost. | 29 | * taken together; in truncation, i_mutex is taken outermost. |
30 | * | 30 | * |
31 | * mm->mmap_sem | 31 | * mm->mmap_sem |
32 | * page->flags PG_locked (lock_page) | 32 | * page->flags PG_locked (lock_page) |
33 | * mapping->i_mmap_lock | 33 | * mapping->i_mmap_lock |
34 | * anon_vma->lock | 34 | * anon_vma->lock |
35 | * mm->page_table_lock or pte_lock | 35 | * mm->page_table_lock or pte_lock |
36 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 36 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
37 | * swap_lock (in swap_duplicate, swap_info_get) | 37 | * swap_lock (in swap_duplicate, swap_info_get) |
38 | * mmlist_lock (in mmput, drain_mmlist and others) | 38 | * mmlist_lock (in mmput, drain_mmlist and others) |
39 | * mapping->private_lock (in __set_page_dirty_buffers) | 39 | * mapping->private_lock (in __set_page_dirty_buffers) |
40 | * inode_lock (in set_page_dirty's __mark_inode_dirty) | 40 | * inode_lock (in set_page_dirty's __mark_inode_dirty) |
41 | * sb_lock (within inode_lock in fs/fs-writeback.c) | 41 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
42 | * mapping->tree_lock (widely used, in set_page_dirty, | 42 | * mapping->tree_lock (widely used, in set_page_dirty, |
43 | * in arch-dependent flush_dcache_mmap_lock, | 43 | * in arch-dependent flush_dcache_mmap_lock, |
44 | * within inode_lock in __sync_single_inode) | 44 | * within inode_lock in __sync_single_inode) |
45 | */ | 45 | */ |
46 | 46 | ||
47 | #include <linux/mm.h> | 47 | #include <linux/mm.h> |
48 | #include <linux/pagemap.h> | 48 | #include <linux/pagemap.h> |
49 | #include <linux/swap.h> | 49 | #include <linux/swap.h> |
50 | #include <linux/swapops.h> | 50 | #include <linux/swapops.h> |
51 | #include <linux/slab.h> | 51 | #include <linux/slab.h> |
52 | #include <linux/init.h> | 52 | #include <linux/init.h> |
53 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
54 | #include <linux/rcupdate.h> | 54 | #include <linux/rcupdate.h> |
55 | #include <linux/module.h> | 55 | #include <linux/module.h> |
56 | 56 | ||
57 | #include <asm/tlbflush.h> | 57 | #include <asm/tlbflush.h> |
58 | 58 | ||
59 | struct kmem_cache *anon_vma_cachep; | 59 | struct kmem_cache *anon_vma_cachep; |
60 | 60 | ||
61 | static inline void validate_anon_vma(struct vm_area_struct *find_vma) | 61 | static inline void validate_anon_vma(struct vm_area_struct *find_vma) |
62 | { | 62 | { |
63 | #ifdef CONFIG_DEBUG_VM | 63 | #ifdef CONFIG_DEBUG_VM |
64 | struct anon_vma *anon_vma = find_vma->anon_vma; | 64 | struct anon_vma *anon_vma = find_vma->anon_vma; |
65 | struct vm_area_struct *vma; | 65 | struct vm_area_struct *vma; |
66 | unsigned int mapcount = 0; | 66 | unsigned int mapcount = 0; |
67 | int found = 0; | 67 | int found = 0; |
68 | 68 | ||
69 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 69 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
70 | mapcount++; | 70 | mapcount++; |
71 | BUG_ON(mapcount > 100000); | 71 | BUG_ON(mapcount > 100000); |
72 | if (vma == find_vma) | 72 | if (vma == find_vma) |
73 | found = 1; | 73 | found = 1; |
74 | } | 74 | } |
75 | BUG_ON(!found); | 75 | BUG_ON(!found); |
76 | #endif | 76 | #endif |
77 | } | 77 | } |
78 | 78 | ||
79 | /* This must be called under the mmap_sem. */ | 79 | /* This must be called under the mmap_sem. */ |
80 | int anon_vma_prepare(struct vm_area_struct *vma) | 80 | int anon_vma_prepare(struct vm_area_struct *vma) |
81 | { | 81 | { |
82 | struct anon_vma *anon_vma = vma->anon_vma; | 82 | struct anon_vma *anon_vma = vma->anon_vma; |
83 | 83 | ||
84 | might_sleep(); | 84 | might_sleep(); |
85 | if (unlikely(!anon_vma)) { | 85 | if (unlikely(!anon_vma)) { |
86 | struct mm_struct *mm = vma->vm_mm; | 86 | struct mm_struct *mm = vma->vm_mm; |
87 | struct anon_vma *allocated, *locked; | 87 | struct anon_vma *allocated, *locked; |
88 | 88 | ||
89 | anon_vma = find_mergeable_anon_vma(vma); | 89 | anon_vma = find_mergeable_anon_vma(vma); |
90 | if (anon_vma) { | 90 | if (anon_vma) { |
91 | allocated = NULL; | 91 | allocated = NULL; |
92 | locked = anon_vma; | 92 | locked = anon_vma; |
93 | spin_lock(&locked->lock); | 93 | spin_lock(&locked->lock); |
94 | } else { | 94 | } else { |
95 | anon_vma = anon_vma_alloc(); | 95 | anon_vma = anon_vma_alloc(); |
96 | if (unlikely(!anon_vma)) | 96 | if (unlikely(!anon_vma)) |
97 | return -ENOMEM; | 97 | return -ENOMEM; |
98 | allocated = anon_vma; | 98 | allocated = anon_vma; |
99 | locked = NULL; | 99 | locked = NULL; |
100 | } | 100 | } |
101 | 101 | ||
102 | /* page_table_lock to protect against threads */ | 102 | /* page_table_lock to protect against threads */ |
103 | spin_lock(&mm->page_table_lock); | 103 | spin_lock(&mm->page_table_lock); |
104 | if (likely(!vma->anon_vma)) { | 104 | if (likely(!vma->anon_vma)) { |
105 | vma->anon_vma = anon_vma; | 105 | vma->anon_vma = anon_vma; |
106 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 106 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
107 | allocated = NULL; | 107 | allocated = NULL; |
108 | } | 108 | } |
109 | spin_unlock(&mm->page_table_lock); | 109 | spin_unlock(&mm->page_table_lock); |
110 | 110 | ||
111 | if (locked) | 111 | if (locked) |
112 | spin_unlock(&locked->lock); | 112 | spin_unlock(&locked->lock); |
113 | if (unlikely(allocated)) | 113 | if (unlikely(allocated)) |
114 | anon_vma_free(allocated); | 114 | anon_vma_free(allocated); |
115 | } | 115 | } |
116 | return 0; | 116 | return 0; |
117 | } | 117 | } |
118 | 118 | ||
119 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) | 119 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) |
120 | { | 120 | { |
121 | BUG_ON(vma->anon_vma != next->anon_vma); | 121 | BUG_ON(vma->anon_vma != next->anon_vma); |
122 | list_del(&next->anon_vma_node); | 122 | list_del(&next->anon_vma_node); |
123 | } | 123 | } |
124 | 124 | ||
125 | void __anon_vma_link(struct vm_area_struct *vma) | 125 | void __anon_vma_link(struct vm_area_struct *vma) |
126 | { | 126 | { |
127 | struct anon_vma *anon_vma = vma->anon_vma; | 127 | struct anon_vma *anon_vma = vma->anon_vma; |
128 | 128 | ||
129 | if (anon_vma) { | 129 | if (anon_vma) { |
130 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 130 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
131 | validate_anon_vma(vma); | 131 | validate_anon_vma(vma); |
132 | } | 132 | } |
133 | } | 133 | } |
134 | 134 | ||
135 | void anon_vma_link(struct vm_area_struct *vma) | 135 | void anon_vma_link(struct vm_area_struct *vma) |
136 | { | 136 | { |
137 | struct anon_vma *anon_vma = vma->anon_vma; | 137 | struct anon_vma *anon_vma = vma->anon_vma; |
138 | 138 | ||
139 | if (anon_vma) { | 139 | if (anon_vma) { |
140 | spin_lock(&anon_vma->lock); | 140 | spin_lock(&anon_vma->lock); |
141 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 141 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
142 | validate_anon_vma(vma); | 142 | validate_anon_vma(vma); |
143 | spin_unlock(&anon_vma->lock); | 143 | spin_unlock(&anon_vma->lock); |
144 | } | 144 | } |
145 | } | 145 | } |
146 | 146 | ||
147 | void anon_vma_unlink(struct vm_area_struct *vma) | 147 | void anon_vma_unlink(struct vm_area_struct *vma) |
148 | { | 148 | { |
149 | struct anon_vma *anon_vma = vma->anon_vma; | 149 | struct anon_vma *anon_vma = vma->anon_vma; |
150 | int empty; | 150 | int empty; |
151 | 151 | ||
152 | if (!anon_vma) | 152 | if (!anon_vma) |
153 | return; | 153 | return; |
154 | 154 | ||
155 | spin_lock(&anon_vma->lock); | 155 | spin_lock(&anon_vma->lock); |
156 | validate_anon_vma(vma); | 156 | validate_anon_vma(vma); |
157 | list_del(&vma->anon_vma_node); | 157 | list_del(&vma->anon_vma_node); |
158 | 158 | ||
159 | /* We must garbage collect the anon_vma if it's empty */ | 159 | /* We must garbage collect the anon_vma if it's empty */ |
160 | empty = list_empty(&anon_vma->head); | 160 | empty = list_empty(&anon_vma->head); |
161 | spin_unlock(&anon_vma->lock); | 161 | spin_unlock(&anon_vma->lock); |
162 | 162 | ||
163 | if (empty) | 163 | if (empty) |
164 | anon_vma_free(anon_vma); | 164 | anon_vma_free(anon_vma); |
165 | } | 165 | } |
166 | 166 | ||
167 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, | 167 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, |
168 | unsigned long flags) | 168 | unsigned long flags) |
169 | { | 169 | { |
170 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | 170 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == |
171 | SLAB_CTOR_CONSTRUCTOR) { | 171 | SLAB_CTOR_CONSTRUCTOR) { |
172 | struct anon_vma *anon_vma = data; | 172 | struct anon_vma *anon_vma = data; |
173 | 173 | ||
174 | spin_lock_init(&anon_vma->lock); | 174 | spin_lock_init(&anon_vma->lock); |
175 | INIT_LIST_HEAD(&anon_vma->head); | 175 | INIT_LIST_HEAD(&anon_vma->head); |
176 | } | 176 | } |
177 | } | 177 | } |
178 | 178 | ||
179 | void __init anon_vma_init(void) | 179 | void __init anon_vma_init(void) |
180 | { | 180 | { |
181 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 181 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
182 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); | 182 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); |
183 | } | 183 | } |
184 | 184 | ||
185 | /* | 185 | /* |
186 | * Getting a lock on a stable anon_vma from a page off the LRU is | 186 | * Getting a lock on a stable anon_vma from a page off the LRU is |
187 | * tricky: page_lock_anon_vma relies on RCU to guard against the races. | 187 | * tricky: page_lock_anon_vma relies on RCU to guard against the races. |
188 | */ | 188 | */ |
189 | static struct anon_vma *page_lock_anon_vma(struct page *page) | 189 | static struct anon_vma *page_lock_anon_vma(struct page *page) |
190 | { | 190 | { |
191 | struct anon_vma *anon_vma = NULL; | 191 | struct anon_vma *anon_vma = NULL; |
192 | unsigned long anon_mapping; | 192 | unsigned long anon_mapping; |
193 | 193 | ||
194 | rcu_read_lock(); | 194 | rcu_read_lock(); |
195 | anon_mapping = (unsigned long) page->mapping; | 195 | anon_mapping = (unsigned long) page->mapping; |
196 | if (!(anon_mapping & PAGE_MAPPING_ANON)) | 196 | if (!(anon_mapping & PAGE_MAPPING_ANON)) |
197 | goto out; | 197 | goto out; |
198 | if (!page_mapped(page)) | 198 | if (!page_mapped(page)) |
199 | goto out; | 199 | goto out; |
200 | 200 | ||
201 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 201 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
202 | spin_lock(&anon_vma->lock); | 202 | spin_lock(&anon_vma->lock); |
203 | out: | 203 | out: |
204 | rcu_read_unlock(); | 204 | rcu_read_unlock(); |
205 | return anon_vma; | 205 | return anon_vma; |
206 | } | 206 | } |
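Illustrative usage sketch: the helper returns the anon_vma with its spinlock already held, or NULL. The caller walks the vma list and drops the lock itself, exactly as page_referenced_anon() and try_to_unmap_anon() do in this file.

        struct anon_vma *anon_vma = page_lock_anon_vma(page);
        if (anon_vma) {
                /* walk anon_vma->head while holding anon_vma->lock */
                spin_unlock(&anon_vma->lock);
        }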
207 | 207 | ||
208 | /* | 208 | /* |
209 | * At what user virtual address is page expected in vma? | 209 | * At what user virtual address is page expected in vma? |
210 | */ | 210 | */ |
211 | static inline unsigned long | 211 | static inline unsigned long |
212 | vma_address(struct page *page, struct vm_area_struct *vma) | 212 | vma_address(struct page *page, struct vm_area_struct *vma) |
213 | { | 213 | { |
214 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 214 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
215 | unsigned long address; | 215 | unsigned long address; |
216 | 216 | ||
217 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 217 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
218 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { | 218 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { |
219 | /* page should be within any vma from prio_tree_next */ | 219 | /* page should be within any vma from prio_tree_next */ |
220 | BUG_ON(!PageAnon(page)); | 220 | BUG_ON(!PageAnon(page)); |
221 | return -EFAULT; | 221 | return -EFAULT; |
222 | } | 222 | } |
223 | return address; | 223 | return address; |
224 | } | 224 | } |
225 | 225 | ||
226 | /* | 226 | /* |
227 | * At what user virtual address is page expected in vma? checking that the | 227 | * At what user virtual address is page expected in vma? checking that the |
228 | * page matches the vma: currently only used on anon pages, by unuse_vma; | 228 | * page matches the vma: currently only used on anon pages, by unuse_vma; |
229 | */ | 229 | */ |
230 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 230 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
231 | { | 231 | { |
232 | if (PageAnon(page)) { | 232 | if (PageAnon(page)) { |
233 | if ((void *)vma->anon_vma != | 233 | if ((void *)vma->anon_vma != |
234 | (void *)page->mapping - PAGE_MAPPING_ANON) | 234 | (void *)page->mapping - PAGE_MAPPING_ANON) |
235 | return -EFAULT; | 235 | return -EFAULT; |
236 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | 236 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { |
237 | if (!vma->vm_file || | 237 | if (!vma->vm_file || |
238 | vma->vm_file->f_mapping != page->mapping) | 238 | vma->vm_file->f_mapping != page->mapping) |
239 | return -EFAULT; | 239 | return -EFAULT; |
240 | } else | 240 | } else |
241 | return -EFAULT; | 241 | return -EFAULT; |
242 | return vma_address(page, vma); | 242 | return vma_address(page, vma); |
243 | } | 243 | } |
244 | 244 | ||
245 | /* | 245 | /* |
246 | * Check that @page is mapped at @address into @mm. | 246 | * Check that @page is mapped at @address into @mm. |
247 | * | 247 | * |
248 | * On success returns with pte mapped and locked. | 248 | * On success returns with pte mapped and locked. |
249 | */ | 249 | */ |
250 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, | 250 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, |
251 | unsigned long address, spinlock_t **ptlp) | 251 | unsigned long address, spinlock_t **ptlp) |
252 | { | 252 | { |
253 | pgd_t *pgd; | 253 | pgd_t *pgd; |
254 | pud_t *pud; | 254 | pud_t *pud; |
255 | pmd_t *pmd; | 255 | pmd_t *pmd; |
256 | pte_t *pte; | 256 | pte_t *pte; |
257 | spinlock_t *ptl; | 257 | spinlock_t *ptl; |
258 | 258 | ||
259 | pgd = pgd_offset(mm, address); | 259 | pgd = pgd_offset(mm, address); |
260 | if (!pgd_present(*pgd)) | 260 | if (!pgd_present(*pgd)) |
261 | return NULL; | 261 | return NULL; |
262 | 262 | ||
263 | pud = pud_offset(pgd, address); | 263 | pud = pud_offset(pgd, address); |
264 | if (!pud_present(*pud)) | 264 | if (!pud_present(*pud)) |
265 | return NULL; | 265 | return NULL; |
266 | 266 | ||
267 | pmd = pmd_offset(pud, address); | 267 | pmd = pmd_offset(pud, address); |
268 | if (!pmd_present(*pmd)) | 268 | if (!pmd_present(*pmd)) |
269 | return NULL; | 269 | return NULL; |
270 | 270 | ||
271 | pte = pte_offset_map(pmd, address); | 271 | pte = pte_offset_map(pmd, address); |
272 | /* Make a quick check before getting the lock */ | 272 | /* Make a quick check before getting the lock */ |
273 | if (!pte_present(*pte)) { | 273 | if (!pte_present(*pte)) { |
274 | pte_unmap(pte); | 274 | pte_unmap(pte); |
275 | return NULL; | 275 | return NULL; |
276 | } | 276 | } |
277 | 277 | ||
278 | ptl = pte_lockptr(mm, pmd); | 278 | ptl = pte_lockptr(mm, pmd); |
279 | spin_lock(ptl); | 279 | spin_lock(ptl); |
280 | if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { | 280 | if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { |
281 | *ptlp = ptl; | 281 | *ptlp = ptl; |
282 | return pte; | 282 | return pte; |
283 | } | 283 | } |
284 | pte_unmap_unlock(pte, ptl); | 284 | pte_unmap_unlock(pte, ptl); |
285 | return NULL; | 285 | return NULL; |
286 | } | 286 | } |
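Illustrative usage sketch: on success the pte is returned mapped and with its page-table lock held, so the caller releases both with pte_unmap_unlock(), as every caller in this file does.

        spinlock_t *ptl;
        pte_t *pte;

        pte = page_check_address(page, mm, address, &ptl);
        if (pte) {
                /* examine or modify *pte while ptl is held */
                pte_unmap_unlock(pte, ptl);
        }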
287 | 287 | ||
288 | /* | 288 | /* |
289 | * Subfunctions of page_referenced: page_referenced_one called | 289 | * Subfunctions of page_referenced: page_referenced_one called |
290 | * repeatedly from either page_referenced_anon or page_referenced_file. | 290 | * repeatedly from either page_referenced_anon or page_referenced_file. |
291 | */ | 291 | */ |
292 | static int page_referenced_one(struct page *page, | 292 | static int page_referenced_one(struct page *page, |
293 | struct vm_area_struct *vma, unsigned int *mapcount) | 293 | struct vm_area_struct *vma, unsigned int *mapcount) |
294 | { | 294 | { |
295 | struct mm_struct *mm = vma->vm_mm; | 295 | struct mm_struct *mm = vma->vm_mm; |
296 | unsigned long address; | 296 | unsigned long address; |
297 | pte_t *pte; | 297 | pte_t *pte; |
298 | spinlock_t *ptl; | 298 | spinlock_t *ptl; |
299 | int referenced = 0; | 299 | int referenced = 0; |
300 | 300 | ||
301 | address = vma_address(page, vma); | 301 | address = vma_address(page, vma); |
302 | if (address == -EFAULT) | 302 | if (address == -EFAULT) |
303 | goto out; | 303 | goto out; |
304 | 304 | ||
305 | pte = page_check_address(page, mm, address, &ptl); | 305 | pte = page_check_address(page, mm, address, &ptl); |
306 | if (!pte) | 306 | if (!pte) |
307 | goto out; | 307 | goto out; |
308 | 308 | ||
309 | if (ptep_clear_flush_young(vma, address, pte)) | 309 | if (ptep_clear_flush_young(vma, address, pte)) |
310 | referenced++; | 310 | referenced++; |
311 | 311 | ||
312 | /* Pretend the page is referenced if the task has the | 312 | /* Pretend the page is referenced if the task has the |
313 | swap token and is in the middle of a page fault. */ | 313 | swap token and is in the middle of a page fault. */ |
314 | if (mm != current->mm && has_swap_token(mm) && | 314 | if (mm != current->mm && has_swap_token(mm) && |
315 | rwsem_is_locked(&mm->mmap_sem)) | 315 | rwsem_is_locked(&mm->mmap_sem)) |
316 | referenced++; | 316 | referenced++; |
317 | 317 | ||
318 | (*mapcount)--; | 318 | (*mapcount)--; |
319 | pte_unmap_unlock(pte, ptl); | 319 | pte_unmap_unlock(pte, ptl); |
320 | out: | 320 | out: |
321 | return referenced; | 321 | return referenced; |
322 | } | 322 | } |
323 | 323 | ||
324 | static int page_referenced_anon(struct page *page) | 324 | static int page_referenced_anon(struct page *page) |
325 | { | 325 | { |
326 | unsigned int mapcount; | 326 | unsigned int mapcount; |
327 | struct anon_vma *anon_vma; | 327 | struct anon_vma *anon_vma; |
328 | struct vm_area_struct *vma; | 328 | struct vm_area_struct *vma; |
329 | int referenced = 0; | 329 | int referenced = 0; |
330 | 330 | ||
331 | anon_vma = page_lock_anon_vma(page); | 331 | anon_vma = page_lock_anon_vma(page); |
332 | if (!anon_vma) | 332 | if (!anon_vma) |
333 | return referenced; | 333 | return referenced; |
334 | 334 | ||
335 | mapcount = page_mapcount(page); | 335 | mapcount = page_mapcount(page); |
336 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 336 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
337 | referenced += page_referenced_one(page, vma, &mapcount); | 337 | referenced += page_referenced_one(page, vma, &mapcount); |
338 | if (!mapcount) | 338 | if (!mapcount) |
339 | break; | 339 | break; |
340 | } | 340 | } |
341 | spin_unlock(&anon_vma->lock); | 341 | spin_unlock(&anon_vma->lock); |
342 | return referenced; | 342 | return referenced; |
343 | } | 343 | } |
344 | 344 | ||
345 | /** | 345 | /** |
346 | * page_referenced_file - referenced check for object-based rmap | 346 | * page_referenced_file - referenced check for object-based rmap |
347 | * @page: the page we're checking references on. | 347 | * @page: the page we're checking references on. |
348 | * | 348 | * |
349 | * For an object-based mapped page, find all the places it is mapped and | 349 | * For an object-based mapped page, find all the places it is mapped and |
350 | * check/clear the referenced flag. This is done by following the page->mapping | 350 | * check/clear the referenced flag. This is done by following the page->mapping |
351 | * pointer, then walking the chain of vmas it holds. It returns the number | 351 | * pointer, then walking the chain of vmas it holds. It returns the number |
352 | * of references it found. | 352 | * of references it found. |
353 | * | 353 | * |
354 | * This function is only called from page_referenced for object-based pages. | 354 | * This function is only called from page_referenced for object-based pages. |
355 | */ | 355 | */ |
356 | static int page_referenced_file(struct page *page) | 356 | static int page_referenced_file(struct page *page) |
357 | { | 357 | { |
358 | unsigned int mapcount; | 358 | unsigned int mapcount; |
359 | struct address_space *mapping = page->mapping; | 359 | struct address_space *mapping = page->mapping; |
360 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 360 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
361 | struct vm_area_struct *vma; | 361 | struct vm_area_struct *vma; |
362 | struct prio_tree_iter iter; | 362 | struct prio_tree_iter iter; |
363 | int referenced = 0; | 363 | int referenced = 0; |
364 | 364 | ||
365 | /* | 365 | /* |
366 | * The caller's checks on page->mapping and !PageAnon have made | 366 | * The caller's checks on page->mapping and !PageAnon have made |
367 | * sure that this is a file page: the check for page->mapping | 367 | * sure that this is a file page: the check for page->mapping |
368 | * excludes the case just before it gets set on an anon page. | 368 | * excludes the case just before it gets set on an anon page. |
369 | */ | 369 | */ |
370 | BUG_ON(PageAnon(page)); | 370 | BUG_ON(PageAnon(page)); |
371 | 371 | ||
372 | /* | 372 | /* |
373 | * The page lock not only makes sure that page->mapping cannot | 373 | * The page lock not only makes sure that page->mapping cannot |
374 | * suddenly be NULLified by truncation, it makes sure that the | 374 | * suddenly be NULLified by truncation, it makes sure that the |
375 | * structure at mapping cannot be freed and reused yet, | 375 | * structure at mapping cannot be freed and reused yet, |
376 | * so we can safely take mapping->i_mmap_lock. | 376 | * so we can safely take mapping->i_mmap_lock. |
377 | */ | 377 | */ |
378 | BUG_ON(!PageLocked(page)); | 378 | BUG_ON(!PageLocked(page)); |
379 | 379 | ||
380 | spin_lock(&mapping->i_mmap_lock); | 380 | spin_lock(&mapping->i_mmap_lock); |
381 | 381 | ||
382 | /* | 382 | /* |
383 | * i_mmap_lock does not stabilize mapcount at all, but mapcount | 383 | * i_mmap_lock does not stabilize mapcount at all, but mapcount |
384 | * is more likely to be accurate if we note it after spinning. | 384 | * is more likely to be accurate if we note it after spinning. |
385 | */ | 385 | */ |
386 | mapcount = page_mapcount(page); | 386 | mapcount = page_mapcount(page); |
387 | 387 | ||
388 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 388 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
389 | if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) | 389 | if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) |
390 | == (VM_LOCKED|VM_MAYSHARE)) { | 390 | == (VM_LOCKED|VM_MAYSHARE)) { |
391 | referenced++; | 391 | referenced++; |
392 | break; | 392 | break; |
393 | } | 393 | } |
394 | referenced += page_referenced_one(page, vma, &mapcount); | 394 | referenced += page_referenced_one(page, vma, &mapcount); |
395 | if (!mapcount) | 395 | if (!mapcount) |
396 | break; | 396 | break; |
397 | } | 397 | } |
398 | 398 | ||
399 | spin_unlock(&mapping->i_mmap_lock); | 399 | spin_unlock(&mapping->i_mmap_lock); |
400 | return referenced; | 400 | return referenced; |
401 | } | 401 | } |
402 | 402 | ||
403 | /** | 403 | /** |
404 | * page_referenced - test if the page was referenced | 404 | * page_referenced - test if the page was referenced |
405 | * @page: the page to test | 405 | * @page: the page to test |
406 | * @is_locked: caller holds lock on the page | 406 | * @is_locked: caller holds lock on the page |
407 | * | 407 | * |
408 | * Quick test_and_clear_referenced for all mappings to a page, | 408 | * Quick test_and_clear_referenced for all mappings to a page, |
409 | * returns the number of ptes which referenced the page. | 409 | * returns the number of ptes which referenced the page. |
410 | */ | 410 | */ |
411 | int page_referenced(struct page *page, int is_locked) | 411 | int page_referenced(struct page *page, int is_locked) |
412 | { | 412 | { |
413 | int referenced = 0; | 413 | int referenced = 0; |
414 | 414 | ||
415 | if (page_test_and_clear_young(page)) | 415 | if (page_test_and_clear_young(page)) |
416 | referenced++; | 416 | referenced++; |
417 | 417 | ||
418 | if (TestClearPageReferenced(page)) | 418 | if (TestClearPageReferenced(page)) |
419 | referenced++; | 419 | referenced++; |
420 | 420 | ||
421 | if (page_mapped(page) && page->mapping) { | 421 | if (page_mapped(page) && page->mapping) { |
422 | if (PageAnon(page)) | 422 | if (PageAnon(page)) |
423 | referenced += page_referenced_anon(page); | 423 | referenced += page_referenced_anon(page); |
424 | else if (is_locked) | 424 | else if (is_locked) |
425 | referenced += page_referenced_file(page); | 425 | referenced += page_referenced_file(page); |
426 | else if (TestSetPageLocked(page)) | 426 | else if (TestSetPageLocked(page)) |
427 | referenced++; | 427 | referenced++; |
428 | else { | 428 | else { |
429 | if (page->mapping) | 429 | if (page->mapping) |
430 | referenced += page_referenced_file(page); | 430 | referenced += page_referenced_file(page); |
431 | unlock_page(page); | 431 | unlock_page(page); |
432 | } | 432 | } |
433 | } | 433 | } |
434 | return referenced; | 434 | return referenced; |
435 | } | 435 | } |
436 | 436 | ||
437 | /** | 437 | /** |
438 | * page_set_anon_rmap - setup new anonymous rmap | 438 | * page_set_anon_rmap - setup new anonymous rmap |
439 | * @page: the page to add the mapping to | 439 | * @page: the page to add the mapping to |
440 | * @vma: the vm area in which the mapping is added | 440 | * @vma: the vm area in which the mapping is added |
441 | * @address: the user virtual address mapped | 441 | * @address: the user virtual address mapped |
442 | */ | 442 | */ |
443 | static void __page_set_anon_rmap(struct page *page, | 443 | static void __page_set_anon_rmap(struct page *page, |
444 | struct vm_area_struct *vma, unsigned long address) | 444 | struct vm_area_struct *vma, unsigned long address) |
445 | { | 445 | { |
446 | struct anon_vma *anon_vma = vma->anon_vma; | 446 | struct anon_vma *anon_vma = vma->anon_vma; |
447 | 447 | ||
448 | BUG_ON(!anon_vma); | 448 | BUG_ON(!anon_vma); |
449 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 449 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
450 | page->mapping = (struct address_space *) anon_vma; | 450 | page->mapping = (struct address_space *) anon_vma; |
451 | 451 | ||
452 | page->index = linear_page_index(vma, address); | 452 | page->index = linear_page_index(vma, address); |
453 | 453 | ||
454 | /* | 454 | /* |
455 | * nr_mapped state can be updated without turning off | 455 | * nr_mapped state can be updated without turning off |
456 | * interrupts because it is never modified from interrupt context. | 456 | * interrupts because it is never modified from interrupt context. |
457 | */ | 457 | */ |
458 | __inc_page_state(nr_mapped); | 458 | __inc_page_state(nr_mapped); |
459 | } | 459 | } |
460 | 460 | ||
461 | /** | 461 | /** |
462 | * page_add_anon_rmap - add pte mapping to an anonymous page | 462 | * page_add_anon_rmap - add pte mapping to an anonymous page |
463 | * @page: the page to add the mapping to | 463 | * @page: the page to add the mapping to |
464 | * @vma: the vm area in which the mapping is added | 464 | * @vma: the vm area in which the mapping is added |
465 | * @address: the user virtual address mapped | 465 | * @address: the user virtual address mapped |
466 | * | 466 | * |
467 | * The caller needs to hold the pte lock. | 467 | * The caller needs to hold the pte lock. |
468 | */ | 468 | */ |
469 | void page_add_anon_rmap(struct page *page, | 469 | void page_add_anon_rmap(struct page *page, |
470 | struct vm_area_struct *vma, unsigned long address) | 470 | struct vm_area_struct *vma, unsigned long address) |
471 | { | 471 | { |
472 | if (atomic_inc_and_test(&page->_mapcount)) | 472 | if (atomic_inc_and_test(&page->_mapcount)) |
473 | __page_set_anon_rmap(page, vma, address); | 473 | __page_set_anon_rmap(page, vma, address); |
474 | /* else checking page index and mapping is racy */ | 474 | /* else checking page index and mapping is racy */ |
475 | } | 475 | } |
476 | 476 | ||
477 | /* | 477 | /* |
478 | * page_add_new_anon_rmap - add pte mapping to a new anonymous page | 478 | * page_add_new_anon_rmap - add pte mapping to a new anonymous page |
479 | * @page: the page to add the mapping to | 479 | * @page: the page to add the mapping to |
480 | * @vma: the vm area in which the mapping is added | 480 | * @vma: the vm area in which the mapping is added |
481 | * @address: the user virtual address mapped | 481 | * @address: the user virtual address mapped |
482 | * | 482 | * |
483 | * Same as page_add_anon_rmap but must only be called on *new* pages. | 483 | * Same as page_add_anon_rmap but must only be called on *new* pages. |
484 | * This means the inc-and-test can be bypassed. | 484 | * This means the inc-and-test can be bypassed. |
485 | */ | 485 | */ |
486 | void page_add_new_anon_rmap(struct page *page, | 486 | void page_add_new_anon_rmap(struct page *page, |
487 | struct vm_area_struct *vma, unsigned long address) | 487 | struct vm_area_struct *vma, unsigned long address) |
488 | { | 488 | { |
489 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ | 489 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ |
490 | __page_set_anon_rmap(page, vma, address); | 490 | __page_set_anon_rmap(page, vma, address); |
491 | } | 491 | } |
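Illustrative note (not part of the patch): the inc-and-test idiom used by the rmap-add helpers above relies on _mapcount starting at -1, so only the very first pte mapping a page triggers the extra work; brand-new anonymous pages can therefore set the count to 0 directly.

        /* Worked example of the _mapcount idiom: */
        atomic_set(&page->_mapcount, -1);       /* a free page has no mappings */
        atomic_inc_and_test(&page->_mapcount);  /* -1 -> 0: true, first mapping */
        atomic_inc_and_test(&page->_mapcount);  /*  0 -> 1: false, nothing extra */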
492 | 492 | ||
493 | /** | 493 | /** |
494 | * page_add_file_rmap - add pte mapping to a file page | 494 | * page_add_file_rmap - add pte mapping to a file page |
495 | * @page: the page to add the mapping to | 495 | * @page: the page to add the mapping to |
496 | * | 496 | * |
497 | * The caller needs to hold the pte lock. | 497 | * The caller needs to hold the pte lock. |
498 | */ | 498 | */ |
499 | void page_add_file_rmap(struct page *page) | 499 | void page_add_file_rmap(struct page *page) |
500 | { | 500 | { |
501 | if (atomic_inc_and_test(&page->_mapcount)) | 501 | if (atomic_inc_and_test(&page->_mapcount)) |
502 | __inc_page_state(nr_mapped); | 502 | __inc_page_state(nr_mapped); |
503 | } | 503 | } |
504 | 504 | ||
505 | /** | 505 | /** |
506 | * page_remove_rmap - take down pte mapping from a page | 506 | * page_remove_rmap - take down pte mapping from a page |
507 | * @page: page to remove mapping from | 507 | * @page: page to remove mapping from |
508 | * | 508 | * |
509 | * The caller needs to hold the pte lock. | 509 | * The caller needs to hold the pte lock. |
510 | */ | 510 | */ |
511 | void page_remove_rmap(struct page *page) | 511 | void page_remove_rmap(struct page *page) |
512 | { | 512 | { |
513 | if (atomic_add_negative(-1, &page->_mapcount)) { | 513 | if (atomic_add_negative(-1, &page->_mapcount)) { |
514 | #ifdef CONFIG_DEBUG_VM | 514 | #ifdef CONFIG_DEBUG_VM |
515 | if (unlikely(page_mapcount(page) < 0)) { | 515 | if (unlikely(page_mapcount(page) < 0)) { |
516 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | 516 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); |
517 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | 517 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); |
518 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | 518 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); |
519 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | 519 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); |
520 | } | 520 | } |
521 | #endif | 521 | #endif |
522 | BUG_ON(page_mapcount(page) < 0); | 522 | BUG_ON(page_mapcount(page) < 0); |
523 | /* | 523 | /* |
524 | * It would be tidy to reset the PageAnon mapping here, | 524 | * It would be tidy to reset the PageAnon mapping here, |
525 | * but that might overwrite a racing page_add_anon_rmap | 525 | * but that might overwrite a racing page_add_anon_rmap |
526 | * which increments mapcount after us but sets mapping | 526 | * which increments mapcount after us but sets mapping |
527 | * before us: so leave the reset to free_hot_cold_page, | 527 | * before us: so leave the reset to free_hot_cold_page, |
528 | * and remember that it's only reliable while mapped. | 528 | * and remember that it's only reliable while mapped. |
529 | * Leaving it set also helps swapoff to reinstate ptes | 529 | * Leaving it set also helps swapoff to reinstate ptes |
530 | * faster for those pages still in swapcache. | 530 | * faster for those pages still in swapcache. |
531 | */ | 531 | */ |
532 | if (page_test_and_clear_dirty(page)) | 532 | if (page_test_and_clear_dirty(page)) |
533 | set_page_dirty(page); | 533 | set_page_dirty(page); |
534 | __dec_page_state(nr_mapped); | 534 | __dec_page_state(nr_mapped); |
535 | } | 535 | } |
536 | } | 536 | } |
537 | 537 | ||
538 | /* | 538 | /* |
539 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 539 | * Subfunctions of try_to_unmap: try_to_unmap_one called |
540 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 540 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
541 | */ | 541 | */ |
542 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 542 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
543 | int migration) | 543 | int migration) |
544 | { | 544 | { |
545 | struct mm_struct *mm = vma->vm_mm; | 545 | struct mm_struct *mm = vma->vm_mm; |
546 | unsigned long address; | 546 | unsigned long address; |
547 | pte_t *pte; | 547 | pte_t *pte; |
548 | pte_t pteval; | 548 | pte_t pteval; |
549 | spinlock_t *ptl; | 549 | spinlock_t *ptl; |
550 | int ret = SWAP_AGAIN; | 550 | int ret = SWAP_AGAIN; |
551 | 551 | ||
552 | address = vma_address(page, vma); | 552 | address = vma_address(page, vma); |
553 | if (address == -EFAULT) | 553 | if (address == -EFAULT) |
554 | goto out; | 554 | goto out; |
555 | 555 | ||
556 | pte = page_check_address(page, mm, address, &ptl); | 556 | pte = page_check_address(page, mm, address, &ptl); |
557 | if (!pte) | 557 | if (!pte) |
558 | goto out; | 558 | goto out; |
559 | 559 | ||
560 | /* | 560 | /* |
561 | * If the page is mlock()d, we cannot swap it out. | 561 | * If the page is mlock()d, we cannot swap it out. |
562 | * If it's recently referenced (perhaps page_referenced | 562 | * If it's recently referenced (perhaps page_referenced |
563 | * skipped over this mm) then we should reactivate it. | 563 | * skipped over this mm) then we should reactivate it. |
564 | */ | 564 | */ |
565 | if ((vma->vm_flags & VM_LOCKED) || | 565 | if ((vma->vm_flags & VM_LOCKED) || |
566 | (ptep_clear_flush_young(vma, address, pte) | 566 | (ptep_clear_flush_young(vma, address, pte) |
567 | && !migration)) { | 567 | && !migration)) { |
568 | ret = SWAP_FAIL; | 568 | ret = SWAP_FAIL; |
569 | goto out_unmap; | 569 | goto out_unmap; |
570 | } | 570 | } |
571 | 571 | ||
572 | /* Nuke the page table entry. */ | 572 | /* Nuke the page table entry. */ |
573 | flush_cache_page(vma, address, page_to_pfn(page)); | 573 | flush_cache_page(vma, address, page_to_pfn(page)); |
574 | pteval = ptep_clear_flush(vma, address, pte); | 574 | pteval = ptep_clear_flush(vma, address, pte); |
575 | 575 | ||
576 | /* Move the dirty bit to the physical page now the pte is gone. */ | 576 | /* Move the dirty bit to the physical page now the pte is gone. */ |
577 | if (pte_dirty(pteval)) | 577 | if (pte_dirty(pteval)) |
578 | set_page_dirty(page); | 578 | set_page_dirty(page); |
579 | 579 | ||
580 | /* Update high watermark before we lower rss */ | 580 | /* Update high watermark before we lower rss */ |
581 | update_hiwater_rss(mm); | 581 | update_hiwater_rss(mm); |
582 | 582 | ||
583 | if (PageAnon(page)) { | 583 | if (PageAnon(page)) { |
584 | swp_entry_t entry = { .val = page_private(page) }; | 584 | swp_entry_t entry = { .val = page_private(page) }; |
585 | 585 | ||
586 | if (PageSwapCache(page)) { | 586 | if (PageSwapCache(page)) { |
587 | /* | 587 | /* |
588 | * Store the swap location in the pte. | 588 | * Store the swap location in the pte. |
589 | * See handle_pte_fault() ... | 589 | * See handle_pte_fault() ... |
590 | */ | 590 | */ |
591 | swap_duplicate(entry); | 591 | swap_duplicate(entry); |
592 | if (list_empty(&mm->mmlist)) { | 592 | if (list_empty(&mm->mmlist)) { |
593 | spin_lock(&mmlist_lock); | 593 | spin_lock(&mmlist_lock); |
594 | if (list_empty(&mm->mmlist)) | 594 | if (list_empty(&mm->mmlist)) |
595 | list_add(&mm->mmlist, &init_mm.mmlist); | 595 | list_add(&mm->mmlist, &init_mm.mmlist); |
596 | spin_unlock(&mmlist_lock); | 596 | spin_unlock(&mmlist_lock); |
597 | } | 597 | } |
598 | dec_mm_counter(mm, anon_rss); | 598 | dec_mm_counter(mm, anon_rss); |
599 | #ifdef CONFIG_MIGRATION | ||
599 | } else { | 600 | } else { |
600 | /* | 601 | /* |
601 | * Store the pfn of the page in a special migration | 602 | * Store the pfn of the page in a special migration |
602 | * pte. do_swap_page() will wait until the migration | 603 | * pte. do_swap_page() will wait until the migration |
603 | * pte is removed and then restart fault handling. | 604 | * pte is removed and then restart fault handling. |
604 | */ | 605 | */ |
605 | BUG_ON(!migration); | 606 | BUG_ON(!migration); |
606 | entry = make_migration_entry(page, pte_write(pteval)); | 607 | entry = make_migration_entry(page, pte_write(pteval)); |
608 | #endif | ||
607 | } | 609 | } |
608 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 610 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
609 | BUG_ON(pte_file(*pte)); | 611 | BUG_ON(pte_file(*pte)); |
610 | } else | 612 | } else |
613 | #ifdef CONFIG_MIGRATION | ||
614 | if (migration) { | ||
615 | /* Establish migration entry for a file page */ | ||
616 | swp_entry_t entry; | ||
617 | entry = make_migration_entry(page, pte_write(pteval)); | ||
618 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | ||
619 | } else | ||
620 | #endif | ||
611 | dec_mm_counter(mm, file_rss); | 621 | dec_mm_counter(mm, file_rss); |
622 | |||
612 | 623 | ||
613 | page_remove_rmap(page); | 624 | page_remove_rmap(page); |
614 | page_cache_release(page); | 625 | page_cache_release(page); |
615 | 626 | ||
616 | out_unmap: | 627 | out_unmap: |
617 | pte_unmap_unlock(pte, ptl); | 628 | pte_unmap_unlock(pte, ptl); |
618 | out: | 629 | out: |
619 | return ret; | 630 | return ret; |
620 | } | 631 | } |
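Illustrative sketch of what the new file-page branch above leaves behind: the pte becomes a non-present migration entry that remembers the pfn and the write permission. A fault on it waits in do_swap_page() until remove_migration_ptes() installs a pte for the new page, which migrate_pages() does once the copy has succeeded.

        swp_entry_t entry = make_migration_entry(page, pte_write(pteval));
        set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
        /* ... later, after the page has been migrated: */
        remove_migration_ptes(page, newpage);   /* replaces the entry with a real pte */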
621 | 632 | ||
622 | /* | 633 | /* |
623 | * objrmap doesn't work for nonlinear VMAs because the assumption that | 634 | * objrmap doesn't work for nonlinear VMAs because the assumption that |
624 | * offset-into-file correlates with offset-into-virtual-addresses does not hold. | 635 | * offset-into-file correlates with offset-into-virtual-addresses does not hold. |
625 | * Consequently, given a particular page and its ->index, we cannot locate the | 636 | * Consequently, given a particular page and its ->index, we cannot locate the |
626 | * ptes which are mapping that page without an exhaustive linear search. | 637 | * ptes which are mapping that page without an exhaustive linear search. |
627 | * | 638 | * |
628 | * So what this code does is a mini "virtual scan" of each nonlinear VMA which | 639 | * So what this code does is a mini "virtual scan" of each nonlinear VMA which |
629 | * maps the file to which the target page belongs. The ->vm_private_data field | 640 | * maps the file to which the target page belongs. The ->vm_private_data field |
630 | * holds the current cursor into that scan. Successive searches will circulate | 641 | * holds the current cursor into that scan. Successive searches will circulate |
631 | * around the vma's virtual address space. | 642 | * around the vma's virtual address space. |
632 | * | 643 | * |
633 | * So as more replacement pressure is applied to the pages in a nonlinear VMA, | 644 | * So as more replacement pressure is applied to the pages in a nonlinear VMA, |
634 | * more scanning pressure is placed against them as well. Eventually pages | 645 | * more scanning pressure is placed against them as well. Eventually pages |
635 | * will become fully unmapped and are eligible for eviction. | 646 | * will become fully unmapped and are eligible for eviction. |
636 | * | 647 | * |
637 | * For very sparsely populated VMAs this is a little inefficient - chances are | 648 | * For very sparsely populated VMAs this is a little inefficient - chances are |
638 | * there won't be many ptes located within the scan cluster. In this case | 649 | * there won't be many ptes located within the scan cluster. In this case |
639 | * maybe we could scan further - to the end of the pte page, perhaps. | 650 | * maybe we could scan further - to the end of the pte page, perhaps. |
640 | */ | 651 | */ |
641 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) | 652 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) |
642 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) | 653 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) |
643 | 654 | ||
644 | static void try_to_unmap_cluster(unsigned long cursor, | 655 | static void try_to_unmap_cluster(unsigned long cursor, |
645 | unsigned int *mapcount, struct vm_area_struct *vma) | 656 | unsigned int *mapcount, struct vm_area_struct *vma) |
646 | { | 657 | { |
647 | struct mm_struct *mm = vma->vm_mm; | 658 | struct mm_struct *mm = vma->vm_mm; |
648 | pgd_t *pgd; | 659 | pgd_t *pgd; |
649 | pud_t *pud; | 660 | pud_t *pud; |
650 | pmd_t *pmd; | 661 | pmd_t *pmd; |
651 | pte_t *pte; | 662 | pte_t *pte; |
652 | pte_t pteval; | 663 | pte_t pteval; |
653 | spinlock_t *ptl; | 664 | spinlock_t *ptl; |
654 | struct page *page; | 665 | struct page *page; |
655 | unsigned long address; | 666 | unsigned long address; |
656 | unsigned long end; | 667 | unsigned long end; |
657 | 668 | ||
658 | address = (vma->vm_start + cursor) & CLUSTER_MASK; | 669 | address = (vma->vm_start + cursor) & CLUSTER_MASK; |
659 | end = address + CLUSTER_SIZE; | 670 | end = address + CLUSTER_SIZE; |
660 | if (address < vma->vm_start) | 671 | if (address < vma->vm_start) |
661 | address = vma->vm_start; | 672 | address = vma->vm_start; |
662 | if (end > vma->vm_end) | 673 | if (end > vma->vm_end) |
663 | end = vma->vm_end; | 674 | end = vma->vm_end; |
664 | 675 | ||
665 | pgd = pgd_offset(mm, address); | 676 | pgd = pgd_offset(mm, address); |
666 | if (!pgd_present(*pgd)) | 677 | if (!pgd_present(*pgd)) |
667 | return; | 678 | return; |
668 | 679 | ||
669 | pud = pud_offset(pgd, address); | 680 | pud = pud_offset(pgd, address); |
670 | if (!pud_present(*pud)) | 681 | if (!pud_present(*pud)) |
671 | return; | 682 | return; |
672 | 683 | ||
673 | pmd = pmd_offset(pud, address); | 684 | pmd = pmd_offset(pud, address); |
674 | if (!pmd_present(*pmd)) | 685 | if (!pmd_present(*pmd)) |
675 | return; | 686 | return; |
676 | 687 | ||
677 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 688 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
678 | 689 | ||
679 | /* Update high watermark before we lower rss */ | 690 | /* Update high watermark before we lower rss */ |
680 | update_hiwater_rss(mm); | 691 | update_hiwater_rss(mm); |
681 | 692 | ||
682 | for (; address < end; pte++, address += PAGE_SIZE) { | 693 | for (; address < end; pte++, address += PAGE_SIZE) { |
683 | if (!pte_present(*pte)) | 694 | if (!pte_present(*pte)) |
684 | continue; | 695 | continue; |
685 | page = vm_normal_page(vma, address, *pte); | 696 | page = vm_normal_page(vma, address, *pte); |
686 | BUG_ON(!page || PageAnon(page)); | 697 | BUG_ON(!page || PageAnon(page)); |
687 | 698 | ||
688 | if (ptep_clear_flush_young(vma, address, pte)) | 699 | if (ptep_clear_flush_young(vma, address, pte)) |
689 | continue; | 700 | continue; |
690 | 701 | ||
691 | /* Nuke the page table entry. */ | 702 | /* Nuke the page table entry. */ |
692 | flush_cache_page(vma, address, pte_pfn(*pte)); | 703 | flush_cache_page(vma, address, pte_pfn(*pte)); |
693 | pteval = ptep_clear_flush(vma, address, pte); | 704 | pteval = ptep_clear_flush(vma, address, pte); |
694 | 705 | ||
695 | /* If nonlinear, store the file page offset in the pte. */ | 706 | /* If nonlinear, store the file page offset in the pte. */ |
696 | if (page->index != linear_page_index(vma, address)) | 707 | if (page->index != linear_page_index(vma, address)) |
697 | set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); | 708 | set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); |
698 | 709 | ||
699 | /* Move the dirty bit to the physical page now the pte is gone. */ | 710 | /* Move the dirty bit to the physical page now the pte is gone. */ |
700 | if (pte_dirty(pteval)) | 711 | if (pte_dirty(pteval)) |
701 | set_page_dirty(page); | 712 | set_page_dirty(page); |
702 | 713 | ||
703 | page_remove_rmap(page); | 714 | page_remove_rmap(page); |
704 | page_cache_release(page); | 715 | page_cache_release(page); |
705 | dec_mm_counter(mm, file_rss); | 716 | dec_mm_counter(mm, file_rss); |
706 | (*mapcount)--; | 717 | (*mapcount)--; |
707 | } | 718 | } |
708 | pte_unmap_unlock(pte - 1, ptl); | 719 | pte_unmap_unlock(pte - 1, ptl); |
709 | } | 720 | } |
710 | 721 | ||
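The cluster walk above always operates on a CLUSTER_SIZE-aligned window that is then clamped to the VMA boundaries. A minimal userspace sketch of that address arithmetic follows; the PAGE_SIZE, PMD_SIZE and VMA values are invented for illustration and are not part of the patch.

/* Illustrative sketch (not kernel code): how try_to_unmap_cluster() derives
 * its scan window, assuming PAGE_SIZE = 4 KiB and PMD_SIZE = 4 MiB, so
 * CLUSTER_SIZE = min(32 * PAGE_SIZE, PMD_SIZE) = 128 KiB. */
#include <stdio.h>

#define PAGE_SIZE     4096UL
#define PMD_SIZE      (4UL << 20)
#define CLUSTER_SIZE  (32 * PAGE_SIZE < PMD_SIZE ? 32 * PAGE_SIZE : PMD_SIZE)
#define CLUSTER_MASK  (~(CLUSTER_SIZE - 1))

int main(void)
{
        unsigned long vm_start = 0x40012000UL;  /* hypothetical VMA bounds */
        unsigned long vm_end   = 0x400a0000UL;
        unsigned long cursor   = 0x23000UL;     /* per-VMA scan cursor */

        /* align down to a cluster boundary, then clamp to the VMA */
        unsigned long address = (vm_start + cursor) & CLUSTER_MASK;
        unsigned long end     = address + CLUSTER_SIZE;

        if (address < vm_start)
                address = vm_start;
        if (end > vm_end)
                end = vm_end;

        printf("scan window: %#lx-%#lx (%lu pages)\n",
               address, end, (end - address) / PAGE_SIZE);
        return 0;
}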
711 | static int try_to_unmap_anon(struct page *page, int migration) | 722 | static int try_to_unmap_anon(struct page *page, int migration) |
712 | { | 723 | { |
713 | struct anon_vma *anon_vma; | 724 | struct anon_vma *anon_vma; |
714 | struct vm_area_struct *vma; | 725 | struct vm_area_struct *vma; |
715 | int ret = SWAP_AGAIN; | 726 | int ret = SWAP_AGAIN; |
716 | 727 | ||
717 | anon_vma = page_lock_anon_vma(page); | 728 | anon_vma = page_lock_anon_vma(page); |
718 | if (!anon_vma) | 729 | if (!anon_vma) |
719 | return ret; | 730 | return ret; |
720 | 731 | ||
721 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 732 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
722 | ret = try_to_unmap_one(page, vma, migration); | 733 | ret = try_to_unmap_one(page, vma, migration); |
723 | if (ret == SWAP_FAIL || !page_mapped(page)) | 734 | if (ret == SWAP_FAIL || !page_mapped(page)) |
724 | break; | 735 | break; |
725 | } | 736 | } |
726 | spin_unlock(&anon_vma->lock); | 737 | spin_unlock(&anon_vma->lock); |
727 | return ret; | 738 | return ret; |
728 | } | 739 | } |
729 | 740 | ||
730 | /** | 741 | /** |
731 | * try_to_unmap_file - unmap file page using the object-based rmap method | 742 | * try_to_unmap_file - unmap file page using the object-based rmap method |
732 | * @page: the page to unmap | 743 | * @page: the page to unmap |
733 | * | 744 | * |
734 | * Find all the mappings of a page using the mapping pointer and the vma chains | 745 | * Find all the mappings of a page using the mapping pointer and the vma chains |
735 | * contained in the address_space struct it points to. | 746 | * contained in the address_space struct it points to. |
736 | * | 747 | * |
737 | * This function is only called from try_to_unmap for object-based pages. | 748 | * This function is only called from try_to_unmap for object-based pages. |
738 | */ | 749 | */ |
739 | static int try_to_unmap_file(struct page *page, int migration) | 750 | static int try_to_unmap_file(struct page *page, int migration) |
740 | { | 751 | { |
741 | struct address_space *mapping = page->mapping; | 752 | struct address_space *mapping = page->mapping; |
742 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 753 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
743 | struct vm_area_struct *vma; | 754 | struct vm_area_struct *vma; |
744 | struct prio_tree_iter iter; | 755 | struct prio_tree_iter iter; |
745 | int ret = SWAP_AGAIN; | 756 | int ret = SWAP_AGAIN; |
746 | unsigned long cursor; | 757 | unsigned long cursor; |
747 | unsigned long max_nl_cursor = 0; | 758 | unsigned long max_nl_cursor = 0; |
748 | unsigned long max_nl_size = 0; | 759 | unsigned long max_nl_size = 0; |
749 | unsigned int mapcount; | 760 | unsigned int mapcount; |
750 | 761 | ||
751 | spin_lock(&mapping->i_mmap_lock); | 762 | spin_lock(&mapping->i_mmap_lock); |
752 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 763 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
753 | ret = try_to_unmap_one(page, vma, migration); | 764 | ret = try_to_unmap_one(page, vma, migration); |
754 | if (ret == SWAP_FAIL || !page_mapped(page)) | 765 | if (ret == SWAP_FAIL || !page_mapped(page)) |
755 | goto out; | 766 | goto out; |
756 | } | 767 | } |
757 | 768 | ||
758 | if (list_empty(&mapping->i_mmap_nonlinear)) | 769 | if (list_empty(&mapping->i_mmap_nonlinear)) |
759 | goto out; | 770 | goto out; |
760 | 771 | ||
761 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 772 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
762 | shared.vm_set.list) { | 773 | shared.vm_set.list) { |
763 | if (vma->vm_flags & VM_LOCKED) | 774 | if (vma->vm_flags & VM_LOCKED) |
764 | continue; | 775 | continue; |
765 | cursor = (unsigned long) vma->vm_private_data; | 776 | cursor = (unsigned long) vma->vm_private_data; |
766 | if (cursor > max_nl_cursor) | 777 | if (cursor > max_nl_cursor) |
767 | max_nl_cursor = cursor; | 778 | max_nl_cursor = cursor; |
768 | cursor = vma->vm_end - vma->vm_start; | 779 | cursor = vma->vm_end - vma->vm_start; |
769 | if (cursor > max_nl_size) | 780 | if (cursor > max_nl_size) |
770 | max_nl_size = cursor; | 781 | max_nl_size = cursor; |
771 | } | 782 | } |
772 | 783 | ||
773 | if (max_nl_size == 0) { /* any nonlinears locked or reserved */ | 784 | if (max_nl_size == 0) { /* any nonlinears locked or reserved */ |
774 | ret = SWAP_FAIL; | 785 | ret = SWAP_FAIL; |
775 | goto out; | 786 | goto out; |
776 | } | 787 | } |
777 | 788 | ||
778 | /* | 789 | /* |
779 | * We don't try to search for this page in the nonlinear vmas, | 790 | * We don't try to search for this page in the nonlinear vmas, |
780 | * and page_referenced wouldn't have found it anyway. Instead | 791 | * and page_referenced wouldn't have found it anyway. Instead |
781 | * just walk the nonlinear vmas trying to age and unmap some. | 792 | * just walk the nonlinear vmas trying to age and unmap some. |
782 | * The mapcount of the page we came in with is irrelevant, | 793 | * The mapcount of the page we came in with is irrelevant, |
783 | * but even so use it as a guide to how hard we should try? | 794 | * but even so use it as a guide to how hard we should try? |
784 | */ | 795 | */ |
785 | mapcount = page_mapcount(page); | 796 | mapcount = page_mapcount(page); |
786 | if (!mapcount) | 797 | if (!mapcount) |
787 | goto out; | 798 | goto out; |
788 | cond_resched_lock(&mapping->i_mmap_lock); | 799 | cond_resched_lock(&mapping->i_mmap_lock); |
789 | 800 | ||
790 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | 801 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; |
791 | if (max_nl_cursor == 0) | 802 | if (max_nl_cursor == 0) |
792 | max_nl_cursor = CLUSTER_SIZE; | 803 | max_nl_cursor = CLUSTER_SIZE; |
793 | 804 | ||
794 | do { | 805 | do { |
795 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 806 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
796 | shared.vm_set.list) { | 807 | shared.vm_set.list) { |
797 | if (vma->vm_flags & VM_LOCKED) | 808 | if (vma->vm_flags & VM_LOCKED) |
798 | continue; | 809 | continue; |
799 | cursor = (unsigned long) vma->vm_private_data; | 810 | cursor = (unsigned long) vma->vm_private_data; |
800 | while ( cursor < max_nl_cursor && | 811 | while ( cursor < max_nl_cursor && |
801 | cursor < vma->vm_end - vma->vm_start) { | 812 | cursor < vma->vm_end - vma->vm_start) { |
802 | try_to_unmap_cluster(cursor, &mapcount, vma); | 813 | try_to_unmap_cluster(cursor, &mapcount, vma); |
803 | cursor += CLUSTER_SIZE; | 814 | cursor += CLUSTER_SIZE; |
804 | vma->vm_private_data = (void *) cursor; | 815 | vma->vm_private_data = (void *) cursor; |
805 | if ((int)mapcount <= 0) | 816 | if ((int)mapcount <= 0) |
806 | goto out; | 817 | goto out; |
807 | } | 818 | } |
808 | vma->vm_private_data = (void *) max_nl_cursor; | 819 | vma->vm_private_data = (void *) max_nl_cursor; |
809 | } | 820 | } |
810 | cond_resched_lock(&mapping->i_mmap_lock); | 821 | cond_resched_lock(&mapping->i_mmap_lock); |
811 | max_nl_cursor += CLUSTER_SIZE; | 822 | max_nl_cursor += CLUSTER_SIZE; |
812 | } while (max_nl_cursor <= max_nl_size); | 823 | } while (max_nl_cursor <= max_nl_size); |
813 | 824 | ||
814 | /* | 825 | /* |
815 | * Don't loop forever (perhaps all the remaining pages are | 826 | * Don't loop forever (perhaps all the remaining pages are |
816 | * in locked vmas). Reset cursor on all unreserved nonlinear | 827 | * in locked vmas). Reset cursor on all unreserved nonlinear |
817 | * vmas, now forgetting on which ones it had fallen behind. | 828 | * vmas, now forgetting on which ones it had fallen behind. |
818 | */ | 829 | */ |
819 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | 830 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) |
820 | vma->vm_private_data = NULL; | 831 | vma->vm_private_data = NULL; |
821 | out: | 832 | out: |
822 | spin_unlock(&mapping->i_mmap_lock); | 833 | spin_unlock(&mapping->i_mmap_lock); |
823 | return ret; | 834 | return ret; |
824 | } | 835 | } |
825 | 836 | ||
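Before the nonlinear sweep above, max_nl_size is rounded up to a whole number of clusters and the cursor of every unlocked nonlinear VMA is advanced one cluster per pass. A short sketch of that round-up, reusing the assumed 128 KiB CLUSTER_SIZE from the earlier sketch:

/* Illustrative round-up used by try_to_unmap_file() for the nonlinear pass
 * (userspace sketch, invented sizes). */
#include <stdio.h>

#define CLUSTER_SIZE  (128UL * 1024)            /* assumed, as above */
#define CLUSTER_MASK  (~(CLUSTER_SIZE - 1))

int main(void)
{
        unsigned long max_nl_size = 300UL * 1024;  /* largest nonlinear VMA */

        /* round up to a whole number of clusters: 300 KiB -> 384 KiB */
        max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;

        printf("sweep limit: %lu KiB = %lu clusters\n",
               max_nl_size / 1024, max_nl_size / CLUSTER_SIZE);
        return 0;
}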
826 | /** | 837 | /** |
827 | * try_to_unmap - try to remove all page table mappings to a page | 838 | * try_to_unmap - try to remove all page table mappings to a page |
828 | * @page: the page to get unmapped | 839 | * @page: the page to get unmapped |
829 | * | 840 | * |
830 | * Tries to remove all the page table entries which are mapping this | 841 | * Tries to remove all the page table entries which are mapping this |
831 | * page, used in the pageout path. Caller must hold the page lock. | 842 | * page, used in the pageout path. Caller must hold the page lock. |
832 | * Return values are: | 843 | * Return values are: |
833 | * | 844 | * |
834 | * SWAP_SUCCESS - we succeeded in removing all mappings | 845 | * SWAP_SUCCESS - we succeeded in removing all mappings |
835 | * SWAP_AGAIN - we missed a mapping, try again later | 846 | * SWAP_AGAIN - we missed a mapping, try again later |
836 | * SWAP_FAIL - the page is unswappable | 847 | * SWAP_FAIL - the page is unswappable |
837 | */ | 848 | */ |
838 | int try_to_unmap(struct page *page, int migration) | 849 | int try_to_unmap(struct page *page, int migration) |
839 | { | 850 | { |
840 | int ret; | 851 | int ret; |
841 | 852 | ||
842 | BUG_ON(!PageLocked(page)); | 853 | BUG_ON(!PageLocked(page)); |
843 | 854 | ||
844 | if (PageAnon(page)) | 855 | if (PageAnon(page)) |
845 | ret = try_to_unmap_anon(page, migration); | 856 | ret = try_to_unmap_anon(page, migration); |
846 | else | 857 | else |
847 | ret = try_to_unmap_file(page, migration); | 858 | ret = try_to_unmap_file(page, migration); |
848 | 859 | ||
849 | if (!page_mapped(page)) | 860 | if (!page_mapped(page)) |
850 | ret = SWAP_SUCCESS; | 861 | ret = SWAP_SUCCESS; |
851 | return ret; | 862 | return ret; |
852 | } | 863 | } |
853 | 864 | ||
854 | 865 |
mm/vmscan.c
1 | /* | 1 | /* |
2 | * linux/mm/vmscan.c | 2 | * linux/mm/vmscan.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
5 | * | 5 | * |
6 | * Swap reorganised 29.12.95, Stephen Tweedie. | 6 | * Swap reorganised 29.12.95, Stephen Tweedie. |
7 | * kswapd added: 7.1.96 sct | 7 | * kswapd added: 7.1.96 sct |
8 | * Removed kswapd_ctl limits, and swap out as many pages as needed | 8 | * Removed kswapd_ctl limits, and swap out as many pages as needed |
9 | * to bring the system back to freepages.high: 2.4.97, Rik van Riel. | 9 | * to bring the system back to freepages.high: 2.4.97, Rik van Riel. |
10 | * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). | 10 | * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). |
11 | * Multiqueue VM started 5.8.00, Rik van Riel. | 11 | * Multiqueue VM started 5.8.00, Rik van Riel. |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/kernel_stat.h> | 17 | #include <linux/kernel_stat.h> |
18 | #include <linux/swap.h> | 18 | #include <linux/swap.h> |
19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
20 | #include <linux/init.h> | 20 | #include <linux/init.h> |
21 | #include <linux/highmem.h> | 21 | #include <linux/highmem.h> |
22 | #include <linux/file.h> | 22 | #include <linux/file.h> |
23 | #include <linux/writeback.h> | 23 | #include <linux/writeback.h> |
24 | #include <linux/blkdev.h> | 24 | #include <linux/blkdev.h> |
25 | #include <linux/buffer_head.h> /* for try_to_release_page(), | 25 | #include <linux/buffer_head.h> /* for try_to_release_page(), |
26 | buffer_heads_over_limit */ | 26 | buffer_heads_over_limit */ |
27 | #include <linux/mm_inline.h> | 27 | #include <linux/mm_inline.h> |
28 | #include <linux/pagevec.h> | 28 | #include <linux/pagevec.h> |
29 | #include <linux/backing-dev.h> | 29 | #include <linux/backing-dev.h> |
30 | #include <linux/rmap.h> | 30 | #include <linux/rmap.h> |
31 | #include <linux/topology.h> | 31 | #include <linux/topology.h> |
32 | #include <linux/cpu.h> | 32 | #include <linux/cpu.h> |
33 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
34 | #include <linux/notifier.h> | 34 | #include <linux/notifier.h> |
35 | #include <linux/rwsem.h> | 35 | #include <linux/rwsem.h> |
36 | #include <linux/delay.h> | 36 | #include <linux/delay.h> |
37 | 37 | ||
38 | #include <asm/tlbflush.h> | 38 | #include <asm/tlbflush.h> |
39 | #include <asm/div64.h> | 39 | #include <asm/div64.h> |
40 | 40 | ||
41 | #include <linux/swapops.h> | 41 | #include <linux/swapops.h> |
42 | 42 | ||
43 | #include "internal.h" | 43 | #include "internal.h" |
44 | 44 | ||
45 | struct scan_control { | 45 | struct scan_control { |
46 | /* Incremented by the number of inactive pages that were scanned */ | 46 | /* Incremented by the number of inactive pages that were scanned */ |
47 | unsigned long nr_scanned; | 47 | unsigned long nr_scanned; |
48 | 48 | ||
49 | unsigned long nr_mapped; /* From page_state */ | 49 | unsigned long nr_mapped; /* From page_state */ |
50 | 50 | ||
51 | /* This context's GFP mask */ | 51 | /* This context's GFP mask */ |
52 | gfp_t gfp_mask; | 52 | gfp_t gfp_mask; |
53 | 53 | ||
54 | int may_writepage; | 54 | int may_writepage; |
55 | 55 | ||
56 | /* Can pages be swapped as part of reclaim? */ | 56 | /* Can pages be swapped as part of reclaim? */ |
57 | int may_swap; | 57 | int may_swap; |
58 | 58 | ||
59 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for | 59 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for |
60 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. | 60 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. |
61 | * In this context, it doesn't matter that we scan the | 61 | * In this context, it doesn't matter that we scan the |
62 | * whole list at once. */ | 62 | * whole list at once. */ |
63 | int swap_cluster_max; | 63 | int swap_cluster_max; |
64 | 64 | ||
65 | int swappiness; | 65 | int swappiness; |
66 | }; | 66 | }; |
67 | 67 | ||
68 | /* | 68 | /* |
69 | * The list of shrinker callbacks used to apply pressure to | 69 | * The list of shrinker callbacks used to apply pressure to |
70 | * ageable caches. | 70 | * ageable caches. |
71 | */ | 71 | */ |
72 | struct shrinker { | 72 | struct shrinker { |
73 | shrinker_t shrinker; | 73 | shrinker_t shrinker; |
74 | struct list_head list; | 74 | struct list_head list; |
75 | int seeks; /* seeks to recreate an obj */ | 75 | int seeks; /* seeks to recreate an obj */ |
76 | long nr; /* objs pending delete */ | 76 | long nr; /* objs pending delete */ |
77 | }; | 77 | }; |
78 | 78 | ||
79 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 79 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
80 | 80 | ||
81 | #ifdef ARCH_HAS_PREFETCH | 81 | #ifdef ARCH_HAS_PREFETCH |
82 | #define prefetch_prev_lru_page(_page, _base, _field) \ | 82 | #define prefetch_prev_lru_page(_page, _base, _field) \ |
83 | do { \ | 83 | do { \ |
84 | if ((_page)->lru.prev != _base) { \ | 84 | if ((_page)->lru.prev != _base) { \ |
85 | struct page *prev; \ | 85 | struct page *prev; \ |
86 | \ | 86 | \ |
87 | prev = lru_to_page(&(_page->lru)); \ | 87 | prev = lru_to_page(&(_page->lru)); \ |
88 | prefetch(&prev->_field); \ | 88 | prefetch(&prev->_field); \ |
89 | } \ | 89 | } \ |
90 | } while (0) | 90 | } while (0) |
91 | #else | 91 | #else |
92 | #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) | 92 | #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) |
93 | #endif | 93 | #endif |
94 | 94 | ||
95 | #ifdef ARCH_HAS_PREFETCHW | 95 | #ifdef ARCH_HAS_PREFETCHW |
96 | #define prefetchw_prev_lru_page(_page, _base, _field) \ | 96 | #define prefetchw_prev_lru_page(_page, _base, _field) \ |
97 | do { \ | 97 | do { \ |
98 | if ((_page)->lru.prev != _base) { \ | 98 | if ((_page)->lru.prev != _base) { \ |
99 | struct page *prev; \ | 99 | struct page *prev; \ |
100 | \ | 100 | \ |
101 | prev = lru_to_page(&(_page->lru)); \ | 101 | prev = lru_to_page(&(_page->lru)); \ |
102 | prefetchw(&prev->_field); \ | 102 | prefetchw(&prev->_field); \ |
103 | } \ | 103 | } \ |
104 | } while (0) | 104 | } while (0) |
105 | #else | 105 | #else |
106 | #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) | 106 | #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) |
107 | #endif | 107 | #endif |
108 | 108 | ||
109 | /* | 109 | /* |
110 | * From 0 .. 100. Higher means more swappy. | 110 | * From 0 .. 100. Higher means more swappy. |
111 | */ | 111 | */ |
112 | int vm_swappiness = 60; | 112 | int vm_swappiness = 60; |
113 | static long total_memory; | 113 | static long total_memory; |
114 | 114 | ||
115 | static LIST_HEAD(shrinker_list); | 115 | static LIST_HEAD(shrinker_list); |
116 | static DECLARE_RWSEM(shrinker_rwsem); | 116 | static DECLARE_RWSEM(shrinker_rwsem); |
117 | 117 | ||
118 | /* | 118 | /* |
119 | * Add a shrinker callback to be called from the vm | 119 | * Add a shrinker callback to be called from the vm |
120 | */ | 120 | */ |
121 | struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) | 121 | struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) |
122 | { | 122 | { |
123 | struct shrinker *shrinker; | 123 | struct shrinker *shrinker; |
124 | 124 | ||
125 | shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); | 125 | shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); |
126 | if (shrinker) { | 126 | if (shrinker) { |
127 | shrinker->shrinker = theshrinker; | 127 | shrinker->shrinker = theshrinker; |
128 | shrinker->seeks = seeks; | 128 | shrinker->seeks = seeks; |
129 | shrinker->nr = 0; | 129 | shrinker->nr = 0; |
130 | down_write(&shrinker_rwsem); | 130 | down_write(&shrinker_rwsem); |
131 | list_add_tail(&shrinker->list, &shrinker_list); | 131 | list_add_tail(&shrinker->list, &shrinker_list); |
132 | up_write(&shrinker_rwsem); | 132 | up_write(&shrinker_rwsem); |
133 | } | 133 | } |
134 | return shrinker; | 134 | return shrinker; |
135 | } | 135 | } |
136 | EXPORT_SYMBOL(set_shrinker); | 136 | EXPORT_SYMBOL(set_shrinker); |
137 | 137 | ||
138 | /* | 138 | /* |
139 | * Remove one | 139 | * Remove one |
140 | */ | 140 | */ |
141 | void remove_shrinker(struct shrinker *shrinker) | 141 | void remove_shrinker(struct shrinker *shrinker) |
142 | { | 142 | { |
143 | down_write(&shrinker_rwsem); | 143 | down_write(&shrinker_rwsem); |
144 | list_del(&shrinker->list); | 144 | list_del(&shrinker->list); |
145 | up_write(&shrinker_rwsem); | 145 | up_write(&shrinker_rwsem); |
146 | kfree(shrinker); | 146 | kfree(shrinker); |
147 | } | 147 | } |
148 | EXPORT_SYMBOL(remove_shrinker); | 148 | EXPORT_SYMBOL(remove_shrinker); |
149 | 149 | ||
150 | #define SHRINK_BATCH 128 | 150 | #define SHRINK_BATCH 128 |
151 | /* | 151 | /* |
152 | * Call the shrink functions to age shrinkable caches | 152 | * Call the shrink functions to age shrinkable caches |
153 | * | 153 | * |
154 | * Here we assume it costs one seek to replace a lru page and that it also | 154 | * Here we assume it costs one seek to replace a lru page and that it also |
155 | * takes a seek to recreate a cache object. With this in mind we age equal | 155 | * takes a seek to recreate a cache object. With this in mind we age equal |
156 | * percentages of the lru and ageable caches. This should balance the seeks | 156 | * percentages of the lru and ageable caches. This should balance the seeks |
157 | * generated by these structures. | 157 | * generated by these structures. |
158 | * | 158 | * |
159 | * If the vm encountered mapped pages on the LRU it increases the pressure on | 159 | * If the vm encountered mapped pages on the LRU it increases the pressure on |
160 | * slab to avoid swapping. | 160 | * slab to avoid swapping. |
161 | * | 161 | * |
162 | * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. | 162 | * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. |
163 | * | 163 | * |
164 | * `lru_pages' represents the number of on-LRU pages in all the zones which | 164 | * `lru_pages' represents the number of on-LRU pages in all the zones which |
165 | * are eligible for the caller's allocation attempt. It is used for balancing | 165 | * are eligible for the caller's allocation attempt. It is used for balancing |
166 | * slab reclaim versus page reclaim. | 166 | * slab reclaim versus page reclaim. |
167 | * | 167 | * |
168 | * Returns the number of slab objects which we shrunk. | 168 | * Returns the number of slab objects which we shrunk. |
169 | */ | 169 | */ |
170 | unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | 170 | unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, |
171 | unsigned long lru_pages) | 171 | unsigned long lru_pages) |
172 | { | 172 | { |
173 | struct shrinker *shrinker; | 173 | struct shrinker *shrinker; |
174 | unsigned long ret = 0; | 174 | unsigned long ret = 0; |
175 | 175 | ||
176 | if (scanned == 0) | 176 | if (scanned == 0) |
177 | scanned = SWAP_CLUSTER_MAX; | 177 | scanned = SWAP_CLUSTER_MAX; |
178 | 178 | ||
179 | if (!down_read_trylock(&shrinker_rwsem)) | 179 | if (!down_read_trylock(&shrinker_rwsem)) |
180 | return 1; /* Assume we'll be able to shrink next time */ | 180 | return 1; /* Assume we'll be able to shrink next time */ |
181 | 181 | ||
182 | list_for_each_entry(shrinker, &shrinker_list, list) { | 182 | list_for_each_entry(shrinker, &shrinker_list, list) { |
183 | unsigned long long delta; | 183 | unsigned long long delta; |
184 | unsigned long total_scan; | 184 | unsigned long total_scan; |
185 | unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); | 185 | unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); |
186 | 186 | ||
187 | delta = (4 * scanned) / shrinker->seeks; | 187 | delta = (4 * scanned) / shrinker->seeks; |
188 | delta *= max_pass; | 188 | delta *= max_pass; |
189 | do_div(delta, lru_pages + 1); | 189 | do_div(delta, lru_pages + 1); |
190 | shrinker->nr += delta; | 190 | shrinker->nr += delta; |
191 | if (shrinker->nr < 0) { | 191 | if (shrinker->nr < 0) { |
192 | printk(KERN_ERR "%s: nr=%ld\n", | 192 | printk(KERN_ERR "%s: nr=%ld\n", |
193 | __FUNCTION__, shrinker->nr); | 193 | __FUNCTION__, shrinker->nr); |
194 | shrinker->nr = max_pass; | 194 | shrinker->nr = max_pass; |
195 | } | 195 | } |
196 | 196 | ||
197 | /* | 197 | /* |
198 | * Avoid risking looping forever due to too large nr value: | 198 | * Avoid risking looping forever due to too large nr value: |
199 | * never try to free more than twice the estimated number of | 199 | * never try to free more than twice the estimated number of |
200 | * freeable entries. | 200 | * freeable entries. |
201 | */ | 201 | */ |
202 | if (shrinker->nr > max_pass * 2) | 202 | if (shrinker->nr > max_pass * 2) |
203 | shrinker->nr = max_pass * 2; | 203 | shrinker->nr = max_pass * 2; |
204 | 204 | ||
205 | total_scan = shrinker->nr; | 205 | total_scan = shrinker->nr; |
206 | shrinker->nr = 0; | 206 | shrinker->nr = 0; |
207 | 207 | ||
208 | while (total_scan >= SHRINK_BATCH) { | 208 | while (total_scan >= SHRINK_BATCH) { |
209 | long this_scan = SHRINK_BATCH; | 209 | long this_scan = SHRINK_BATCH; |
210 | int shrink_ret; | 210 | int shrink_ret; |
211 | int nr_before; | 211 | int nr_before; |
212 | 212 | ||
213 | nr_before = (*shrinker->shrinker)(0, gfp_mask); | 213 | nr_before = (*shrinker->shrinker)(0, gfp_mask); |
214 | shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); | 214 | shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); |
215 | if (shrink_ret == -1) | 215 | if (shrink_ret == -1) |
216 | break; | 216 | break; |
217 | if (shrink_ret < nr_before) | 217 | if (shrink_ret < nr_before) |
218 | ret += nr_before - shrink_ret; | 218 | ret += nr_before - shrink_ret; |
219 | mod_page_state(slabs_scanned, this_scan); | 219 | mod_page_state(slabs_scanned, this_scan); |
220 | total_scan -= this_scan; | 220 | total_scan -= this_scan; |
221 | 221 | ||
222 | cond_resched(); | 222 | cond_resched(); |
223 | } | 223 | } |
224 | 224 | ||
225 | shrinker->nr += total_scan; | 225 | shrinker->nr += total_scan; |
226 | } | 226 | } |
227 | up_read(&shrinker_rwsem); | 227 | up_read(&shrinker_rwsem); |
228 | return ret; | 228 | return ret; |
229 | } | 229 | } |
230 | 230 | ||
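The balancing math in shrink_slab() is easier to follow with concrete numbers: the fraction of LRU pages scanned this round, scaled by the shrinker's seek cost, decides what fraction of the cache's objects to scan. A userspace sketch with invented values (do_div() replaced by a plain 64-bit division):

/* Illustrative sketch of the shrink_slab() pressure calculation above. */
#include <stdio.h>

int main(void)
{
        unsigned long scanned   = 1024;     /* LRU pages scanned this pass */
        unsigned long lru_pages = 262144;   /* reclaimable LRU pages (1 GiB of 4 KiB pages) */
        unsigned long max_pass  = 50000;    /* objects reported by the shrinker */
        int seeks               = 2;        /* seeks to recreate one object */

        unsigned long long delta = (4ULL * scanned) / seeks;   /* 2048 */
        delta *= max_pass;                                      /* 102,400,000 */
        delta /= lru_pages + 1;                                 /* ~390 objects to scan */

        printf("ask the shrinker to scan ~%llu objects\n", delta);
        return 0;
}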
231 | /* Called without lock on whether page is mapped, so answer is unstable */ | 231 | /* Called without lock on whether page is mapped, so answer is unstable */ |
232 | static inline int page_mapping_inuse(struct page *page) | 232 | static inline int page_mapping_inuse(struct page *page) |
233 | { | 233 | { |
234 | struct address_space *mapping; | 234 | struct address_space *mapping; |
235 | 235 | ||
236 | /* Page is in somebody's page tables. */ | 236 | /* Page is in somebody's page tables. */ |
237 | if (page_mapped(page)) | 237 | if (page_mapped(page)) |
238 | return 1; | 238 | return 1; |
239 | 239 | ||
240 | /* Be more reluctant to reclaim swapcache than pagecache */ | 240 | /* Be more reluctant to reclaim swapcache than pagecache */ |
241 | if (PageSwapCache(page)) | 241 | if (PageSwapCache(page)) |
242 | return 1; | 242 | return 1; |
243 | 243 | ||
244 | mapping = page_mapping(page); | 244 | mapping = page_mapping(page); |
245 | if (!mapping) | 245 | if (!mapping) |
246 | return 0; | 246 | return 0; |
247 | 247 | ||
248 | /* File is mmap'd by somebody? */ | 248 | /* File is mmap'd by somebody? */ |
249 | return mapping_mapped(mapping); | 249 | return mapping_mapped(mapping); |
250 | } | 250 | } |
251 | 251 | ||
252 | static inline int is_page_cache_freeable(struct page *page) | 252 | static inline int is_page_cache_freeable(struct page *page) |
253 | { | 253 | { |
254 | return page_count(page) - !!PagePrivate(page) == 2; | 254 | return page_count(page) - !!PagePrivate(page) == 2; |
255 | } | 255 | } |
256 | 256 | ||
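The test above is a reference-count budget: one reference is held by the pagecache, one by the reclaim path that isolated the page, and PagePrivate accounts for one more held by buffer heads; any extra reference means someone else is still using the page. A hedged sketch of that accounting with invented counts:

/* Illustrative sketch (not kernel code) of the budget behind
 * is_page_cache_freeable(): pagecache ref + our ref (+ buffer heads). */
#include <stdio.h>

int main(void)
{
        int page_count  = 3;    /* hypothetical current reference count */
        int has_private = 1;    /* buffer heads hold one extra reference */

        int freeable = (page_count - (has_private ? 1 : 0)) == 2;

        printf("freeable: %s\n",
               freeable ? "yes" : "no, another user holds a reference");
        return 0;
}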
257 | static int may_write_to_queue(struct backing_dev_info *bdi) | 257 | static int may_write_to_queue(struct backing_dev_info *bdi) |
258 | { | 258 | { |
259 | if (current->flags & PF_SWAPWRITE) | 259 | if (current->flags & PF_SWAPWRITE) |
260 | return 1; | 260 | return 1; |
261 | if (!bdi_write_congested(bdi)) | 261 | if (!bdi_write_congested(bdi)) |
262 | return 1; | 262 | return 1; |
263 | if (bdi == current->backing_dev_info) | 263 | if (bdi == current->backing_dev_info) |
264 | return 1; | 264 | return 1; |
265 | return 0; | 265 | return 0; |
266 | } | 266 | } |
267 | 267 | ||
268 | /* | 268 | /* |
269 | * We detected a synchronous write error writing a page out. Probably | 269 | * We detected a synchronous write error writing a page out. Probably |
270 | * -ENOSPC. We need to propagate that into the address_space for a subsequent | 270 | * -ENOSPC. We need to propagate that into the address_space for a subsequent |
271 | * fsync(), msync() or close(). | 271 | * fsync(), msync() or close(). |
272 | * | 272 | * |
273 | * The tricky part is that after writepage we cannot touch the mapping: nothing | 273 | * The tricky part is that after writepage we cannot touch the mapping: nothing |
274 | * prevents it from being freed up. But we have a ref on the page and once | 274 | * prevents it from being freed up. But we have a ref on the page and once |
275 | * that page is locked, the mapping is pinned. | 275 | * that page is locked, the mapping is pinned. |
276 | * | 276 | * |
277 | * We're allowed to run sleeping lock_page() here because we know the caller has | 277 | * We're allowed to run sleeping lock_page() here because we know the caller has |
278 | * __GFP_FS. | 278 | * __GFP_FS. |
279 | */ | 279 | */ |
280 | static void handle_write_error(struct address_space *mapping, | 280 | static void handle_write_error(struct address_space *mapping, |
281 | struct page *page, int error) | 281 | struct page *page, int error) |
282 | { | 282 | { |
283 | lock_page(page); | 283 | lock_page(page); |
284 | if (page_mapping(page) == mapping) { | 284 | if (page_mapping(page) == mapping) { |
285 | if (error == -ENOSPC) | 285 | if (error == -ENOSPC) |
286 | set_bit(AS_ENOSPC, &mapping->flags); | 286 | set_bit(AS_ENOSPC, &mapping->flags); |
287 | else | 287 | else |
288 | set_bit(AS_EIO, &mapping->flags); | 288 | set_bit(AS_EIO, &mapping->flags); |
289 | } | 289 | } |
290 | unlock_page(page); | 290 | unlock_page(page); |
291 | } | 291 | } |
292 | 292 | ||
293 | /* possible outcome of pageout() */ | ||
294 | typedef enum { | ||
295 | /* failed to write page out, page is locked */ | ||
296 | PAGE_KEEP, | ||
297 | /* move page to the active list, page is locked */ | ||
298 | PAGE_ACTIVATE, | ||
299 | /* page has been sent to the disk successfully, page is unlocked */ | ||
300 | PAGE_SUCCESS, | ||
301 | /* page is clean and locked */ | ||
302 | PAGE_CLEAN, | ||
303 | } pageout_t; | ||
304 | |||
293 | /* | 305 | /* |
294 | * pageout is called by shrink_page_list() for each dirty page. | 306 | * pageout is called by shrink_page_list() for each dirty page. |
295 | * Calls ->writepage(). | 307 | * Calls ->writepage(). |
296 | */ | 308 | */ |
297 | pageout_t pageout(struct page *page, struct address_space *mapping) | 309 | static pageout_t pageout(struct page *page, struct address_space *mapping) |
298 | { | 310 | { |
299 | /* | 311 | /* |
300 | * If the page is dirty, only perform writeback if that write | 312 | * If the page is dirty, only perform writeback if that write |
301 | * will be non-blocking. To prevent this allocation from being | 313 | * will be non-blocking. To prevent this allocation from being |
302 | * stalled by pagecache activity. But note that there may be | 314 | * stalled by pagecache activity. But note that there may be |
303 | * stalls if we need to run get_block(). We could test | 315 | * stalls if we need to run get_block(). We could test |
304 | * PagePrivate for that. | 316 | * PagePrivate for that. |
305 | * | 317 | * |
306 | * If this process is currently in generic_file_write() against | 318 | * If this process is currently in generic_file_write() against |
307 | * this page's queue, we can perform writeback even if that | 319 | * this page's queue, we can perform writeback even if that |
308 | * will block. | 320 | * will block. |
309 | * | 321 | * |
310 | * If the page is swapcache, write it back even if that would | 322 | * If the page is swapcache, write it back even if that would |
311 | * block, for some throttling. This happens by accident, because | 323 | * block, for some throttling. This happens by accident, because |
312 | * swap_backing_dev_info is bust: it doesn't reflect the | 324 | * swap_backing_dev_info is bust: it doesn't reflect the |
313 | * congestion state of the swapdevs. Easy to fix, if needed. | 325 | * congestion state of the swapdevs. Easy to fix, if needed. |
314 | * See swapfile.c:page_queue_congested(). | 326 | * See swapfile.c:page_queue_congested(). |
315 | */ | 327 | */ |
316 | if (!is_page_cache_freeable(page)) | 328 | if (!is_page_cache_freeable(page)) |
317 | return PAGE_KEEP; | 329 | return PAGE_KEEP; |
318 | if (!mapping) { | 330 | if (!mapping) { |
319 | /* | 331 | /* |
320 | * Some data journaling orphaned pages can have | 332 | * Some data journaling orphaned pages can have |
321 | * page->mapping == NULL while being dirty with clean buffers. | 333 | * page->mapping == NULL while being dirty with clean buffers. |
322 | */ | 334 | */ |
323 | if (PagePrivate(page)) { | 335 | if (PagePrivate(page)) { |
324 | if (try_to_free_buffers(page)) { | 336 | if (try_to_free_buffers(page)) { |
325 | ClearPageDirty(page); | 337 | ClearPageDirty(page); |
326 | printk("%s: orphaned page\n", __FUNCTION__); | 338 | printk("%s: orphaned page\n", __FUNCTION__); |
327 | return PAGE_CLEAN; | 339 | return PAGE_CLEAN; |
328 | } | 340 | } |
329 | } | 341 | } |
330 | return PAGE_KEEP; | 342 | return PAGE_KEEP; |
331 | } | 343 | } |
332 | if (mapping->a_ops->writepage == NULL) | 344 | if (mapping->a_ops->writepage == NULL) |
333 | return PAGE_ACTIVATE; | 345 | return PAGE_ACTIVATE; |
334 | if (!may_write_to_queue(mapping->backing_dev_info)) | 346 | if (!may_write_to_queue(mapping->backing_dev_info)) |
335 | return PAGE_KEEP; | 347 | return PAGE_KEEP; |
336 | 348 | ||
337 | if (clear_page_dirty_for_io(page)) { | 349 | if (clear_page_dirty_for_io(page)) { |
338 | int res; | 350 | int res; |
339 | struct writeback_control wbc = { | 351 | struct writeback_control wbc = { |
340 | .sync_mode = WB_SYNC_NONE, | 352 | .sync_mode = WB_SYNC_NONE, |
341 | .nr_to_write = SWAP_CLUSTER_MAX, | 353 | .nr_to_write = SWAP_CLUSTER_MAX, |
342 | .range_start = 0, | 354 | .range_start = 0, |
343 | .range_end = LLONG_MAX, | 355 | .range_end = LLONG_MAX, |
344 | .nonblocking = 1, | 356 | .nonblocking = 1, |
345 | .for_reclaim = 1, | 357 | .for_reclaim = 1, |
346 | }; | 358 | }; |
347 | 359 | ||
348 | SetPageReclaim(page); | 360 | SetPageReclaim(page); |
349 | res = mapping->a_ops->writepage(page, &wbc); | 361 | res = mapping->a_ops->writepage(page, &wbc); |
350 | if (res < 0) | 362 | if (res < 0) |
351 | handle_write_error(mapping, page, res); | 363 | handle_write_error(mapping, page, res); |
352 | if (res == AOP_WRITEPAGE_ACTIVATE) { | 364 | if (res == AOP_WRITEPAGE_ACTIVATE) { |
353 | ClearPageReclaim(page); | 365 | ClearPageReclaim(page); |
354 | return PAGE_ACTIVATE; | 366 | return PAGE_ACTIVATE; |
355 | } | 367 | } |
356 | if (!PageWriteback(page)) { | 368 | if (!PageWriteback(page)) { |
357 | /* synchronous write or broken a_ops? */ | 369 | /* synchronous write or broken a_ops? */ |
358 | ClearPageReclaim(page); | 370 | ClearPageReclaim(page); |
359 | } | 371 | } |
360 | 372 | ||
361 | return PAGE_SUCCESS; | 373 | return PAGE_SUCCESS; |
362 | } | 374 | } |
363 | 375 | ||
364 | return PAGE_CLEAN; | 376 | return PAGE_CLEAN; |
365 | } | 377 | } |
366 | 378 | ||
367 | int remove_mapping(struct address_space *mapping, struct page *page) | 379 | int remove_mapping(struct address_space *mapping, struct page *page) |
368 | { | 380 | { |
369 | if (!mapping) | 381 | if (!mapping) |
370 | return 0; /* truncate got there first */ | 382 | return 0; /* truncate got there first */ |
371 | 383 | ||
372 | write_lock_irq(&mapping->tree_lock); | 384 | write_lock_irq(&mapping->tree_lock); |
373 | 385 | ||
374 | /* | 386 | /* |
375 | * The non-racy check for busy page. It is critical to check | 387 | * The non-racy check for busy page. It is critical to check |
376 | * PageDirty _after_ making sure that the page is freeable and | 388 | * PageDirty _after_ making sure that the page is freeable and |
377 | * not in use by anybody. (pagecache + us == 2) | 389 | * not in use by anybody. (pagecache + us == 2) |
378 | */ | 390 | */ |
379 | if (unlikely(page_count(page) != 2)) | 391 | if (unlikely(page_count(page) != 2)) |
380 | goto cannot_free; | 392 | goto cannot_free; |
381 | smp_rmb(); | 393 | smp_rmb(); |
382 | if (unlikely(PageDirty(page))) | 394 | if (unlikely(PageDirty(page))) |
383 | goto cannot_free; | 395 | goto cannot_free; |
384 | 396 | ||
385 | if (PageSwapCache(page)) { | 397 | if (PageSwapCache(page)) { |
386 | swp_entry_t swap = { .val = page_private(page) }; | 398 | swp_entry_t swap = { .val = page_private(page) }; |
387 | __delete_from_swap_cache(page); | 399 | __delete_from_swap_cache(page); |
388 | write_unlock_irq(&mapping->tree_lock); | 400 | write_unlock_irq(&mapping->tree_lock); |
389 | swap_free(swap); | 401 | swap_free(swap); |
390 | __put_page(page); /* The pagecache ref */ | 402 | __put_page(page); /* The pagecache ref */ |
391 | return 1; | 403 | return 1; |
392 | } | 404 | } |
393 | 405 | ||
394 | __remove_from_page_cache(page); | 406 | __remove_from_page_cache(page); |
395 | write_unlock_irq(&mapping->tree_lock); | 407 | write_unlock_irq(&mapping->tree_lock); |
396 | __put_page(page); | 408 | __put_page(page); |
397 | return 1; | 409 | return 1; |
398 | 410 | ||
399 | cannot_free: | 411 | cannot_free: |
400 | write_unlock_irq(&mapping->tree_lock); | 412 | write_unlock_irq(&mapping->tree_lock); |
401 | return 0; | 413 | return 0; |
402 | } | 414 | } |
403 | 415 | ||
404 | /* | 416 | /* |
405 | * shrink_page_list() returns the number of reclaimed pages | 417 | * shrink_page_list() returns the number of reclaimed pages |
406 | */ | 418 | */ |
407 | static unsigned long shrink_page_list(struct list_head *page_list, | 419 | static unsigned long shrink_page_list(struct list_head *page_list, |
408 | struct scan_control *sc) | 420 | struct scan_control *sc) |
409 | { | 421 | { |
410 | LIST_HEAD(ret_pages); | 422 | LIST_HEAD(ret_pages); |
411 | struct pagevec freed_pvec; | 423 | struct pagevec freed_pvec; |
412 | int pgactivate = 0; | 424 | int pgactivate = 0; |
413 | unsigned long nr_reclaimed = 0; | 425 | unsigned long nr_reclaimed = 0; |
414 | 426 | ||
415 | cond_resched(); | 427 | cond_resched(); |
416 | 428 | ||
417 | pagevec_init(&freed_pvec, 1); | 429 | pagevec_init(&freed_pvec, 1); |
418 | while (!list_empty(page_list)) { | 430 | while (!list_empty(page_list)) { |
419 | struct address_space *mapping; | 431 | struct address_space *mapping; |
420 | struct page *page; | 432 | struct page *page; |
421 | int may_enter_fs; | 433 | int may_enter_fs; |
422 | int referenced; | 434 | int referenced; |
423 | 435 | ||
424 | cond_resched(); | 436 | cond_resched(); |
425 | 437 | ||
426 | page = lru_to_page(page_list); | 438 | page = lru_to_page(page_list); |
427 | list_del(&page->lru); | 439 | list_del(&page->lru); |
428 | 440 | ||
429 | if (TestSetPageLocked(page)) | 441 | if (TestSetPageLocked(page)) |
430 | goto keep; | 442 | goto keep; |
431 | 443 | ||
432 | BUG_ON(PageActive(page)); | 444 | BUG_ON(PageActive(page)); |
433 | 445 | ||
434 | sc->nr_scanned++; | 446 | sc->nr_scanned++; |
435 | 447 | ||
436 | if (!sc->may_swap && page_mapped(page)) | 448 | if (!sc->may_swap && page_mapped(page)) |
437 | goto keep_locked; | 449 | goto keep_locked; |
438 | 450 | ||
439 | /* Double the slab pressure for mapped and swapcache pages */ | 451 | /* Double the slab pressure for mapped and swapcache pages */ |
440 | if (page_mapped(page) || PageSwapCache(page)) | 452 | if (page_mapped(page) || PageSwapCache(page)) |
441 | sc->nr_scanned++; | 453 | sc->nr_scanned++; |
442 | 454 | ||
443 | if (PageWriteback(page)) | 455 | if (PageWriteback(page)) |
444 | goto keep_locked; | 456 | goto keep_locked; |
445 | 457 | ||
446 | referenced = page_referenced(page, 1); | 458 | referenced = page_referenced(page, 1); |
447 | /* In active use or really unfreeable? Activate it. */ | 459 | /* In active use or really unfreeable? Activate it. */ |
448 | if (referenced && page_mapping_inuse(page)) | 460 | if (referenced && page_mapping_inuse(page)) |
449 | goto activate_locked; | 461 | goto activate_locked; |
450 | 462 | ||
451 | #ifdef CONFIG_SWAP | 463 | #ifdef CONFIG_SWAP |
452 | /* | 464 | /* |
453 | * Anonymous process memory has backing store? | 465 | * Anonymous process memory has backing store? |
454 | * Try to allocate it some swap space here. | 466 | * Try to allocate it some swap space here. |
455 | */ | 467 | */ |
456 | if (PageAnon(page) && !PageSwapCache(page)) | 468 | if (PageAnon(page) && !PageSwapCache(page)) |
457 | if (!add_to_swap(page, GFP_ATOMIC)) | 469 | if (!add_to_swap(page, GFP_ATOMIC)) |
458 | goto activate_locked; | 470 | goto activate_locked; |
459 | #endif /* CONFIG_SWAP */ | 471 | #endif /* CONFIG_SWAP */ |
460 | 472 | ||
461 | mapping = page_mapping(page); | 473 | mapping = page_mapping(page); |
462 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || | 474 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || |
463 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 475 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
464 | 476 | ||
465 | /* | 477 | /* |
466 | * The page is mapped into the page tables of one or more | 478 | * The page is mapped into the page tables of one or more |
467 | * processes. Try to unmap it here. | 479 | * processes. Try to unmap it here. |
468 | */ | 480 | */ |
469 | if (page_mapped(page) && mapping) { | 481 | if (page_mapped(page) && mapping) { |
470 | switch (try_to_unmap(page, 0)) { | 482 | switch (try_to_unmap(page, 0)) { |
471 | case SWAP_FAIL: | 483 | case SWAP_FAIL: |
472 | goto activate_locked; | 484 | goto activate_locked; |
473 | case SWAP_AGAIN: | 485 | case SWAP_AGAIN: |
474 | goto keep_locked; | 486 | goto keep_locked; |
475 | case SWAP_SUCCESS: | 487 | case SWAP_SUCCESS: |
476 | ; /* try to free the page below */ | 488 | ; /* try to free the page below */ |
477 | } | 489 | } |
478 | } | 490 | } |
479 | 491 | ||
480 | if (PageDirty(page)) { | 492 | if (PageDirty(page)) { |
481 | if (referenced) | 493 | if (referenced) |
482 | goto keep_locked; | 494 | goto keep_locked; |
483 | if (!may_enter_fs) | 495 | if (!may_enter_fs) |
484 | goto keep_locked; | 496 | goto keep_locked; |
485 | if (!sc->may_writepage) | 497 | if (!sc->may_writepage) |
486 | goto keep_locked; | 498 | goto keep_locked; |
487 | 499 | ||
488 | /* Page is dirty, try to write it out here */ | 500 | /* Page is dirty, try to write it out here */ |
489 | switch(pageout(page, mapping)) { | 501 | switch(pageout(page, mapping)) { |
490 | case PAGE_KEEP: | 502 | case PAGE_KEEP: |
491 | goto keep_locked; | 503 | goto keep_locked; |
492 | case PAGE_ACTIVATE: | 504 | case PAGE_ACTIVATE: |
493 | goto activate_locked; | 505 | goto activate_locked; |
494 | case PAGE_SUCCESS: | 506 | case PAGE_SUCCESS: |
495 | if (PageWriteback(page) || PageDirty(page)) | 507 | if (PageWriteback(page) || PageDirty(page)) |
496 | goto keep; | 508 | goto keep; |
497 | /* | 509 | /* |
498 | * A synchronous write - probably a ramdisk. Go | 510 | * A synchronous write - probably a ramdisk. Go |
499 | * ahead and try to reclaim the page. | 511 | * ahead and try to reclaim the page. |
500 | */ | 512 | */ |
501 | if (TestSetPageLocked(page)) | 513 | if (TestSetPageLocked(page)) |
502 | goto keep; | 514 | goto keep; |
503 | if (PageDirty(page) || PageWriteback(page)) | 515 | if (PageDirty(page) || PageWriteback(page)) |
504 | goto keep_locked; | 516 | goto keep_locked; |
505 | mapping = page_mapping(page); | 517 | mapping = page_mapping(page); |
506 | case PAGE_CLEAN: | 518 | case PAGE_CLEAN: |
507 | ; /* try to free the page below */ | 519 | ; /* try to free the page below */ |
508 | } | 520 | } |
509 | } | 521 | } |
510 | 522 | ||
511 | /* | 523 | /* |
512 | * If the page has buffers, try to free the buffer mappings | 524 | * If the page has buffers, try to free the buffer mappings |
513 | * associated with this page. If we succeed we try to free | 525 | * associated with this page. If we succeed we try to free |
514 | * the page as well. | 526 | * the page as well. |
515 | * | 527 | * |
516 | * We do this even if the page is PageDirty(). | 528 | * We do this even if the page is PageDirty(). |
517 | * try_to_release_page() does not perform I/O, but it is | 529 | * try_to_release_page() does not perform I/O, but it is |
518 | * possible for a page to have PageDirty set, but it is actually | 530 | * possible for a page to have PageDirty set, but it is actually |
519 | * clean (all its buffers are clean). This happens if the | 531 | * clean (all its buffers are clean). This happens if the |
520 | * buffers were written out directly, with submit_bh(). ext3 | 532 | * buffers were written out directly, with submit_bh(). ext3 |
521 | * will do this, as well as the blockdev mapping. | 533 | * will do this, as well as the blockdev mapping. |
522 | * try_to_release_page() will discover that cleanness and will | 534 | * try_to_release_page() will discover that cleanness and will |
523 | * drop the buffers and mark the page clean - it can be freed. | 535 | * drop the buffers and mark the page clean - it can be freed. |
524 | * | 536 | * |
525 | * Rarely, pages can have buffers and no ->mapping. These are | 537 | * Rarely, pages can have buffers and no ->mapping. These are |
526 | * the pages which were not successfully invalidated in | 538 | * the pages which were not successfully invalidated in |
527 | * truncate_complete_page(). We try to drop those buffers here | 539 | * truncate_complete_page(). We try to drop those buffers here |
528 | * and if that worked, and the page is no longer mapped into | 540 | * and if that worked, and the page is no longer mapped into |
529 | * process address space (page_count == 1) it can be freed. | 541 | * process address space (page_count == 1) it can be freed. |
530 | * Otherwise, leave the page on the LRU so it is swappable. | 542 | * Otherwise, leave the page on the LRU so it is swappable. |
531 | */ | 543 | */ |
532 | if (PagePrivate(page)) { | 544 | if (PagePrivate(page)) { |
533 | if (!try_to_release_page(page, sc->gfp_mask)) | 545 | if (!try_to_release_page(page, sc->gfp_mask)) |
534 | goto activate_locked; | 546 | goto activate_locked; |
535 | if (!mapping && page_count(page) == 1) | 547 | if (!mapping && page_count(page) == 1) |
536 | goto free_it; | 548 | goto free_it; |
537 | } | 549 | } |
538 | 550 | ||
539 | if (!remove_mapping(mapping, page)) | 551 | if (!remove_mapping(mapping, page)) |
540 | goto keep_locked; | 552 | goto keep_locked; |
541 | 553 | ||
542 | free_it: | 554 | free_it: |
543 | unlock_page(page); | 555 | unlock_page(page); |
544 | nr_reclaimed++; | 556 | nr_reclaimed++; |
545 | if (!pagevec_add(&freed_pvec, page)) | 557 | if (!pagevec_add(&freed_pvec, page)) |
546 | __pagevec_release_nonlru(&freed_pvec); | 558 | __pagevec_release_nonlru(&freed_pvec); |
547 | continue; | 559 | continue; |
548 | 560 | ||
549 | activate_locked: | 561 | activate_locked: |
550 | SetPageActive(page); | 562 | SetPageActive(page); |
551 | pgactivate++; | 563 | pgactivate++; |
552 | keep_locked: | 564 | keep_locked: |
553 | unlock_page(page); | 565 | unlock_page(page); |
554 | keep: | 566 | keep: |
555 | list_add(&page->lru, &ret_pages); | 567 | list_add(&page->lru, &ret_pages); |
556 | BUG_ON(PageLRU(page)); | 568 | BUG_ON(PageLRU(page)); |
557 | } | 569 | } |
558 | list_splice(&ret_pages, page_list); | 570 | list_splice(&ret_pages, page_list); |
559 | if (pagevec_count(&freed_pvec)) | 571 | if (pagevec_count(&freed_pvec)) |
560 | __pagevec_release_nonlru(&freed_pvec); | 572 | __pagevec_release_nonlru(&freed_pvec); |
561 | mod_page_state(pgactivate, pgactivate); | 573 | mod_page_state(pgactivate, pgactivate); |
562 | return nr_reclaimed; | 574 | return nr_reclaimed; |
563 | } | 575 | } |
564 | 576 | ||
565 | /* | 577 | /* |
566 | * zone->lru_lock is heavily contended. Some of the functions that | 578 | * zone->lru_lock is heavily contended. Some of the functions that |
567 | * shrink the lists perform better by taking out a batch of pages | 579 | * shrink the lists perform better by taking out a batch of pages |
568 | * and working on them outside the LRU lock. | 580 | * and working on them outside the LRU lock. |
569 | * | 581 | * |
570 | * For pagecache intensive workloads, this function is the hottest | 582 | * For pagecache intensive workloads, this function is the hottest |
571 | * spot in the kernel (apart from copy_*_user functions). | 583 | * spot in the kernel (apart from copy_*_user functions). |
572 | * | 584 | * |
573 | * Appropriate locks must be held before calling this function. | 585 | * Appropriate locks must be held before calling this function. |
574 | * | 586 | * |
575 | * @nr_to_scan: The number of pages to look through on the list. | 587 | * @nr_to_scan: The number of pages to look through on the list. |
576 | * @src: The LRU list to pull pages off. | 588 | * @src: The LRU list to pull pages off. |
577 | * @dst: The temp list to put pages on to. | 589 | * @dst: The temp list to put pages on to. |
578 | * @scanned: The number of pages that were scanned. | 590 | * @scanned: The number of pages that were scanned. |
579 | * | 591 | * |
580 | * returns how many pages were moved onto *@dst. | 592 | * returns how many pages were moved onto *@dst. |
581 | */ | 593 | */ |
582 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 594 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
583 | struct list_head *src, struct list_head *dst, | 595 | struct list_head *src, struct list_head *dst, |
584 | unsigned long *scanned) | 596 | unsigned long *scanned) |
585 | { | 597 | { |
586 | unsigned long nr_taken = 0; | 598 | unsigned long nr_taken = 0; |
587 | struct page *page; | 599 | struct page *page; |
588 | unsigned long scan; | 600 | unsigned long scan; |
589 | 601 | ||
590 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { | 602 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { |
591 | struct list_head *target; | 603 | struct list_head *target; |
592 | page = lru_to_page(src); | 604 | page = lru_to_page(src); |
593 | prefetchw_prev_lru_page(page, src, flags); | 605 | prefetchw_prev_lru_page(page, src, flags); |
594 | 606 | ||
595 | BUG_ON(!PageLRU(page)); | 607 | BUG_ON(!PageLRU(page)); |
596 | 608 | ||
597 | list_del(&page->lru); | 609 | list_del(&page->lru); |
598 | target = src; | 610 | target = src; |
599 | if (likely(get_page_unless_zero(page))) { | 611 | if (likely(get_page_unless_zero(page))) { |
600 | /* | 612 | /* |
601 | * Be careful not to clear PageLRU until after we're | 613 | * Be careful not to clear PageLRU until after we're |
602 | * sure the page is not being freed elsewhere -- the | 614 | * sure the page is not being freed elsewhere -- the |
603 | * page release code relies on it. | 615 | * page release code relies on it. |
604 | */ | 616 | */ |
605 | ClearPageLRU(page); | 617 | ClearPageLRU(page); |
606 | target = dst; | 618 | target = dst; |
607 | nr_taken++; | 619 | nr_taken++; |
608 | } /* else it is being freed elsewhere */ | 620 | } /* else it is being freed elsewhere */ |
609 | 621 | ||
610 | list_add(&page->lru, target); | 622 | list_add(&page->lru, target); |
611 | } | 623 | } |
612 | 624 | ||
613 | *scanned = scan; | 625 | *scanned = scan; |
614 | return nr_taken; | 626 | return nr_taken; |
615 | } | 627 | } |
616 | 628 | ||
617 | /* | 629 | /* |
618 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number | 630 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
619 | * of reclaimed pages | 631 | * of reclaimed pages |
620 | */ | 632 | */ |
621 | static unsigned long shrink_inactive_list(unsigned long max_scan, | 633 | static unsigned long shrink_inactive_list(unsigned long max_scan, |
622 | struct zone *zone, struct scan_control *sc) | 634 | struct zone *zone, struct scan_control *sc) |
623 | { | 635 | { |
624 | LIST_HEAD(page_list); | 636 | LIST_HEAD(page_list); |
625 | struct pagevec pvec; | 637 | struct pagevec pvec; |
626 | unsigned long nr_scanned = 0; | 638 | unsigned long nr_scanned = 0; |
627 | unsigned long nr_reclaimed = 0; | 639 | unsigned long nr_reclaimed = 0; |
628 | 640 | ||
629 | pagevec_init(&pvec, 1); | 641 | pagevec_init(&pvec, 1); |
630 | 642 | ||
631 | lru_add_drain(); | 643 | lru_add_drain(); |
632 | spin_lock_irq(&zone->lru_lock); | 644 | spin_lock_irq(&zone->lru_lock); |
633 | do { | 645 | do { |
634 | struct page *page; | 646 | struct page *page; |
635 | unsigned long nr_taken; | 647 | unsigned long nr_taken; |
636 | unsigned long nr_scan; | 648 | unsigned long nr_scan; |
637 | unsigned long nr_freed; | 649 | unsigned long nr_freed; |
638 | 650 | ||
639 | nr_taken = isolate_lru_pages(sc->swap_cluster_max, | 651 | nr_taken = isolate_lru_pages(sc->swap_cluster_max, |
640 | &zone->inactive_list, | 652 | &zone->inactive_list, |
641 | &page_list, &nr_scan); | 653 | &page_list, &nr_scan); |
642 | zone->nr_inactive -= nr_taken; | 654 | zone->nr_inactive -= nr_taken; |
643 | zone->pages_scanned += nr_scan; | 655 | zone->pages_scanned += nr_scan; |
644 | spin_unlock_irq(&zone->lru_lock); | 656 | spin_unlock_irq(&zone->lru_lock); |
645 | 657 | ||
646 | nr_scanned += nr_scan; | 658 | nr_scanned += nr_scan; |
647 | nr_freed = shrink_page_list(&page_list, sc); | 659 | nr_freed = shrink_page_list(&page_list, sc); |
648 | nr_reclaimed += nr_freed; | 660 | nr_reclaimed += nr_freed; |
649 | local_irq_disable(); | 661 | local_irq_disable(); |
650 | if (current_is_kswapd()) { | 662 | if (current_is_kswapd()) { |
651 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | 663 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); |
652 | __mod_page_state(kswapd_steal, nr_freed); | 664 | __mod_page_state(kswapd_steal, nr_freed); |
653 | } else | 665 | } else |
654 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); | 666 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); |
655 | __mod_page_state_zone(zone, pgsteal, nr_freed); | 667 | __mod_page_state_zone(zone, pgsteal, nr_freed); |
656 | 668 | ||
657 | if (nr_taken == 0) | 669 | if (nr_taken == 0) |
658 | goto done; | 670 | goto done; |
659 | 671 | ||
660 | spin_lock(&zone->lru_lock); | 672 | spin_lock(&zone->lru_lock); |
661 | /* | 673 | /* |
662 | * Put back any unfreeable pages. | 674 | * Put back any unfreeable pages. |
663 | */ | 675 | */ |
664 | while (!list_empty(&page_list)) { | 676 | while (!list_empty(&page_list)) { |
665 | page = lru_to_page(&page_list); | 677 | page = lru_to_page(&page_list); |
666 | BUG_ON(PageLRU(page)); | 678 | BUG_ON(PageLRU(page)); |
667 | SetPageLRU(page); | 679 | SetPageLRU(page); |
668 | list_del(&page->lru); | 680 | list_del(&page->lru); |
669 | if (PageActive(page)) | 681 | if (PageActive(page)) |
670 | add_page_to_active_list(zone, page); | 682 | add_page_to_active_list(zone, page); |
671 | else | 683 | else |
672 | add_page_to_inactive_list(zone, page); | 684 | add_page_to_inactive_list(zone, page); |
673 | if (!pagevec_add(&pvec, page)) { | 685 | if (!pagevec_add(&pvec, page)) { |
674 | spin_unlock_irq(&zone->lru_lock); | 686 | spin_unlock_irq(&zone->lru_lock); |
675 | __pagevec_release(&pvec); | 687 | __pagevec_release(&pvec); |
676 | spin_lock_irq(&zone->lru_lock); | 688 | spin_lock_irq(&zone->lru_lock); |
677 | } | 689 | } |
678 | } | 690 | } |
679 | } while (nr_scanned < max_scan); | 691 | } while (nr_scanned < max_scan); |
680 | spin_unlock(&zone->lru_lock); | 692 | spin_unlock(&zone->lru_lock); |
681 | done: | 693 | done: |
682 | local_irq_enable(); | 694 | local_irq_enable(); |
683 | pagevec_release(&pvec); | 695 | pagevec_release(&pvec); |
684 | return nr_reclaimed; | 696 | return nr_reclaimed; |
685 | } | 697 | } |
686 | 698 | ||
687 | /* | 699 | /* |
688 | * This moves pages from the active list to the inactive list. | 700 | * This moves pages from the active list to the inactive list. |
689 | * | 701 | * |
690 | * We move them the other way if the page is referenced by one or more | 702 | * We move them the other way if the page is referenced by one or more |
691 | * processes, from rmap. | 703 | * processes, from rmap. |
692 | * | 704 | * |
693 | * If the pages are mostly unmapped, the processing is fast and it is | 705 | * If the pages are mostly unmapped, the processing is fast and it is |
694 | * appropriate to hold zone->lru_lock across the whole operation. But if | 706 | * appropriate to hold zone->lru_lock across the whole operation. But if |
695 | * the pages are mapped, the processing is slow (page_referenced()) so we | 707 | * the pages are mapped, the processing is slow (page_referenced()) so we |
696 | * should drop zone->lru_lock around each page. It's impossible to balance | 708 | * should drop zone->lru_lock around each page. It's impossible to balance |
697 | * this, so instead we remove the pages from the LRU while processing them. | 709 | * this, so instead we remove the pages from the LRU while processing them. |
698 | * It is safe to rely on PG_active against the non-LRU pages in here because | 710 | * It is safe to rely on PG_active against the non-LRU pages in here because |
699 | * nobody will play with that bit on a non-LRU page. | 711 | * nobody will play with that bit on a non-LRU page. |
700 | * | 712 | * |
701 | * The downside is that we have to touch page->_count against each page. | 713 | * The downside is that we have to touch page->_count against each page. |
702 | * But we had to alter page->flags anyway. | 714 | * But we had to alter page->flags anyway. |
703 | */ | 715 | */ |
704 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 716 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
705 | struct scan_control *sc) | 717 | struct scan_control *sc) |
706 | { | 718 | { |
707 | unsigned long pgmoved; | 719 | unsigned long pgmoved; |
708 | int pgdeactivate = 0; | 720 | int pgdeactivate = 0; |
709 | unsigned long pgscanned; | 721 | unsigned long pgscanned; |
710 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 722 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
711 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ | 723 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ |
712 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ | 724 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ |
713 | struct page *page; | 725 | struct page *page; |
714 | struct pagevec pvec; | 726 | struct pagevec pvec; |
715 | int reclaim_mapped = 0; | 727 | int reclaim_mapped = 0; |
716 | 728 | ||
717 | if (sc->may_swap) { | 729 | if (sc->may_swap) { |
718 | long mapped_ratio; | 730 | long mapped_ratio; |
719 | long distress; | 731 | long distress; |
720 | long swap_tendency; | 732 | long swap_tendency; |
721 | 733 | ||
722 | /* | 734 | /* |
723 | * `distress' is a measure of how much trouble we're having | 735 | * `distress' is a measure of how much trouble we're having |
724 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | 736 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. |
725 | */ | 737 | */ |
726 | distress = 100 >> zone->prev_priority; | 738 | distress = 100 >> zone->prev_priority; |
727 | 739 | ||
728 | /* | 740 | /* |
729 | * The point of this algorithm is to decide when to start | 741 | * The point of this algorithm is to decide when to start |
730 | * reclaiming mapped memory instead of just pagecache. Work out | 742 | * reclaiming mapped memory instead of just pagecache. Work out |
731 | * how much memory | 743 | * how much memory |
732 | * is mapped. | 744 | * is mapped. |
733 | */ | 745 | */ |
734 | mapped_ratio = (sc->nr_mapped * 100) / total_memory; | 746 | mapped_ratio = (sc->nr_mapped * 100) / total_memory; |
735 | 747 | ||
736 | /* | 748 | /* |
737 | * Now decide how much we really want to unmap some pages. The | 749 | * Now decide how much we really want to unmap some pages. The |
738 | * mapped ratio is downgraded - just because there's a lot of | 750 | * mapped ratio is downgraded - just because there's a lot of |
739 | * mapped memory doesn't necessarily mean that page reclaim | 751 | * mapped memory doesn't necessarily mean that page reclaim |
740 | * isn't succeeding. | 752 | * isn't succeeding. |
741 | * | 753 | * |
742 | * The distress ratio is important - we don't want to start | 754 | * The distress ratio is important - we don't want to start |
743 | * going oom. | 755 | * going oom. |
744 | * | 756 | * |
745 | * A 100% value of vm_swappiness overrides this algorithm | 757 | * A 100% value of vm_swappiness overrides this algorithm |
746 | * altogether. | 758 | * altogether. |
747 | */ | 759 | */ |
748 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; | 760 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; |
749 | 761 | ||
750 | /* | 762 | /* |
751 | * Now use this metric to decide whether to start moving mapped | 763 | * Now use this metric to decide whether to start moving mapped |
752 | * memory onto the inactive list. | 764 | * memory onto the inactive list. |
753 | */ | 765 | */ |
754 | if (swap_tendency >= 100) | 766 | if (swap_tendency >= 100) |
755 | reclaim_mapped = 1; | 767 | reclaim_mapped = 1; |
756 | } | 768 | } |
757 | 769 | ||
758 | lru_add_drain(); | 770 | lru_add_drain(); |
759 | spin_lock_irq(&zone->lru_lock); | 771 | spin_lock_irq(&zone->lru_lock); |
760 | pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, | 772 | pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, |
761 | &l_hold, &pgscanned); | 773 | &l_hold, &pgscanned); |
762 | zone->pages_scanned += pgscanned; | 774 | zone->pages_scanned += pgscanned; |
763 | zone->nr_active -= pgmoved; | 775 | zone->nr_active -= pgmoved; |
764 | spin_unlock_irq(&zone->lru_lock); | 776 | spin_unlock_irq(&zone->lru_lock); |
765 | 777 | ||
766 | while (!list_empty(&l_hold)) { | 778 | while (!list_empty(&l_hold)) { |
767 | cond_resched(); | 779 | cond_resched(); |
768 | page = lru_to_page(&l_hold); | 780 | page = lru_to_page(&l_hold); |
769 | list_del(&page->lru); | 781 | list_del(&page->lru); |
770 | if (page_mapped(page)) { | 782 | if (page_mapped(page)) { |
771 | if (!reclaim_mapped || | 783 | if (!reclaim_mapped || |
772 | (total_swap_pages == 0 && PageAnon(page)) || | 784 | (total_swap_pages == 0 && PageAnon(page)) || |
773 | page_referenced(page, 0)) { | 785 | page_referenced(page, 0)) { |
774 | list_add(&page->lru, &l_active); | 786 | list_add(&page->lru, &l_active); |
775 | continue; | 787 | continue; |
776 | } | 788 | } |
777 | } | 789 | } |
778 | list_add(&page->lru, &l_inactive); | 790 | list_add(&page->lru, &l_inactive); |
779 | } | 791 | } |
780 | 792 | ||
781 | pagevec_init(&pvec, 1); | 793 | pagevec_init(&pvec, 1); |
782 | pgmoved = 0; | 794 | pgmoved = 0; |
783 | spin_lock_irq(&zone->lru_lock); | 795 | spin_lock_irq(&zone->lru_lock); |
784 | while (!list_empty(&l_inactive)) { | 796 | while (!list_empty(&l_inactive)) { |
785 | page = lru_to_page(&l_inactive); | 797 | page = lru_to_page(&l_inactive); |
786 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 798 | prefetchw_prev_lru_page(page, &l_inactive, flags); |
787 | BUG_ON(PageLRU(page)); | 799 | BUG_ON(PageLRU(page)); |
788 | SetPageLRU(page); | 800 | SetPageLRU(page); |
789 | BUG_ON(!PageActive(page)); | 801 | BUG_ON(!PageActive(page)); |
790 | ClearPageActive(page); | 802 | ClearPageActive(page); |
791 | 803 | ||
792 | list_move(&page->lru, &zone->inactive_list); | 804 | list_move(&page->lru, &zone->inactive_list); |
793 | pgmoved++; | 805 | pgmoved++; |
794 | if (!pagevec_add(&pvec, page)) { | 806 | if (!pagevec_add(&pvec, page)) { |
795 | zone->nr_inactive += pgmoved; | 807 | zone->nr_inactive += pgmoved; |
796 | spin_unlock_irq(&zone->lru_lock); | 808 | spin_unlock_irq(&zone->lru_lock); |
797 | pgdeactivate += pgmoved; | 809 | pgdeactivate += pgmoved; |
798 | pgmoved = 0; | 810 | pgmoved = 0; |
799 | if (buffer_heads_over_limit) | 811 | if (buffer_heads_over_limit) |
800 | pagevec_strip(&pvec); | 812 | pagevec_strip(&pvec); |
801 | __pagevec_release(&pvec); | 813 | __pagevec_release(&pvec); |
802 | spin_lock_irq(&zone->lru_lock); | 814 | spin_lock_irq(&zone->lru_lock); |
803 | } | 815 | } |
804 | } | 816 | } |
805 | zone->nr_inactive += pgmoved; | 817 | zone->nr_inactive += pgmoved; |
806 | pgdeactivate += pgmoved; | 818 | pgdeactivate += pgmoved; |
807 | if (buffer_heads_over_limit) { | 819 | if (buffer_heads_over_limit) { |
808 | spin_unlock_irq(&zone->lru_lock); | 820 | spin_unlock_irq(&zone->lru_lock); |
809 | pagevec_strip(&pvec); | 821 | pagevec_strip(&pvec); |
810 | spin_lock_irq(&zone->lru_lock); | 822 | spin_lock_irq(&zone->lru_lock); |
811 | } | 823 | } |
812 | 824 | ||
813 | pgmoved = 0; | 825 | pgmoved = 0; |
814 | while (!list_empty(&l_active)) { | 826 | while (!list_empty(&l_active)) { |
815 | page = lru_to_page(&l_active); | 827 | page = lru_to_page(&l_active); |
816 | prefetchw_prev_lru_page(page, &l_active, flags); | 828 | prefetchw_prev_lru_page(page, &l_active, flags); |
817 | BUG_ON(PageLRU(page)); | 829 | BUG_ON(PageLRU(page)); |
818 | SetPageLRU(page); | 830 | SetPageLRU(page); |
819 | BUG_ON(!PageActive(page)); | 831 | BUG_ON(!PageActive(page)); |
820 | list_move(&page->lru, &zone->active_list); | 832 | list_move(&page->lru, &zone->active_list); |
821 | pgmoved++; | 833 | pgmoved++; |
822 | if (!pagevec_add(&pvec, page)) { | 834 | if (!pagevec_add(&pvec, page)) { |
823 | zone->nr_active += pgmoved; | 835 | zone->nr_active += pgmoved; |
824 | pgmoved = 0; | 836 | pgmoved = 0; |
825 | spin_unlock_irq(&zone->lru_lock); | 837 | spin_unlock_irq(&zone->lru_lock); |
826 | __pagevec_release(&pvec); | 838 | __pagevec_release(&pvec); |
827 | spin_lock_irq(&zone->lru_lock); | 839 | spin_lock_irq(&zone->lru_lock); |
828 | } | 840 | } |
829 | } | 841 | } |
830 | zone->nr_active += pgmoved; | 842 | zone->nr_active += pgmoved; |
831 | spin_unlock(&zone->lru_lock); | 843 | spin_unlock(&zone->lru_lock); |
832 | 844 | ||
833 | __mod_page_state_zone(zone, pgrefill, pgscanned); | 845 | __mod_page_state_zone(zone, pgrefill, pgscanned); |
834 | __mod_page_state(pgdeactivate, pgdeactivate); | 846 | __mod_page_state(pgdeactivate, pgdeactivate); |
835 | local_irq_enable(); | 847 | local_irq_enable(); |
836 | 848 | ||
837 | pagevec_release(&pvec); | 849 | pagevec_release(&pvec); |
838 | } | 850 | } |
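The reclaim_mapped decision at the top of shrink_active_list() reduces to a small arithmetic heuristic. The standalone sketch below mirrors the formula in the code above; the helper name and plain C types are illustrative and not part of this patch or the kernel source.

        /*
         * Sketch of the swap_tendency heuristic (illustrative helper, not
         * kernel code).  distress grows as reclaim priority drops,
         * mapped_ratio measures how much of memory is mapped, and
         * swappiness is the sysctl knob.
         */
        static int should_reclaim_mapped(int prev_priority, long nr_mapped,
                                         long total_memory, int swappiness)
        {
                long distress = 100 >> prev_priority;   /* 0 = no trouble, 100 = great trouble */
                long mapped_ratio = nr_mapped * 100 / total_memory;
                long swap_tendency = mapped_ratio / 2 + distress + swappiness;

                return swap_tendency >= 100;            /* 1: start deactivating mapped pages */
        }

With the default vm_swappiness of 60, a heavily mapped workload tips the balance even before reclaim runs into trouble, while a swappiness of 100 makes the sum reach 100 unconditionally, which is exactly the override the comment above describes.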
839 | 851 | ||
840 | /* | 852 | /* |
841 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 853 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
842 | */ | 854 | */ |
843 | static unsigned long shrink_zone(int priority, struct zone *zone, | 855 | static unsigned long shrink_zone(int priority, struct zone *zone, |
844 | struct scan_control *sc) | 856 | struct scan_control *sc) |
845 | { | 857 | { |
846 | unsigned long nr_active; | 858 | unsigned long nr_active; |
847 | unsigned long nr_inactive; | 859 | unsigned long nr_inactive; |
848 | unsigned long nr_to_scan; | 860 | unsigned long nr_to_scan; |
849 | unsigned long nr_reclaimed = 0; | 861 | unsigned long nr_reclaimed = 0; |
850 | 862 | ||
851 | atomic_inc(&zone->reclaim_in_progress); | 863 | atomic_inc(&zone->reclaim_in_progress); |
852 | 864 | ||
853 | /* | 865 | /* |
854 | * Add one to `nr_to_scan' just to make sure that the kernel will | 866 | * Add one to `nr_to_scan' just to make sure that the kernel will |
855 | * slowly sift through the active list. | 867 | * slowly sift through the active list. |
856 | */ | 868 | */ |
857 | zone->nr_scan_active += (zone->nr_active >> priority) + 1; | 869 | zone->nr_scan_active += (zone->nr_active >> priority) + 1; |
858 | nr_active = zone->nr_scan_active; | 870 | nr_active = zone->nr_scan_active; |
859 | if (nr_active >= sc->swap_cluster_max) | 871 | if (nr_active >= sc->swap_cluster_max) |
860 | zone->nr_scan_active = 0; | 872 | zone->nr_scan_active = 0; |
861 | else | 873 | else |
862 | nr_active = 0; | 874 | nr_active = 0; |
863 | 875 | ||
864 | zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; | 876 | zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; |
865 | nr_inactive = zone->nr_scan_inactive; | 877 | nr_inactive = zone->nr_scan_inactive; |
866 | if (nr_inactive >= sc->swap_cluster_max) | 878 | if (nr_inactive >= sc->swap_cluster_max) |
867 | zone->nr_scan_inactive = 0; | 879 | zone->nr_scan_inactive = 0; |
868 | else | 880 | else |
869 | nr_inactive = 0; | 881 | nr_inactive = 0; |
870 | 882 | ||
871 | while (nr_active || nr_inactive) { | 883 | while (nr_active || nr_inactive) { |
872 | if (nr_active) { | 884 | if (nr_active) { |
873 | nr_to_scan = min(nr_active, | 885 | nr_to_scan = min(nr_active, |
874 | (unsigned long)sc->swap_cluster_max); | 886 | (unsigned long)sc->swap_cluster_max); |
875 | nr_active -= nr_to_scan; | 887 | nr_active -= nr_to_scan; |
876 | shrink_active_list(nr_to_scan, zone, sc); | 888 | shrink_active_list(nr_to_scan, zone, sc); |
877 | } | 889 | } |
878 | 890 | ||
879 | if (nr_inactive) { | 891 | if (nr_inactive) { |
880 | nr_to_scan = min(nr_inactive, | 892 | nr_to_scan = min(nr_inactive, |
881 | (unsigned long)sc->swap_cluster_max); | 893 | (unsigned long)sc->swap_cluster_max); |
882 | nr_inactive -= nr_to_scan; | 894 | nr_inactive -= nr_to_scan; |
883 | nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, | 895 | nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, |
884 | sc); | 896 | sc); |
885 | } | 897 | } |
886 | } | 898 | } |
887 | 899 | ||
888 | throttle_vm_writeout(); | 900 | throttle_vm_writeout(); |
889 | 901 | ||
890 | atomic_dec(&zone->reclaim_in_progress); | 902 | atomic_dec(&zone->reclaim_in_progress); |
891 | return nr_reclaimed; | 903 | return nr_reclaimed; |
892 | } | 904 | } |
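shrink_zone() batches its work: each invocation credits the zone with (lru_size >> priority) + 1 pages to scan, but only spends that credit once it reaches swap_cluster_max, and then scans in swap_cluster_max chunks. A rough sketch of that accounting, with plain variables standing in for the zone fields (the helper is hypothetical, not kernel code):

        /*
         * Illustrative batching helper.  Returns how many pages to scan on
         * this call, or 0 if not enough credit has accumulated yet.
         */
        static unsigned long lru_scan_credit(unsigned long *pending,
                                             unsigned long lru_size,
                                             int priority,
                                             unsigned long swap_cluster_max)
        {
                unsigned long nr;

                *pending += (lru_size >> priority) + 1;
                if (*pending < swap_cluster_max)
                        return 0;               /* keep accumulating for a later call */
                nr = *pending;
                *pending = 0;                   /* credit consumed */
                return nr;                      /* caller scans this many, swap_cluster_max at a time */
        }

The "+ 1" guarantees forward progress even at the lowest priorities, which is what the "slowly sift through the active list" comment in shrink_zone() is getting at.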
893 | 905 | ||
894 | /* | 906 | /* |
895 | * This is the direct reclaim path, for page-allocating processes. We only | 907 | * This is the direct reclaim path, for page-allocating processes. We only |
896 | * try to reclaim pages from zones which will satisfy the caller's allocation | 908 | * try to reclaim pages from zones which will satisfy the caller's allocation |
897 | * request. | 909 | * request. |
898 | * | 910 | * |
899 | * We reclaim from a zone even if that zone is over pages_high. Because: | 911 | * We reclaim from a zone even if that zone is over pages_high. Because: |
900 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order | 912 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order |
901 | * allocation or | 913 | * allocation or |
902 | * b) The zones may be over pages_high but they must go *over* pages_high to | 914 | * b) The zones may be over pages_high but they must go *over* pages_high to |
903 | * satisfy the `incremental min' zone defense algorithm. | 915 | * satisfy the `incremental min' zone defense algorithm. |
904 | * | 916 | * |
905 | * Returns the number of reclaimed pages. | 917 | * Returns the number of reclaimed pages. |
906 | * | 918 | * |
907 | * If a zone is deemed to be full of pinned pages then just give it a light | 919 | * If a zone is deemed to be full of pinned pages then just give it a light |
908 | * scan then give up on it. | 920 | * scan then give up on it. |
909 | */ | 921 | */ |
910 | static unsigned long shrink_zones(int priority, struct zone **zones, | 922 | static unsigned long shrink_zones(int priority, struct zone **zones, |
911 | struct scan_control *sc) | 923 | struct scan_control *sc) |
912 | { | 924 | { |
913 | unsigned long nr_reclaimed = 0; | 925 | unsigned long nr_reclaimed = 0; |
914 | int i; | 926 | int i; |
915 | 927 | ||
916 | for (i = 0; zones[i] != NULL; i++) { | 928 | for (i = 0; zones[i] != NULL; i++) { |
917 | struct zone *zone = zones[i]; | 929 | struct zone *zone = zones[i]; |
918 | 930 | ||
919 | if (!populated_zone(zone)) | 931 | if (!populated_zone(zone)) |
920 | continue; | 932 | continue; |
921 | 933 | ||
922 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 934 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
923 | continue; | 935 | continue; |
924 | 936 | ||
925 | zone->temp_priority = priority; | 937 | zone->temp_priority = priority; |
926 | if (zone->prev_priority > priority) | 938 | if (zone->prev_priority > priority) |
927 | zone->prev_priority = priority; | 939 | zone->prev_priority = priority; |
928 | 940 | ||
929 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 941 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
930 | continue; /* Let kswapd poll it */ | 942 | continue; /* Let kswapd poll it */ |
931 | 943 | ||
932 | nr_reclaimed += shrink_zone(priority, zone, sc); | 944 | nr_reclaimed += shrink_zone(priority, zone, sc); |
933 | } | 945 | } |
934 | return nr_reclaimed; | 946 | return nr_reclaimed; |
935 | } | 947 | } |
936 | 948 | ||
937 | /* | 949 | /* |
938 | * This is the main entry point to direct page reclaim. | 950 | * This is the main entry point to direct page reclaim. |
939 | * | 951 | * |
940 | * If a full scan of the inactive list fails to free enough memory then we | 952 | * If a full scan of the inactive list fails to free enough memory then we |
941 | * are "out of memory" and something needs to be killed. | 953 | * are "out of memory" and something needs to be killed. |
942 | * | 954 | * |
943 | * If the caller is !__GFP_FS then the probability of a failure is reasonably | 955 | * If the caller is !__GFP_FS then the probability of a failure is reasonably |
944 | * high - the zone may be full of dirty or under-writeback pages, which this | 956 | * high - the zone may be full of dirty or under-writeback pages, which this |
945 | * caller can't do much about. We kick pdflush and take explicit naps in the | 957 | * caller can't do much about. We kick pdflush and take explicit naps in the |
946 | * hope that some of these pages can be written. But if the allocating task | 958 | * hope that some of these pages can be written. But if the allocating task |
947 | * holds filesystem locks which prevent writeout this might not work, and the | 959 | * holds filesystem locks which prevent writeout this might not work, and the |
948 | * allocation attempt will fail. | 960 | * allocation attempt will fail. |
949 | */ | 961 | */ |
950 | unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | 962 | unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) |
951 | { | 963 | { |
952 | int priority; | 964 | int priority; |
953 | int ret = 0; | 965 | int ret = 0; |
954 | unsigned long total_scanned = 0; | 966 | unsigned long total_scanned = 0; |
955 | unsigned long nr_reclaimed = 0; | 967 | unsigned long nr_reclaimed = 0; |
956 | struct reclaim_state *reclaim_state = current->reclaim_state; | 968 | struct reclaim_state *reclaim_state = current->reclaim_state; |
957 | unsigned long lru_pages = 0; | 969 | unsigned long lru_pages = 0; |
958 | int i; | 970 | int i; |
959 | struct scan_control sc = { | 971 | struct scan_control sc = { |
960 | .gfp_mask = gfp_mask, | 972 | .gfp_mask = gfp_mask, |
961 | .may_writepage = !laptop_mode, | 973 | .may_writepage = !laptop_mode, |
962 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 974 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
963 | .may_swap = 1, | 975 | .may_swap = 1, |
964 | .swappiness = vm_swappiness, | 976 | .swappiness = vm_swappiness, |
965 | }; | 977 | }; |
966 | 978 | ||
967 | inc_page_state(allocstall); | 979 | inc_page_state(allocstall); |
968 | 980 | ||
969 | for (i = 0; zones[i] != NULL; i++) { | 981 | for (i = 0; zones[i] != NULL; i++) { |
970 | struct zone *zone = zones[i]; | 982 | struct zone *zone = zones[i]; |
971 | 983 | ||
972 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 984 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
973 | continue; | 985 | continue; |
974 | 986 | ||
975 | zone->temp_priority = DEF_PRIORITY; | 987 | zone->temp_priority = DEF_PRIORITY; |
976 | lru_pages += zone->nr_active + zone->nr_inactive; | 988 | lru_pages += zone->nr_active + zone->nr_inactive; |
977 | } | 989 | } |
978 | 990 | ||
979 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 991 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
980 | sc.nr_mapped = read_page_state(nr_mapped); | 992 | sc.nr_mapped = read_page_state(nr_mapped); |
981 | sc.nr_scanned = 0; | 993 | sc.nr_scanned = 0; |
982 | if (!priority) | 994 | if (!priority) |
983 | disable_swap_token(); | 995 | disable_swap_token(); |
984 | nr_reclaimed += shrink_zones(priority, zones, &sc); | 996 | nr_reclaimed += shrink_zones(priority, zones, &sc); |
985 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); | 997 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); |
986 | if (reclaim_state) { | 998 | if (reclaim_state) { |
987 | nr_reclaimed += reclaim_state->reclaimed_slab; | 999 | nr_reclaimed += reclaim_state->reclaimed_slab; |
988 | reclaim_state->reclaimed_slab = 0; | 1000 | reclaim_state->reclaimed_slab = 0; |
989 | } | 1001 | } |
990 | total_scanned += sc.nr_scanned; | 1002 | total_scanned += sc.nr_scanned; |
991 | if (nr_reclaimed >= sc.swap_cluster_max) { | 1003 | if (nr_reclaimed >= sc.swap_cluster_max) { |
992 | ret = 1; | 1004 | ret = 1; |
993 | goto out; | 1005 | goto out; |
994 | } | 1006 | } |
995 | 1007 | ||
996 | /* | 1008 | /* |
997 | * Try to write back as many pages as we just scanned. This | 1009 | * Try to write back as many pages as we just scanned. This |
998 | * tends to cause slow streaming writers to write data to the | 1010 | * tends to cause slow streaming writers to write data to the |
999 | * disk smoothly, at the dirtying rate, which is nice. But | 1011 | * disk smoothly, at the dirtying rate, which is nice. But |
1000 | * that's undesirable in laptop mode, where we *want* lumpy | 1012 | * that's undesirable in laptop mode, where we *want* lumpy |
1001 | * writeout. So in laptop mode, write out the whole world. | 1013 | * writeout. So in laptop mode, write out the whole world. |
1002 | */ | 1014 | */ |
1003 | if (total_scanned > sc.swap_cluster_max + | 1015 | if (total_scanned > sc.swap_cluster_max + |
1004 | sc.swap_cluster_max / 2) { | 1016 | sc.swap_cluster_max / 2) { |
1005 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); | 1017 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); |
1006 | sc.may_writepage = 1; | 1018 | sc.may_writepage = 1; |
1007 | } | 1019 | } |
1008 | 1020 | ||
1009 | /* Take a nap, wait for some writeback to complete */ | 1021 | /* Take a nap, wait for some writeback to complete */ |
1010 | if (sc.nr_scanned && priority < DEF_PRIORITY - 2) | 1022 | if (sc.nr_scanned && priority < DEF_PRIORITY - 2) |
1011 | blk_congestion_wait(WRITE, HZ/10); | 1023 | blk_congestion_wait(WRITE, HZ/10); |
1012 | } | 1024 | } |
1013 | out: | 1025 | out: |
1014 | for (i = 0; zones[i] != 0; i++) { | 1026 | for (i = 0; zones[i] != 0; i++) { |
1015 | struct zone *zone = zones[i]; | 1027 | struct zone *zone = zones[i]; |
1016 | 1028 | ||
1017 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 1029 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
1018 | continue; | 1030 | continue; |
1019 | 1031 | ||
1020 | zone->prev_priority = zone->temp_priority; | 1032 | zone->prev_priority = zone->temp_priority; |
1021 | } | 1033 | } |
1022 | return ret; | 1034 | return ret; |
1023 | } | 1035 | } |
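try_to_free_pages() only kicks background writeback once it has scanned noticeably more than one reclaim batch: one and a half times swap_cluster_max. A sketch of that condition (hypothetical helper, not kernel code):

        /* Illustrative only: when direct reclaim wakes pdflush and enables writepage. */
        static int should_kick_writeback(unsigned long total_scanned,
                                         unsigned long swap_cluster_max)
        {
                return total_scanned > swap_cluster_max + swap_cluster_max / 2;
        }

Passing 0 to wakeup_pdflush() in laptop mode asks for everything to be written out at once (the "lumpy writeout" the comment wants); otherwise the target is the number of pages just scanned, which keeps streaming writers flowing smoothly.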
1024 | 1036 | ||
1025 | /* | 1037 | /* |
1026 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1038 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1027 | * they are all at pages_high. | 1039 | * they are all at pages_high. |
1028 | * | 1040 | * |
1029 | * Returns the number of pages which were actually freed. | 1041 | * Returns the number of pages which were actually freed. |
1030 | * | 1042 | * |
1031 | * There is special handling here for zones which are full of pinned pages. | 1043 | * There is special handling here for zones which are full of pinned pages. |
1032 | * This can happen if the pages are all mlocked, or if they are all used by | 1044 | * This can happen if the pages are all mlocked, or if they are all used by |
1033 | * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. | 1045 | * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. |
1034 | * What we do is to detect the case where all pages in the zone have been | 1046 | * What we do is to detect the case where all pages in the zone have been |
1035 | * scanned twice and there has been zero successful reclaim. Mark the zone as | 1047 | * scanned twice and there has been zero successful reclaim. Mark the zone as |
1036 | * dead and from now on, only perform a short scan. Basically we're polling | 1048 | * dead and from now on, only perform a short scan. Basically we're polling |
1037 | * the zone for when the problem goes away. | 1049 | * the zone for when the problem goes away. |
1038 | * | 1050 | * |
1039 | * kswapd scans the zones in the highmem->normal->dma direction. It skips | 1051 | * kswapd scans the zones in the highmem->normal->dma direction. It skips |
1040 | * zones which have free_pages > pages_high, but once a zone is found to have | 1052 | * zones which have free_pages > pages_high, but once a zone is found to have |
1041 | * free_pages <= pages_high, we scan that zone and the lower zones regardless | 1053 | * free_pages <= pages_high, we scan that zone and the lower zones regardless |
1042 | * of the number of free pages in the lower zones. This interoperates with | 1054 | * of the number of free pages in the lower zones. This interoperates with |
1043 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1055 | * the page allocator fallback scheme to ensure that aging of pages is balanced |
1044 | * across the zones. | 1056 | * across the zones. |
1045 | */ | 1057 | */ |
1046 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 1058 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) |
1047 | { | 1059 | { |
1048 | int all_zones_ok; | 1060 | int all_zones_ok; |
1049 | int priority; | 1061 | int priority; |
1050 | int i; | 1062 | int i; |
1051 | unsigned long total_scanned; | 1063 | unsigned long total_scanned; |
1052 | unsigned long nr_reclaimed; | 1064 | unsigned long nr_reclaimed; |
1053 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1065 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1054 | struct scan_control sc = { | 1066 | struct scan_control sc = { |
1055 | .gfp_mask = GFP_KERNEL, | 1067 | .gfp_mask = GFP_KERNEL, |
1056 | .may_swap = 1, | 1068 | .may_swap = 1, |
1057 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1069 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
1058 | .swappiness = vm_swappiness, | 1070 | .swappiness = vm_swappiness, |
1059 | }; | 1071 | }; |
1060 | 1072 | ||
1061 | loop_again: | 1073 | loop_again: |
1062 | total_scanned = 0; | 1074 | total_scanned = 0; |
1063 | nr_reclaimed = 0; | 1075 | nr_reclaimed = 0; |
1064 | sc.may_writepage = !laptop_mode; | 1076 | sc.may_writepage = !laptop_mode; |
1065 | sc.nr_mapped = read_page_state(nr_mapped); | 1077 | sc.nr_mapped = read_page_state(nr_mapped); |
1066 | 1078 | ||
1067 | inc_page_state(pageoutrun); | 1079 | inc_page_state(pageoutrun); |
1068 | 1080 | ||
1069 | for (i = 0; i < pgdat->nr_zones; i++) { | 1081 | for (i = 0; i < pgdat->nr_zones; i++) { |
1070 | struct zone *zone = pgdat->node_zones + i; | 1082 | struct zone *zone = pgdat->node_zones + i; |
1071 | 1083 | ||
1072 | zone->temp_priority = DEF_PRIORITY; | 1084 | zone->temp_priority = DEF_PRIORITY; |
1073 | } | 1085 | } |
1074 | 1086 | ||
1075 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 1087 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1076 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 1088 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
1077 | unsigned long lru_pages = 0; | 1089 | unsigned long lru_pages = 0; |
1078 | 1090 | ||
1079 | /* The swap token gets in the way of swapout... */ | 1091 | /* The swap token gets in the way of swapout... */ |
1080 | if (!priority) | 1092 | if (!priority) |
1081 | disable_swap_token(); | 1093 | disable_swap_token(); |
1082 | 1094 | ||
1083 | all_zones_ok = 1; | 1095 | all_zones_ok = 1; |
1084 | 1096 | ||
1085 | /* | 1097 | /* |
1086 | * Scan in the highmem->dma direction for the highest | 1098 | * Scan in the highmem->dma direction for the highest |
1087 | * zone which needs scanning | 1099 | * zone which needs scanning |
1088 | */ | 1100 | */ |
1089 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { | 1101 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { |
1090 | struct zone *zone = pgdat->node_zones + i; | 1102 | struct zone *zone = pgdat->node_zones + i; |
1091 | 1103 | ||
1092 | if (!populated_zone(zone)) | 1104 | if (!populated_zone(zone)) |
1093 | continue; | 1105 | continue; |
1094 | 1106 | ||
1095 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1107 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1096 | continue; | 1108 | continue; |
1097 | 1109 | ||
1098 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1110 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1099 | 0, 0)) { | 1111 | 0, 0)) { |
1100 | end_zone = i; | 1112 | end_zone = i; |
1101 | goto scan; | 1113 | goto scan; |
1102 | } | 1114 | } |
1103 | } | 1115 | } |
1104 | goto out; | 1116 | goto out; |
1105 | scan: | 1117 | scan: |
1106 | for (i = 0; i <= end_zone; i++) { | 1118 | for (i = 0; i <= end_zone; i++) { |
1107 | struct zone *zone = pgdat->node_zones + i; | 1119 | struct zone *zone = pgdat->node_zones + i; |
1108 | 1120 | ||
1109 | lru_pages += zone->nr_active + zone->nr_inactive; | 1121 | lru_pages += zone->nr_active + zone->nr_inactive; |
1110 | } | 1122 | } |
1111 | 1123 | ||
1112 | /* | 1124 | /* |
1113 | * Now scan the zone in the dma->highmem direction, stopping | 1125 | * Now scan the zone in the dma->highmem direction, stopping |
1114 | * at the last zone which needs scanning. | 1126 | * at the last zone which needs scanning. |
1115 | * | 1127 | * |
1116 | * We do this because the page allocator works in the opposite | 1128 | * We do this because the page allocator works in the opposite |
1117 | * direction. This prevents the page allocator from allocating | 1129 | * direction. This prevents the page allocator from allocating |
1118 | * pages behind kswapd's direction of progress, which would | 1130 | * pages behind kswapd's direction of progress, which would |
1119 | * cause too much scanning of the lower zones. | 1131 | * cause too much scanning of the lower zones. |
1120 | */ | 1132 | */ |
1121 | for (i = 0; i <= end_zone; i++) { | 1133 | for (i = 0; i <= end_zone; i++) { |
1122 | struct zone *zone = pgdat->node_zones + i; | 1134 | struct zone *zone = pgdat->node_zones + i; |
1123 | int nr_slab; | 1135 | int nr_slab; |
1124 | 1136 | ||
1125 | if (!populated_zone(zone)) | 1137 | if (!populated_zone(zone)) |
1126 | continue; | 1138 | continue; |
1127 | 1139 | ||
1128 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1140 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1129 | continue; | 1141 | continue; |
1130 | 1142 | ||
1131 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1143 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1132 | end_zone, 0)) | 1144 | end_zone, 0)) |
1133 | all_zones_ok = 0; | 1145 | all_zones_ok = 0; |
1134 | zone->temp_priority = priority; | 1146 | zone->temp_priority = priority; |
1135 | if (zone->prev_priority > priority) | 1147 | if (zone->prev_priority > priority) |
1136 | zone->prev_priority = priority; | 1148 | zone->prev_priority = priority; |
1137 | sc.nr_scanned = 0; | 1149 | sc.nr_scanned = 0; |
1138 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1150 | nr_reclaimed += shrink_zone(priority, zone, &sc); |
1139 | reclaim_state->reclaimed_slab = 0; | 1151 | reclaim_state->reclaimed_slab = 0; |
1140 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1152 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
1141 | lru_pages); | 1153 | lru_pages); |
1142 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1154 | nr_reclaimed += reclaim_state->reclaimed_slab; |
1143 | total_scanned += sc.nr_scanned; | 1155 | total_scanned += sc.nr_scanned; |
1144 | if (zone->all_unreclaimable) | 1156 | if (zone->all_unreclaimable) |
1145 | continue; | 1157 | continue; |
1146 | if (nr_slab == 0 && zone->pages_scanned >= | 1158 | if (nr_slab == 0 && zone->pages_scanned >= |
1147 | (zone->nr_active + zone->nr_inactive) * 4) | 1159 | (zone->nr_active + zone->nr_inactive) * 4) |
1148 | zone->all_unreclaimable = 1; | 1160 | zone->all_unreclaimable = 1; |
1149 | /* | 1161 | /* |
1150 | * If we've done a decent amount of scanning and | 1162 | * If we've done a decent amount of scanning and |
1151 | * the reclaim ratio is low, start doing writepage | 1163 | * the reclaim ratio is low, start doing writepage |
1152 | * even in laptop mode | 1164 | * even in laptop mode |
1153 | */ | 1165 | */ |
1154 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 1166 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
1155 | total_scanned > nr_reclaimed + nr_reclaimed / 2) | 1167 | total_scanned > nr_reclaimed + nr_reclaimed / 2) |
1156 | sc.may_writepage = 1; | 1168 | sc.may_writepage = 1; |
1157 | } | 1169 | } |
1158 | if (all_zones_ok) | 1170 | if (all_zones_ok) |
1159 | break; /* kswapd: all done */ | 1171 | break; /* kswapd: all done */ |
1160 | /* | 1172 | /* |
1161 | * OK, kswapd is getting into trouble. Take a nap, then take | 1173 | * OK, kswapd is getting into trouble. Take a nap, then take |
1162 | * another pass across the zones. | 1174 | * another pass across the zones. |
1163 | */ | 1175 | */ |
1164 | if (total_scanned && priority < DEF_PRIORITY - 2) | 1176 | if (total_scanned && priority < DEF_PRIORITY - 2) |
1165 | blk_congestion_wait(WRITE, HZ/10); | 1177 | blk_congestion_wait(WRITE, HZ/10); |
1166 | 1178 | ||
1167 | /* | 1179 | /* |
1168 | * We do this so kswapd doesn't build up large priorities for | 1180 | * We do this so kswapd doesn't build up large priorities for |
1169 | * example when it is freeing in parallel with allocators. It | 1181 | * example when it is freeing in parallel with allocators. It |
1170 | * matches the direct reclaim path behaviour in terms of impact | 1182 | * matches the direct reclaim path behaviour in terms of impact |
1171 | * on zone->*_priority. | 1183 | * on zone->*_priority. |
1172 | */ | 1184 | */ |
1173 | if (nr_reclaimed >= SWAP_CLUSTER_MAX) | 1185 | if (nr_reclaimed >= SWAP_CLUSTER_MAX) |
1174 | break; | 1186 | break; |
1175 | } | 1187 | } |
1176 | out: | 1188 | out: |
1177 | for (i = 0; i < pgdat->nr_zones; i++) { | 1189 | for (i = 0; i < pgdat->nr_zones; i++) { |
1178 | struct zone *zone = pgdat->node_zones + i; | 1190 | struct zone *zone = pgdat->node_zones + i; |
1179 | 1191 | ||
1180 | zone->prev_priority = zone->temp_priority; | 1192 | zone->prev_priority = zone->temp_priority; |
1181 | } | 1193 | } |
1182 | if (!all_zones_ok) { | 1194 | if (!all_zones_ok) { |
1183 | cond_resched(); | 1195 | cond_resched(); |
1184 | goto loop_again; | 1196 | goto loop_again; |
1185 | } | 1197 | } |
1186 | 1198 | ||
1187 | return nr_reclaimed; | 1199 | return nr_reclaimed; |
1188 | } | 1200 | } |
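balance_pgdat() gives up on zones that are effectively pinned: if a round of slab shrinking frees nothing and the zone has already been scanned to four times its LRU size, the zone is marked all_unreclaimable and only polled lightly from then on. The test condenses to a simple predicate (sketch only; the kernel reads these values straight out of struct zone):

        /* Illustrative predicate for the all_unreclaimable check, not kernel code. */
        static int zone_looks_pinned(unsigned long slab_reclaimed,
                                     unsigned long pages_scanned,
                                     unsigned long nr_active,
                                     unsigned long nr_inactive)
        {
                return slab_reclaimed == 0 &&
                       pages_scanned >= (nr_active + nr_inactive) * 4;
        }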
1189 | 1201 | ||
1190 | /* | 1202 | /* |
1191 | * The background pageout daemon, started as a kernel thread | 1203 | * The background pageout daemon, started as a kernel thread |
1192 | * from the init process. | 1204 | * from the init process. |
1193 | * | 1205 | * |
1194 | * This basically trickles out pages so that we have _some_ | 1206 | * This basically trickles out pages so that we have _some_ |
1195 | * free memory available even if there is no other activity | 1207 | * free memory available even if there is no other activity |
1196 | * that frees anything up. This is needed for things like routing | 1208 | * that frees anything up. This is needed for things like routing |
1197 | * etc, where we otherwise might have all activity going on in | 1209 | * etc, where we otherwise might have all activity going on in |
1198 | * asynchronous contexts that cannot page things out. | 1210 | * asynchronous contexts that cannot page things out. |
1199 | * | 1211 | * |
1200 | * If there are applications that are active memory-allocators | 1212 | * If there are applications that are active memory-allocators |
1201 | * (most normal use), this basically shouldn't matter. | 1213 | * (most normal use), this basically shouldn't matter. |
1202 | */ | 1214 | */ |
1203 | static int kswapd(void *p) | 1215 | static int kswapd(void *p) |
1204 | { | 1216 | { |
1205 | unsigned long order; | 1217 | unsigned long order; |
1206 | pg_data_t *pgdat = (pg_data_t*)p; | 1218 | pg_data_t *pgdat = (pg_data_t*)p; |
1207 | struct task_struct *tsk = current; | 1219 | struct task_struct *tsk = current; |
1208 | DEFINE_WAIT(wait); | 1220 | DEFINE_WAIT(wait); |
1209 | struct reclaim_state reclaim_state = { | 1221 | struct reclaim_state reclaim_state = { |
1210 | .reclaimed_slab = 0, | 1222 | .reclaimed_slab = 0, |
1211 | }; | 1223 | }; |
1212 | cpumask_t cpumask; | 1224 | cpumask_t cpumask; |
1213 | 1225 | ||
1214 | daemonize("kswapd%d", pgdat->node_id); | 1226 | daemonize("kswapd%d", pgdat->node_id); |
1215 | cpumask = node_to_cpumask(pgdat->node_id); | 1227 | cpumask = node_to_cpumask(pgdat->node_id); |
1216 | if (!cpus_empty(cpumask)) | 1228 | if (!cpus_empty(cpumask)) |
1217 | set_cpus_allowed(tsk, cpumask); | 1229 | set_cpus_allowed(tsk, cpumask); |
1218 | current->reclaim_state = &reclaim_state; | 1230 | current->reclaim_state = &reclaim_state; |
1219 | 1231 | ||
1220 | /* | 1232 | /* |
1221 | * Tell the memory management that we're a "memory allocator", | 1233 | * Tell the memory management that we're a "memory allocator", |
1222 | * and that if we need more memory we should get access to it | 1234 | * and that if we need more memory we should get access to it |
1223 | * regardless (see "__alloc_pages()"). "kswapd" should | 1235 | * regardless (see "__alloc_pages()"). "kswapd" should |
1224 | * never get caught in the normal page freeing logic. | 1236 | * never get caught in the normal page freeing logic. |
1225 | * | 1237 | * |
1226 | * (Kswapd normally doesn't need memory anyway, but sometimes | 1238 | * (Kswapd normally doesn't need memory anyway, but sometimes |
1227 | * you need a small amount of memory in order to be able to | 1239 | * you need a small amount of memory in order to be able to |
1228 | * page out something else, and this flag essentially protects | 1240 | * page out something else, and this flag essentially protects |
1229 | * us from recursively trying to free more memory as we're | 1241 | * us from recursively trying to free more memory as we're |
1230 | * trying to free the first piece of memory in the first place). | 1242 | * trying to free the first piece of memory in the first place). |
1231 | */ | 1243 | */ |
1232 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; | 1244 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; |
1233 | 1245 | ||
1234 | order = 0; | 1246 | order = 0; |
1235 | for ( ; ; ) { | 1247 | for ( ; ; ) { |
1236 | unsigned long new_order; | 1248 | unsigned long new_order; |
1237 | 1249 | ||
1238 | try_to_freeze(); | 1250 | try_to_freeze(); |
1239 | 1251 | ||
1240 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 1252 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
1241 | new_order = pgdat->kswapd_max_order; | 1253 | new_order = pgdat->kswapd_max_order; |
1242 | pgdat->kswapd_max_order = 0; | 1254 | pgdat->kswapd_max_order = 0; |
1243 | if (order < new_order) { | 1255 | if (order < new_order) { |
1244 | /* | 1256 | /* |
1245 | * Don't sleep if someone wants a larger 'order' | 1257 | * Don't sleep if someone wants a larger 'order' |
1246 | * allocation | 1258 | * allocation |
1247 | */ | 1259 | */ |
1248 | order = new_order; | 1260 | order = new_order; |
1249 | } else { | 1261 | } else { |
1250 | schedule(); | 1262 | schedule(); |
1251 | order = pgdat->kswapd_max_order; | 1263 | order = pgdat->kswapd_max_order; |
1252 | } | 1264 | } |
1253 | finish_wait(&pgdat->kswapd_wait, &wait); | 1265 | finish_wait(&pgdat->kswapd_wait, &wait); |
1254 | 1266 | ||
1255 | balance_pgdat(pgdat, order); | 1267 | balance_pgdat(pgdat, order); |
1256 | } | 1268 | } |
1257 | return 0; | 1269 | return 0; |
1258 | } | 1270 | } |
1259 | 1271 | ||
1260 | /* | 1272 | /* |
1261 | * A zone is low on free memory, so wake its kswapd task to service it. | 1273 | * A zone is low on free memory, so wake its kswapd task to service it. |
1262 | */ | 1274 | */ |
1263 | void wakeup_kswapd(struct zone *zone, int order) | 1275 | void wakeup_kswapd(struct zone *zone, int order) |
1264 | { | 1276 | { |
1265 | pg_data_t *pgdat; | 1277 | pg_data_t *pgdat; |
1266 | 1278 | ||
1267 | if (!populated_zone(zone)) | 1279 | if (!populated_zone(zone)) |
1268 | return; | 1280 | return; |
1269 | 1281 | ||
1270 | pgdat = zone->zone_pgdat; | 1282 | pgdat = zone->zone_pgdat; |
1271 | if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) | 1283 | if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) |
1272 | return; | 1284 | return; |
1273 | if (pgdat->kswapd_max_order < order) | 1285 | if (pgdat->kswapd_max_order < order) |
1274 | pgdat->kswapd_max_order = order; | 1286 | pgdat->kswapd_max_order = order; |
1275 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 1287 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
1276 | return; | 1288 | return; |
1277 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 1289 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
1278 | return; | 1290 | return; |
1279 | wake_up_interruptible(&pgdat->kswapd_wait); | 1291 | wake_up_interruptible(&pgdat->kswapd_wait); |
1280 | } | 1292 | } |
1281 | 1293 | ||
1282 | #ifdef CONFIG_PM | 1294 | #ifdef CONFIG_PM |
1283 | /* | 1295 | /* |
1284 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 1296 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages |
1285 | * from LRU lists system-wide, for given pass and priority, and returns the | 1297 | * from LRU lists system-wide, for given pass and priority, and returns the |
1286 | * number of reclaimed pages | 1298 | * number of reclaimed pages |
1287 | * | 1299 | * |
1288 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | 1300 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages |
1289 | */ | 1301 | */ |
1290 | static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, | 1302 | static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, |
1291 | int prio, struct scan_control *sc) | 1303 | int prio, struct scan_control *sc) |
1292 | { | 1304 | { |
1293 | struct zone *zone; | 1305 | struct zone *zone; |
1294 | unsigned long nr_to_scan, ret = 0; | 1306 | unsigned long nr_to_scan, ret = 0; |
1295 | 1307 | ||
1296 | for_each_zone(zone) { | 1308 | for_each_zone(zone) { |
1297 | 1309 | ||
1298 | if (!populated_zone(zone)) | 1310 | if (!populated_zone(zone)) |
1299 | continue; | 1311 | continue; |
1300 | 1312 | ||
1301 | if (zone->all_unreclaimable && prio != DEF_PRIORITY) | 1313 | if (zone->all_unreclaimable && prio != DEF_PRIORITY) |
1302 | continue; | 1314 | continue; |
1303 | 1315 | ||
1304 | /* For pass = 0 we don't shrink the active list */ | 1316 | /* For pass = 0 we don't shrink the active list */ |
1305 | if (pass > 0) { | 1317 | if (pass > 0) { |
1306 | zone->nr_scan_active += (zone->nr_active >> prio) + 1; | 1318 | zone->nr_scan_active += (zone->nr_active >> prio) + 1; |
1307 | if (zone->nr_scan_active >= nr_pages || pass > 3) { | 1319 | if (zone->nr_scan_active >= nr_pages || pass > 3) { |
1308 | zone->nr_scan_active = 0; | 1320 | zone->nr_scan_active = 0; |
1309 | nr_to_scan = min(nr_pages, zone->nr_active); | 1321 | nr_to_scan = min(nr_pages, zone->nr_active); |
1310 | shrink_active_list(nr_to_scan, zone, sc); | 1322 | shrink_active_list(nr_to_scan, zone, sc); |
1311 | } | 1323 | } |
1312 | } | 1324 | } |
1313 | 1325 | ||
1314 | zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1; | 1326 | zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1; |
1315 | if (zone->nr_scan_inactive >= nr_pages || pass > 3) { | 1327 | if (zone->nr_scan_inactive >= nr_pages || pass > 3) { |
1316 | zone->nr_scan_inactive = 0; | 1328 | zone->nr_scan_inactive = 0; |
1317 | nr_to_scan = min(nr_pages, zone->nr_inactive); | 1329 | nr_to_scan = min(nr_pages, zone->nr_inactive); |
1318 | ret += shrink_inactive_list(nr_to_scan, zone, sc); | 1330 | ret += shrink_inactive_list(nr_to_scan, zone, sc); |
1319 | if (ret >= nr_pages) | 1331 | if (ret >= nr_pages) |
1320 | return ret; | 1332 | return ret; |
1321 | } | 1333 | } |
1322 | } | 1334 | } |
1323 | 1335 | ||
1324 | return ret; | 1336 | return ret; |
1325 | } | 1337 | } |
1326 | 1338 | ||
1327 | /* | 1339 | /* |
1328 | * Try to free `nr_pages' of memory, system-wide, and return the number of | 1340 | * Try to free `nr_pages' of memory, system-wide, and return the number of |
1329 | * freed pages. | 1341 | * freed pages. |
1330 | * | 1342 | * |
1331 | * Rather than trying to age LRUs the aim is to preserve the overall | 1343 | * Rather than trying to age LRUs the aim is to preserve the overall |
1332 | * LRU order by reclaiming preferentially | 1344 | * LRU order by reclaiming preferentially |
1333 | * inactive > active > active referenced > active mapped | 1345 | * inactive > active > active referenced > active mapped |
1334 | */ | 1346 | */ |
1335 | unsigned long shrink_all_memory(unsigned long nr_pages) | 1347 | unsigned long shrink_all_memory(unsigned long nr_pages) |
1336 | { | 1348 | { |
1337 | unsigned long lru_pages, nr_slab; | 1349 | unsigned long lru_pages, nr_slab; |
1338 | unsigned long ret = 0; | 1350 | unsigned long ret = 0; |
1339 | int pass; | 1351 | int pass; |
1340 | struct reclaim_state reclaim_state; | 1352 | struct reclaim_state reclaim_state; |
1341 | struct zone *zone; | 1353 | struct zone *zone; |
1342 | struct scan_control sc = { | 1354 | struct scan_control sc = { |
1343 | .gfp_mask = GFP_KERNEL, | 1355 | .gfp_mask = GFP_KERNEL, |
1344 | .may_swap = 0, | 1356 | .may_swap = 0, |
1345 | .swap_cluster_max = nr_pages, | 1357 | .swap_cluster_max = nr_pages, |
1346 | .may_writepage = 1, | 1358 | .may_writepage = 1, |
1347 | .swappiness = vm_swappiness, | 1359 | .swappiness = vm_swappiness, |
1348 | }; | 1360 | }; |
1349 | 1361 | ||
1350 | current->reclaim_state = &reclaim_state; | 1362 | current->reclaim_state = &reclaim_state; |
1351 | 1363 | ||
1352 | lru_pages = 0; | 1364 | lru_pages = 0; |
1353 | for_each_zone(zone) | 1365 | for_each_zone(zone) |
1354 | lru_pages += zone->nr_active + zone->nr_inactive; | 1366 | lru_pages += zone->nr_active + zone->nr_inactive; |
1355 | 1367 | ||
1356 | nr_slab = read_page_state(nr_slab); | 1368 | nr_slab = read_page_state(nr_slab); |
1357 | /* If slab caches are huge, it's better to hit them first */ | 1369 | /* If slab caches are huge, it's better to hit them first */ |
1358 | while (nr_slab >= lru_pages) { | 1370 | while (nr_slab >= lru_pages) { |
1359 | reclaim_state.reclaimed_slab = 0; | 1371 | reclaim_state.reclaimed_slab = 0; |
1360 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | 1372 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); |
1361 | if (!reclaim_state.reclaimed_slab) | 1373 | if (!reclaim_state.reclaimed_slab) |
1362 | break; | 1374 | break; |
1363 | 1375 | ||
1364 | ret += reclaim_state.reclaimed_slab; | 1376 | ret += reclaim_state.reclaimed_slab; |
1365 | if (ret >= nr_pages) | 1377 | if (ret >= nr_pages) |
1366 | goto out; | 1378 | goto out; |
1367 | 1379 | ||
1368 | nr_slab -= reclaim_state.reclaimed_slab; | 1380 | nr_slab -= reclaim_state.reclaimed_slab; |
1369 | } | 1381 | } |
1370 | 1382 | ||
1371 | /* | 1383 | /* |
1372 | * We try to shrink LRUs in 5 passes: | 1384 | * We try to shrink LRUs in 5 passes: |
1373 | * 0 = Reclaim from inactive_list only | 1385 | * 0 = Reclaim from inactive_list only |
1374 | * 1 = Reclaim from active list but don't reclaim mapped | 1386 | * 1 = Reclaim from active list but don't reclaim mapped |
1375 | * 2 = 2nd pass of type 1 | 1387 | * 2 = 2nd pass of type 1 |
1376 | * 3 = Reclaim mapped (normal reclaim) | 1388 | * 3 = Reclaim mapped (normal reclaim) |
1377 | * 4 = 2nd pass of type 3 | 1389 | * 4 = 2nd pass of type 3 |
1378 | */ | 1390 | */ |
1379 | for (pass = 0; pass < 5; pass++) { | 1391 | for (pass = 0; pass < 5; pass++) { |
1380 | int prio; | 1392 | int prio; |
1381 | 1393 | ||
1382 | /* Needed for shrinking slab caches later on */ | 1394 | /* Needed for shrinking slab caches later on */ |
1383 | if (!lru_pages) | 1395 | if (!lru_pages) |
1384 | for_each_zone(zone) { | 1396 | for_each_zone(zone) { |
1385 | lru_pages += zone->nr_active; | 1397 | lru_pages += zone->nr_active; |
1386 | lru_pages += zone->nr_inactive; | 1398 | lru_pages += zone->nr_inactive; |
1387 | } | 1399 | } |
1388 | 1400 | ||
1389 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | 1401 | /* Force reclaiming mapped pages in the passes #3 and #4 */ |
1390 | if (pass > 2) { | 1402 | if (pass > 2) { |
1391 | sc.may_swap = 1; | 1403 | sc.may_swap = 1; |
1392 | sc.swappiness = 100; | 1404 | sc.swappiness = 100; |
1393 | } | 1405 | } |
1394 | 1406 | ||
1395 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | 1407 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { |
1396 | unsigned long nr_to_scan = nr_pages - ret; | 1408 | unsigned long nr_to_scan = nr_pages - ret; |
1397 | 1409 | ||
1398 | sc.nr_mapped = read_page_state(nr_mapped); | 1410 | sc.nr_mapped = read_page_state(nr_mapped); |
1399 | sc.nr_scanned = 0; | 1411 | sc.nr_scanned = 0; |
1400 | 1412 | ||
1401 | ret += shrink_all_zones(nr_to_scan, pass, prio, &sc); | 1413 | ret += shrink_all_zones(nr_to_scan, pass, prio, &sc); |

1402 | if (ret >= nr_pages) | 1414 | if (ret >= nr_pages) |
1403 | goto out; | 1415 | goto out; |
1404 | 1416 | ||
1405 | reclaim_state.reclaimed_slab = 0; | 1417 | reclaim_state.reclaimed_slab = 0; |
1406 | shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); | 1418 | shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); |
1407 | ret += reclaim_state.reclaimed_slab; | 1419 | ret += reclaim_state.reclaimed_slab; |
1408 | if (ret >= nr_pages) | 1420 | if (ret >= nr_pages) |
1409 | goto out; | 1421 | goto out; |
1410 | 1422 | ||
1411 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | 1423 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) |
1412 | blk_congestion_wait(WRITE, HZ / 10); | 1424 | blk_congestion_wait(WRITE, HZ / 10); |
1413 | } | 1425 | } |
1414 | 1426 | ||
1415 | lru_pages = 0; | 1427 | lru_pages = 0; |
1416 | } | 1428 | } |
1417 | 1429 | ||
1418 | /* | 1430 | /* |
1419 | * If ret = 0, we could not shrink LRUs, but there may be something | 1431 | * If ret = 0, we could not shrink LRUs, but there may be something |
1420 | * in slab caches | 1432 | * in slab caches |
1421 | */ | 1433 | */ |
1422 | if (!ret) | 1434 | if (!ret) |
1423 | do { | 1435 | do { |
1424 | reclaim_state.reclaimed_slab = 0; | 1436 | reclaim_state.reclaimed_slab = 0; |
1425 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | 1437 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); |
1426 | ret += reclaim_state.reclaimed_slab; | 1438 | ret += reclaim_state.reclaimed_slab; |
1427 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); | 1439 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); |
1428 | 1440 | ||
1429 | out: | 1441 | out: |
1430 | current->reclaim_state = NULL; | 1442 | current->reclaim_state = NULL; |
1431 | 1443 | ||
1432 | return ret; | 1444 | return ret; |
1433 | } | 1445 | } |
1434 | #endif | 1446 | #endif |
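The five passes of shrink_all_memory() differ only in which lists they touch and how aggressive the scan control is. A compact sketch of how the controls vary per pass (hypothetical helper; the kernel sets the scan_control fields inline inside the pass loop):

        /*
         * Illustrative per-pass setup, not kernel code.  Pass 0 reclaims from
         * the inactive list only, passes 1-2 also age the active list without
         * reclaiming mapped pages, passes 3-4 force mapped reclaim by enabling
         * swap and raising swappiness to 100.
         */
        static void setup_shrink_pass(int pass, int default_swappiness,
                                      int *shrink_active, int *may_swap, int *swappiness)
        {
                *shrink_active = pass > 0;
                *may_swap = pass > 2;
                *swappiness = pass > 2 ? 100 : default_swappiness;
        }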
1435 | 1447 | ||
1436 | #ifdef CONFIG_HOTPLUG_CPU | 1448 | #ifdef CONFIG_HOTPLUG_CPU |
1437 | /* It's optimal to keep kswapds on the same CPUs as their memory, but | 1449 | /* It's optimal to keep kswapds on the same CPUs as their memory, but |
1438 | not required for correctness. So if the last cpu in a node goes | 1450 | not required for correctness. So if the last cpu in a node goes |
1439 | away, we get changed to run anywhere: as the first one comes back, | 1451 | away, we get changed to run anywhere: as the first one comes back, |
1440 | restore their cpu bindings. */ | 1452 | restore their cpu bindings. */ |
1441 | static int cpu_callback(struct notifier_block *nfb, | 1453 | static int cpu_callback(struct notifier_block *nfb, |
1442 | unsigned long action, void *hcpu) | 1454 | unsigned long action, void *hcpu) |
1443 | { | 1455 | { |
1444 | pg_data_t *pgdat; | 1456 | pg_data_t *pgdat; |
1445 | cpumask_t mask; | 1457 | cpumask_t mask; |
1446 | 1458 | ||
1447 | if (action == CPU_ONLINE) { | 1459 | if (action == CPU_ONLINE) { |
1448 | for_each_online_pgdat(pgdat) { | 1460 | for_each_online_pgdat(pgdat) { |
1449 | mask = node_to_cpumask(pgdat->node_id); | 1461 | mask = node_to_cpumask(pgdat->node_id); |
1450 | if (any_online_cpu(mask) != NR_CPUS) | 1462 | if (any_online_cpu(mask) != NR_CPUS) |
1451 | /* One of our CPUs online: restore mask */ | 1463 | /* One of our CPUs online: restore mask */ |
1452 | set_cpus_allowed(pgdat->kswapd, mask); | 1464 | set_cpus_allowed(pgdat->kswapd, mask); |
1453 | } | 1465 | } |
1454 | } | 1466 | } |
1455 | return NOTIFY_OK; | 1467 | return NOTIFY_OK; |
1456 | } | 1468 | } |
1457 | #endif /* CONFIG_HOTPLUG_CPU */ | 1469 | #endif /* CONFIG_HOTPLUG_CPU */ |
1458 | 1470 | ||
1459 | static int __init kswapd_init(void) | 1471 | static int __init kswapd_init(void) |
1460 | { | 1472 | { |
1461 | pg_data_t *pgdat; | 1473 | pg_data_t *pgdat; |
1462 | 1474 | ||
1463 | swap_setup(); | 1475 | swap_setup(); |
1464 | for_each_online_pgdat(pgdat) { | 1476 | for_each_online_pgdat(pgdat) { |
1465 | pid_t pid; | 1477 | pid_t pid; |
1466 | 1478 | ||
1467 | pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); | 1479 | pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); |
1468 | BUG_ON(pid < 0); | 1480 | BUG_ON(pid < 0); |
1469 | read_lock(&tasklist_lock); | 1481 | read_lock(&tasklist_lock); |
1470 | pgdat->kswapd = find_task_by_pid(pid); | 1482 | pgdat->kswapd = find_task_by_pid(pid); |
1471 | read_unlock(&tasklist_lock); | 1483 | read_unlock(&tasklist_lock); |
1472 | } | 1484 | } |
1473 | total_memory = nr_free_pagecache_pages(); | 1485 | total_memory = nr_free_pagecache_pages(); |
1474 | hotcpu_notifier(cpu_callback, 0); | 1486 | hotcpu_notifier(cpu_callback, 0); |
1475 | return 0; | 1487 | return 0; |
1476 | } | 1488 | } |
1477 | 1489 | ||
1478 | module_init(kswapd_init) | 1490 | module_init(kswapd_init) |
1479 | 1491 | ||
1480 | #ifdef CONFIG_NUMA | 1492 | #ifdef CONFIG_NUMA |
1481 | /* | 1493 | /* |
1482 | * Zone reclaim mode | 1494 | * Zone reclaim mode |
1483 | * | 1495 | * |
1484 | * If non-zero call zone_reclaim when the number of free pages falls below | 1496 | * If non-zero call zone_reclaim when the number of free pages falls below |
1485 | * the watermarks. | 1497 | * the watermarks. |
1486 | * | 1498 | * |
1487 | * In the future we may add flags to the mode. However, the page allocator | 1499 | * In the future we may add flags to the mode. However, the page allocator |
1488 | * should only have to check that zone_reclaim_mode != 0 before calling | 1500 | * should only have to check that zone_reclaim_mode != 0 before calling |
1489 | * zone_reclaim(). | 1501 | * zone_reclaim(). |
1490 | */ | 1502 | */ |
1491 | int zone_reclaim_mode __read_mostly; | 1503 | int zone_reclaim_mode __read_mostly; |
1492 | 1504 | ||
1493 | #define RECLAIM_OFF 0 | 1505 | #define RECLAIM_OFF 0 |
1494 | #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ | 1506 | #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ |
1495 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ | 1507 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ |
1496 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ | 1508 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ |
1497 | #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ | 1509 | #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ |
1498 | 1510 | ||
1499 | /* | 1511 | /* |
1500 | * Minimum time between zone reclaim scans | 1512 | * Minimum time between zone reclaim scans |
1501 | */ | 1513 | */ |
1502 | int zone_reclaim_interval __read_mostly = 30*HZ; | 1514 | int zone_reclaim_interval __read_mostly = 30*HZ; |
1503 | 1515 | ||
1504 | /* | 1516 | /* |
1505 | * Priority for ZONE_RECLAIM. This determines the fraction of pages | 1517 | * Priority for ZONE_RECLAIM. This determines the fraction of pages |
1506 | * of a node considered for each zone_reclaim. 4 scans 1/16th of | 1518 | * of a node considered for each zone_reclaim. 4 scans 1/16th of |
1507 | * a zone. | 1519 | * a zone. |
1508 | */ | 1520 | */ |
1509 | #define ZONE_RECLAIM_PRIORITY 4 | 1521 | #define ZONE_RECLAIM_PRIORITY 4 |
1510 | 1522 | ||
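The "1/16th" in the comment above follows directly from the right shift in shrink_zone(): at priority 4 each pass feeds (lru_size >> 4) + 1 pages into the scan counter. A small arithmetic illustration with example numbers (not kernel code):

        #include <stdio.h>

        /* Illustrative arithmetic only: why ZONE_RECLAIM_PRIORITY == 4 scans ~1/16th of a zone. */
        int main(void)
        {
                unsigned long zone_lru_pages = 1UL << 16;               /* say 65536 LRU pages  */
                unsigned long considered = (zone_lru_pages >> 4) + 1;   /* 4097, roughly 1/16th */

                printf("%lu of %lu pages considered per pass\n", considered, zone_lru_pages);
                return 0;
        }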
1511 | /* | 1523 | /* |
1512 | * Try to free up some pages from this zone through reclaim. | 1524 | * Try to free up some pages from this zone through reclaim. |
1513 | */ | 1525 | */ |
1514 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1526 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
1515 | { | 1527 | { |
1516 | /* Minimum pages needed in order to stay on node */ | 1528 | /* Minimum pages needed in order to stay on node */ |
1517 | const unsigned long nr_pages = 1 << order; | 1529 | const unsigned long nr_pages = 1 << order; |
1518 | struct task_struct *p = current; | 1530 | struct task_struct *p = current; |
1519 | struct reclaim_state reclaim_state; | 1531 | struct reclaim_state reclaim_state; |
1520 | int priority; | 1532 | int priority; |
1521 | unsigned long nr_reclaimed = 0; | 1533 | unsigned long nr_reclaimed = 0; |
1522 | struct scan_control sc = { | 1534 | struct scan_control sc = { |
1523 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 1535 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
1524 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 1536 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
1525 | .nr_mapped = read_page_state(nr_mapped), | 1537 | .nr_mapped = read_page_state(nr_mapped), |
1526 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 1538 | .swap_cluster_max = max_t(unsigned long, nr_pages, |
1527 | SWAP_CLUSTER_MAX), | 1539 | SWAP_CLUSTER_MAX), |
1528 | .gfp_mask = gfp_mask, | 1540 | .gfp_mask = gfp_mask, |
1529 | .swappiness = vm_swappiness, | 1541 | .swappiness = vm_swappiness, |
1530 | }; | 1542 | }; |
1531 | 1543 | ||
1532 | disable_swap_token(); | 1544 | disable_swap_token(); |
1533 | cond_resched(); | 1545 | cond_resched(); |
1534 | /* | 1546 | /* |
1535 | * We need to be able to allocate from the reserves for RECLAIM_SWAP | 1547 | * We need to be able to allocate from the reserves for RECLAIM_SWAP |
1536 | * and we also need to be able to write out pages for RECLAIM_WRITE | 1548 | * and we also need to be able to write out pages for RECLAIM_WRITE |
1537 | * and RECLAIM_SWAP. | 1549 | * and RECLAIM_SWAP. |
1538 | */ | 1550 | */ |
1539 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; | 1551 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; |
1540 | reclaim_state.reclaimed_slab = 0; | 1552 | reclaim_state.reclaimed_slab = 0; |
1541 | p->reclaim_state = &reclaim_state; | 1553 | p->reclaim_state = &reclaim_state; |
1542 | 1554 | ||
1543 | /* | 1555 | /* |
1544 | * Free memory by calling shrink zone with increasing priorities | 1556 | * Free memory by calling shrink zone with increasing priorities |
1545 | * until we have enough memory freed. | 1557 | * until we have enough memory freed. |
1546 | */ | 1558 | */ |
1547 | priority = ZONE_RECLAIM_PRIORITY; | 1559 | priority = ZONE_RECLAIM_PRIORITY; |
1548 | do { | 1560 | do { |
1549 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1561 | nr_reclaimed += shrink_zone(priority, zone, &sc); |
1550 | priority--; | 1562 | priority--; |
1551 | } while (priority >= 0 && nr_reclaimed < nr_pages); | 1563 | } while (priority >= 0 && nr_reclaimed < nr_pages); |
1552 | 1564 | ||
1553 | if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { | 1565 | if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { |
1554 | /* | 1566 | /* |
1555 | * shrink_slab() does not currently allow us to determine how | 1567 | * shrink_slab() does not currently allow us to determine how |
1556 | * many pages were freed in this zone. So we just shake the slab | 1568 | * many pages were freed in this zone. So we just shake the slab |
1557 | * a bit and then go off node for this particular allocation | 1569 | * a bit and then go off node for this particular allocation |
1558 | * despite possibly having freed enough memory to allocate in | 1570 | * despite possibly having freed enough memory to allocate in |
1559 | * this zone. If we freed local memory then the next | 1571 | * this zone. If we freed local memory then the next |
1560 | * allocations will be local again. | 1572 | * allocations will be local again. |
1561 | * | 1573 | * |
1562 | * shrink_slab will free memory on all zones and may take | 1574 | * shrink_slab will free memory on all zones and may take |
1563 | * a long time. | 1575 | * a long time. |
1564 | */ | 1576 | */ |
1565 | shrink_slab(sc.nr_scanned, gfp_mask, order); | 1577 | shrink_slab(sc.nr_scanned, gfp_mask, order); |
1566 | } | 1578 | } |
1567 | 1579 | ||
1568 | p->reclaim_state = NULL; | 1580 | p->reclaim_state = NULL; |
1569 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 1581 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
1570 | 1582 | ||
1571 | if (nr_reclaimed == 0) { | 1583 | if (nr_reclaimed == 0) { |
1572 | /* | 1584 | /* |
1573 | * We were unable to reclaim enough pages to stay on node. We | 1585 | * We were unable to reclaim enough pages to stay on node. We |
1574 | * now allow off node accesses for a certain time period before | 1586 | * now allow off node accesses for a certain time period before |
1575 | * trying again to reclaim pages from the local zone. | 1587 | * trying again to reclaim pages from the local zone. |
1576 | */ | 1588 | */ |
1577 | zone->last_unsuccessful_zone_reclaim = jiffies; | 1589 | zone->last_unsuccessful_zone_reclaim = jiffies; |
1578 | } | 1590 | } |
1579 | 1591 | ||
1580 | return nr_reclaimed >= nr_pages; | 1592 | return nr_reclaimed >= nr_pages; |
1581 | } | 1593 | } |
1582 | 1594 | ||
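Worked numbers for the two order-derived values in __zone_reclaim(): nr_pages = 1 << order is the success target returned at the end, and swap_cluster_max is raised to at least that size so a single reclaim batch can cover the allocation. SWAP_CLUSTER_MAX is assumed here to be its usual value of 32:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32	/* assumed value */
#define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
	unsigned int order;

	for (order = 0; order <= 6; order++) {
		unsigned long nr_pages = 1UL << order;

		/* e.g. order 3: target 8 pages, batch stays 32;
		 * order 6: target 64 pages, batch grows to 64. */
		printf("order %u: target %lu pages, batch %lu\n",
		       order, nr_pages,
		       max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX));
	}
	return 0;
}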
1583 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1595 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
1584 | { | 1596 | { |
1585 | cpumask_t mask; | 1597 | cpumask_t mask; |
1586 | int node_id; | 1598 | int node_id; |
1587 | 1599 | ||
1588 | /* | 1600 | /* |
1589 | * Do not reclaim if there was a recent unsuccessful attempt at zone | 1601 | * Do not reclaim if there was a recent unsuccessful attempt at zone |
1590 | * reclaim. In that case we let allocations go off node for the | 1602 | * reclaim. In that case we let allocations go off node for the |
1591 | * zone_reclaim_interval. Otherwise we would scan for each off-node | 1603 | * zone_reclaim_interval. Otherwise we would scan for each off-node |
1592 | * page allocation. | 1604 | * page allocation. |
1593 | */ | 1605 | */ |
1594 | if (time_before(jiffies, | 1606 | if (time_before(jiffies, |
1595 | zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) | 1607 | zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) |
1596 | return 0; | 1608 | return 0; |
1597 | 1609 | ||
1598 | /* | 1610 | /* |
1599 | * Avoid concurrent zone reclaims, do not reclaim in a zone that does | 1611 | * Avoid concurrent zone reclaims, do not reclaim in a zone that does |
1600 | * not have reclaimable pages and if we should not delay the allocation | 1612 | * not have reclaimable pages and if we should not delay the allocation |
1601 | * then do not scan. | 1613 | * then do not scan. |
1602 | */ | 1614 | */ |
1603 | if (!(gfp_mask & __GFP_WAIT) || | 1615 | if (!(gfp_mask & __GFP_WAIT) || |
1604 | zone->all_unreclaimable || | 1616 | zone->all_unreclaimable || |
1605 | atomic_read(&zone->reclaim_in_progress) > 0 || | 1617 | atomic_read(&zone->reclaim_in_progress) > 0 || |
1606 | (current->flags & PF_MEMALLOC)) | 1618 | (current->flags & PF_MEMALLOC)) |
1607 | return 0; | 1619 | return 0; |
1608 | 1620 | ||
1609 | /* | 1621 | /* |
1610 | * Only run zone reclaim on the local zone or on zones that do not | 1622 | * Only run zone reclaim on the local zone or on zones that do not |
1611 | * have associated processors. This will favor the local processor | 1623 | * have associated processors. This will favor the local processor |
1612 | * over remote processors and spread off node memory allocations | 1624 | * over remote processors and spread off node memory allocations |
1613 | * as wide as possible. | 1625 | * as wide as possible. |
1614 | */ | 1626 | */ |
1615 | node_id = zone->zone_pgdat->node_id; | 1627 | node_id = zone->zone_pgdat->node_id; |
1616 | mask = node_to_cpumask(node_id); | 1628 | mask = node_to_cpumask(node_id); |
1617 | if (!cpus_empty(mask) && node_id != numa_node_id()) | 1629 | if (!cpus_empty(mask) && node_id != numa_node_id()) |
1618 | return 0; | 1630 | return 0; |
1619 | return __zone_reclaim(zone, gfp_mask, order); | 1631 | return __zone_reclaim(zone, gfp_mask, order); |
1620 | } | 1632 | } |
1621 | #endif | 1633 | #endif |
1622 | 1634 ||
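A self-contained sketch of the 30-second backoff enforced by zone_reclaim(): after an unsuccessful attempt, time_before() (modelled here with the usual wrap-safe signed comparison) keeps further attempts off until zone_reclaim_interval jiffies have elapsed. HZ = 1000 is assumed only for the numbers:

#include <stdio.h>

#define HZ 1000					/* assumed tick rate */

static unsigned long zone_reclaim_interval = 30 * HZ;

static int time_before(unsigned long a, unsigned long b)
{
	return (long)a - (long)b < 0;		/* wrap-safe "a is earlier than b" */
}

int main(void)
{
	unsigned long last_fail = 100000;		/* jiffies of last failed reclaim */
	unsigned long jiffies = last_fail + 5 * HZ;	/* 5 seconds later */

	if (time_before(jiffies, last_fail + zone_reclaim_interval))
		printf("within the interval: allocate off node, skip reclaim\n");
	else
		printf("interval expired: try zone reclaim again\n");
	return 0;
}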