Commit 7db8889ab05b57200158432755af318fb68854a2

Authored by Rik van Riel
Committed by Linus Torvalds
1 parent ab21588487

mm: have order > 0 compaction start off where it left

Order > 0 compaction stops when enough free pages of the correct page
order have been coalesced.  When doing subsequent higher order
allocations, it is possible for compaction to be invoked many times.

However, the compaction code always starts looking for pages to migrate at
the start of the zone, and for free pages to migrate them to at the end of
the zone.

This can cause quadratic behaviour, with isolate_freepages starting at the
end of the zone each time, even though previous invocations of the
compaction code already filled up all free memory on that end of the zone.

This can cause isolate_freepages to take enormous amounts of CPU with
certain workloads on larger memory systems.

The obvious solution is to have isolate_freepages remember where it left
off last time, and continue at that point the next time it gets invoked
for an order > 0 compaction.  This could cause compaction to fail if
cc->free_pfn and cc->migrate_pfn are close together initially; in that
case we restart from the end of the zone and try once more.

Forced full (order == -1) compactions are left alone.

[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: s/laste/last/, use 80 cols]
Signed-off-by: Rik van Riel <riel@redhat.com>
Reported-by: Jim Schutt <jaschut@sandia.gov>
Tested-by: Jim Schutt <jaschut@sandia.gov>
Cc: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
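
The caching idea described above, resuming the free-page scanner from a remembered pfn instead of always restarting at the end of the zone, can be illustrated with a small standalone model. The following is a minimal sketch built on made-up toy_* types and helpers, not the actual mm/compaction.c code; only the compact_cached_free_pfn field corresponds to the zone member added by this patch.

#include <stdio.h>

struct toy_zone {
	unsigned long zone_start_pfn;
	unsigned long zone_end_pfn;
	/* pfn where the last incremental compaction isolated free pages */
	unsigned long compact_cached_free_pfn;
};

struct toy_compact_control {
	unsigned long free_pfn;		/* free page scanner, walks down the zone */
	unsigned long migrate_pfn;	/* migration scanner, walks up the zone */
	int order;			/* > 0 for normal compaction, -1 for forced full */
};

/*
 * Start the free scanner where the previous order > 0 run stopped,
 * instead of always restarting from the end of the zone.  Forced full
 * (order == -1) compactions ignore the cache, as the changelog says.
 */
static void toy_start_scanners(struct toy_zone *z, struct toy_compact_control *cc)
{
	cc->migrate_pfn = z->zone_start_pfn;
	if (cc->order > 0)
		cc->free_pfn = z->compact_cached_free_pfn;
	else
		cc->free_pfn = z->zone_end_pfn;
}

/* When the free scanner stops, remember where, for the next invocation. */
static void toy_finish_free_scan(struct toy_zone *z, struct toy_compact_control *cc,
				 unsigned long stopped_at)
{
	cc->free_pfn = stopped_at;
	if (cc->order > 0)
		z->compact_cached_free_pfn = stopped_at;
}

int main(void)
{
	struct toy_zone z = { 0, 1 << 18, 1 << 18 };	/* cache starts at zone end */
	struct toy_compact_control cc = { .order = 3 };

	toy_start_scanners(&z, &cc);
	printf("first run : free scanner starts at pfn %lu\n", cc.free_pfn);

	toy_finish_free_scan(&z, &cc, 200000);		/* pretend we stopped here */

	toy_start_scanners(&z, &cc);
	printf("second run: free scanner resumes at pfn %lu\n", cc.free_pfn);
	return 0;
}

Running the sketch shows the second invocation resuming at pfn 200000 rather than at the end of the zone, which is the behaviour change the patch aims for when order > 0.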

Showing 4 changed files with 73 additions and 5 deletions

include/linux/mmzone.h
@@ -368,6 +368,10 @@ struct zone {
 	 */
 	spinlock_t		lock;
 	int			all_unreclaimable; /* All pages pinned */
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+	/* pfn where the last incremental compaction isolated free pages */
+	unsigned long		compact_cached_free_pfn;
+#endif
 #ifdef CONFIG_MEMORY_HOTPLUG
 	/* see spanned/present_pages for more description */
 	seqlock_t		span_seqlock;
862 866
863 #else /* CONFIG_NEED_MULTIPLE_NODES */ 867 #else /* CONFIG_NEED_MULTIPLE_NODES */
864 868
865 #include <asm/mmzone.h> 869 #include <asm/mmzone.h>
866 870
867 #endif /* !CONFIG_NEED_MULTIPLE_NODES */ 871 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
868 872
869 extern struct pglist_data *first_online_pgdat(void); 873 extern struct pglist_data *first_online_pgdat(void);
870 extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); 874 extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
871 extern struct zone *next_zone(struct zone *zone); 875 extern struct zone *next_zone(struct zone *zone);
872 876
873 /** 877 /**
874 * for_each_online_pgdat - helper macro to iterate over all online nodes 878 * for_each_online_pgdat - helper macro to iterate over all online nodes
875 * @pgdat - pointer to a pg_data_t variable 879 * @pgdat - pointer to a pg_data_t variable
876 */ 880 */
877 #define for_each_online_pgdat(pgdat) \ 881 #define for_each_online_pgdat(pgdat) \
878 for (pgdat = first_online_pgdat(); \ 882 for (pgdat = first_online_pgdat(); \
879 pgdat; \ 883 pgdat; \
880 pgdat = next_online_pgdat(pgdat)) 884 pgdat = next_online_pgdat(pgdat))
881 /** 885 /**
882 * for_each_zone - helper macro to iterate over all memory zones 886 * for_each_zone - helper macro to iterate over all memory zones
883 * @zone - pointer to struct zone variable 887 * @zone - pointer to struct zone variable
884 * 888 *
885 * The user only needs to declare the zone variable, for_each_zone 889 * The user only needs to declare the zone variable, for_each_zone
886 * fills it in. 890 * fills it in.
887 */ 891 */
888 #define for_each_zone(zone) \ 892 #define for_each_zone(zone) \
889 for (zone = (first_online_pgdat())->node_zones; \ 893 for (zone = (first_online_pgdat())->node_zones; \
890 zone; \ 894 zone; \
891 zone = next_zone(zone)) 895 zone = next_zone(zone))
892 896
893 #define for_each_populated_zone(zone) \ 897 #define for_each_populated_zone(zone) \
894 for (zone = (first_online_pgdat())->node_zones; \ 898 for (zone = (first_online_pgdat())->node_zones; \
895 zone; \ 899 zone; \
896 zone = next_zone(zone)) \ 900 zone = next_zone(zone)) \
897 if (!populated_zone(zone)) \ 901 if (!populated_zone(zone)) \
898 ; /* do nothing */ \ 902 ; /* do nothing */ \
899 else 903 else
900 904
901 static inline struct zone *zonelist_zone(struct zoneref *zoneref) 905 static inline struct zone *zonelist_zone(struct zoneref *zoneref)
902 { 906 {
903 return zoneref->zone; 907 return zoneref->zone;
904 } 908 }
905 909
906 static inline int zonelist_zone_idx(struct zoneref *zoneref) 910 static inline int zonelist_zone_idx(struct zoneref *zoneref)
907 { 911 {
908 return zoneref->zone_idx; 912 return zoneref->zone_idx;
909 } 913 }
910 914
911 static inline int zonelist_node_idx(struct zoneref *zoneref) 915 static inline int zonelist_node_idx(struct zoneref *zoneref)
912 { 916 {
913 #ifdef CONFIG_NUMA 917 #ifdef CONFIG_NUMA
914 /* zone_to_nid not available in this context */ 918 /* zone_to_nid not available in this context */
915 return zoneref->zone->node; 919 return zoneref->zone->node;
916 #else 920 #else
917 return 0; 921 return 0;
918 #endif /* CONFIG_NUMA */ 922 #endif /* CONFIG_NUMA */
919 } 923 }
920 924
921 /** 925 /**
922 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point 926 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
923 * @z - The cursor used as a starting point for the search 927 * @z - The cursor used as a starting point for the search
924 * @highest_zoneidx - The zone index of the highest zone to return 928 * @highest_zoneidx - The zone index of the highest zone to return
925 * @nodes - An optional nodemask to filter the zonelist with 929 * @nodes - An optional nodemask to filter the zonelist with
926 * @zone - The first suitable zone found is returned via this parameter 930 * @zone - The first suitable zone found is returned via this parameter
927 * 931 *
928 * This function returns the next zone at or below a given zone index that is 932 * This function returns the next zone at or below a given zone index that is
929 * within the allowed nodemask using a cursor as the starting point for the 933 * within the allowed nodemask using a cursor as the starting point for the
930 * search. The zoneref returned is a cursor that represents the current zone 934 * search. The zoneref returned is a cursor that represents the current zone
931 * being examined. It should be advanced by one before calling 935 * being examined. It should be advanced by one before calling
932 * next_zones_zonelist again. 936 * next_zones_zonelist again.
933 */ 937 */
934 struct zoneref *next_zones_zonelist(struct zoneref *z, 938 struct zoneref *next_zones_zonelist(struct zoneref *z,
935 enum zone_type highest_zoneidx, 939 enum zone_type highest_zoneidx,
936 nodemask_t *nodes, 940 nodemask_t *nodes,
937 struct zone **zone); 941 struct zone **zone);
938 942
939 /** 943 /**
940 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist 944 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
941 * @zonelist - The zonelist to search for a suitable zone 945 * @zonelist - The zonelist to search for a suitable zone
942 * @highest_zoneidx - The zone index of the highest zone to return 946 * @highest_zoneidx - The zone index of the highest zone to return
943 * @nodes - An optional nodemask to filter the zonelist with 947 * @nodes - An optional nodemask to filter the zonelist with
944 * @zone - The first suitable zone found is returned via this parameter 948 * @zone - The first suitable zone found is returned via this parameter
945 * 949 *
946 * This function returns the first zone at or below a given zone index that is 950 * This function returns the first zone at or below a given zone index that is
947 * within the allowed nodemask. The zoneref returned is a cursor that can be 951 * within the allowed nodemask. The zoneref returned is a cursor that can be
948 * used to iterate the zonelist with next_zones_zonelist by advancing it by 952 * used to iterate the zonelist with next_zones_zonelist by advancing it by
949 * one before calling. 953 * one before calling.
950 */ 954 */
951 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, 955 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
952 enum zone_type highest_zoneidx, 956 enum zone_type highest_zoneidx,
953 nodemask_t *nodes, 957 nodemask_t *nodes,
954 struct zone **zone) 958 struct zone **zone)
955 { 959 {
956 return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes, 960 return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
957 zone); 961 zone);
958 } 962 }
959 963
960 /** 964 /**
961 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask 965 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
962 * @zone - The current zone in the iterator 966 * @zone - The current zone in the iterator
963 * @z - The current pointer within zonelist->zones being iterated 967 * @z - The current pointer within zonelist->zones being iterated
964 * @zlist - The zonelist being iterated 968 * @zlist - The zonelist being iterated
965 * @highidx - The zone index of the highest zone to return 969 * @highidx - The zone index of the highest zone to return
966 * @nodemask - Nodemask allowed by the allocator 970 * @nodemask - Nodemask allowed by the allocator
967 * 971 *
968 * This iterator iterates through all zones at or below a given zone index and 972
969 * within a given nodemask 973 * within a given nodemask
970 */ 974 */
971 #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ 975 #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
972 for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ 976 for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
973 zone; \ 977 zone; \
974 z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \ 978 z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \
975 979
976 /** 980 /**
977 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index 981 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
978 * @zone - The current zone in the iterator 982 * @zone - The current zone in the iterator
979 * @z - The current pointer within zonelist->zones being iterated 983 * @z - The current pointer within zonelist->zones being iterated
980 * @zlist - The zonelist being iterated 984 * @zlist - The zonelist being iterated
981 * @highidx - The zone index of the highest zone to return 985 * @highidx - The zone index of the highest zone to return
982 * 986 *
983 * This iterator iterates through all zones at or below a given zone index. 987 * This iterator iterates through all zones at or below a given zone index.
984 */ 988 */
985 #define for_each_zone_zonelist(zone, z, zlist, highidx) \ 989 #define for_each_zone_zonelist(zone, z, zlist, highidx) \
986 for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) 990 for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
987 991
988 #ifdef CONFIG_SPARSEMEM 992 #ifdef CONFIG_SPARSEMEM
989 #include <asm/sparsemem.h> 993 #include <asm/sparsemem.h>
990 #endif 994 #endif
991 995
992 #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ 996 #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
993 !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) 997 !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
994 static inline unsigned long early_pfn_to_nid(unsigned long pfn) 998 static inline unsigned long early_pfn_to_nid(unsigned long pfn)
995 { 999 {
996 return 0; 1000 return 0;
997 } 1001 }
998 #endif 1002 #endif
999 1003
1000 #ifdef CONFIG_FLATMEM 1004 #ifdef CONFIG_FLATMEM
1001 #define pfn_to_nid(pfn) (0) 1005 #define pfn_to_nid(pfn) (0)
1002 #endif 1006 #endif
1003 1007
1004 #ifdef CONFIG_SPARSEMEM 1008 #ifdef CONFIG_SPARSEMEM
1005 1009
1006 /* 1010 /*
1007 * SECTION_SHIFT #bits space required to store a section # 1011 * SECTION_SHIFT #bits space required to store a section #
1008 * 1012 *
1009 * PA_SECTION_SHIFT physical address to/from section number 1013 * PA_SECTION_SHIFT physical address to/from section number
1010 * PFN_SECTION_SHIFT pfn to/from section number 1014 * PFN_SECTION_SHIFT pfn to/from section number
1011 */ 1015 */
1012 #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) 1016 #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
1013 1017
1014 #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) 1018 #define PA_SECTION_SHIFT (SECTION_SIZE_BITS)
1015 #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) 1019 #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
1016 1020
1017 #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) 1021 #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT)
1018 1022
1019 #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) 1023 #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
1020 #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) 1024 #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1))
1021 1025
1022 #define SECTION_BLOCKFLAGS_BITS \ 1026 #define SECTION_BLOCKFLAGS_BITS \
1023 ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) 1027 ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
1024 1028
1025 #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS 1029 #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
1026 #error Allocator MAX_ORDER exceeds SECTION_SIZE 1030 #error Allocator MAX_ORDER exceeds SECTION_SIZE
1027 #endif 1031 #endif
1028 1032
1029 #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) 1033 #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
1030 #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) 1034 #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
1031 1035
1032 #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) 1036 #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
1033 #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) 1037 #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
1034 1038
1035 struct page; 1039 struct page;
1036 struct page_cgroup; 1040 struct page_cgroup;
1037 struct mem_section { 1041 struct mem_section {
1038 /* 1042 /*
1039 * This is, logically, a pointer to an array of struct 1043 * This is, logically, a pointer to an array of struct
1040 * pages. However, it is stored with some other magic. 1044 * pages. However, it is stored with some other magic.
1041 * (see sparse.c::sparse_init_one_section()) 1045 * (see sparse.c::sparse_init_one_section())
1042 * 1046 *
1043 * Additionally during early boot we encode node id of 1047 * Additionally during early boot we encode node id of
1044 * the location of the section here to guide allocation. 1048 * the location of the section here to guide allocation.
1045 * (see sparse.c::memory_present()) 1049 * (see sparse.c::memory_present())
1046 * 1050 *
1047 * Making it a UL at least makes someone do a cast 1051 * Making it a UL at least makes someone do a cast
1048 * before using it wrong. 1052 * before using it wrong.
1049 */ 1053 */
1050 unsigned long section_mem_map; 1054 unsigned long section_mem_map;
1051 1055
1052 /* See declaration of similar field in struct zone */ 1056 /* See declaration of similar field in struct zone */
1053 unsigned long *pageblock_flags; 1057 unsigned long *pageblock_flags;
1054 #ifdef CONFIG_MEMCG 1058 #ifdef CONFIG_MEMCG
1055 /* 1059 /*
1056 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use 1060 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
1057 * section. (see memcontrol.h/page_cgroup.h about this.) 1061 * section. (see memcontrol.h/page_cgroup.h about this.)
1058 */ 1062 */
1059 struct page_cgroup *page_cgroup; 1063 struct page_cgroup *page_cgroup;
1060 unsigned long pad; 1064 unsigned long pad;
1061 #endif 1065 #endif
1062 }; 1066 };
1063 1067
1064 #ifdef CONFIG_SPARSEMEM_EXTREME 1068 #ifdef CONFIG_SPARSEMEM_EXTREME
1065 #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) 1069 #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))
1066 #else 1070 #else
1067 #define SECTIONS_PER_ROOT 1 1071 #define SECTIONS_PER_ROOT 1
1068 #endif 1072 #endif
1069 1073
1070 #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) 1074 #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)
1071 #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) 1075 #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
1072 #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) 1076 #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
1073 1077
1074 #ifdef CONFIG_SPARSEMEM_EXTREME 1078 #ifdef CONFIG_SPARSEMEM_EXTREME
1075 extern struct mem_section *mem_section[NR_SECTION_ROOTS]; 1079 extern struct mem_section *mem_section[NR_SECTION_ROOTS];
1076 #else 1080 #else
1077 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; 1081 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
1078 #endif 1082 #endif
1079 1083
1080 static inline struct mem_section *__nr_to_section(unsigned long nr) 1084 static inline struct mem_section *__nr_to_section(unsigned long nr)
1081 { 1085 {
1082 if (!mem_section[SECTION_NR_TO_ROOT(nr)]) 1086 if (!mem_section[SECTION_NR_TO_ROOT(nr)])
1083 return NULL; 1087 return NULL;
1084 return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; 1088 return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
1085 } 1089 }
1086 extern int __section_nr(struct mem_section* ms); 1090 extern int __section_nr(struct mem_section* ms);
1087 extern unsigned long usemap_size(void); 1091 extern unsigned long usemap_size(void);
1088 1092
1089 /* 1093 /*
1090 * We use the lower bits of the mem_map pointer to store 1094 * We use the lower bits of the mem_map pointer to store
1091 * a little bit of information. There should be at least 1095 * a little bit of information. There should be at least
1092 * 3 bits here due to 32-bit alignment. 1096 * 3 bits here due to 32-bit alignment.
1093 */ 1097 */
1094 #define SECTION_MARKED_PRESENT (1UL<<0) 1098 #define SECTION_MARKED_PRESENT (1UL<<0)
1095 #define SECTION_HAS_MEM_MAP (1UL<<1) 1099 #define SECTION_HAS_MEM_MAP (1UL<<1)
1096 #define SECTION_MAP_LAST_BIT (1UL<<2) 1100 #define SECTION_MAP_LAST_BIT (1UL<<2)
1097 #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) 1101 #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
1098 #define SECTION_NID_SHIFT 2 1102 #define SECTION_NID_SHIFT 2
1099 1103
1100 static inline struct page *__section_mem_map_addr(struct mem_section *section) 1104 static inline struct page *__section_mem_map_addr(struct mem_section *section)
1101 { 1105 {
1102 unsigned long map = section->section_mem_map; 1106 unsigned long map = section->section_mem_map;
1103 map &= SECTION_MAP_MASK; 1107 map &= SECTION_MAP_MASK;
1104 return (struct page *)map; 1108 return (struct page *)map;
1105 } 1109 }
1106 1110
1107 static inline int present_section(struct mem_section *section) 1111 static inline int present_section(struct mem_section *section)
1108 { 1112 {
1109 return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); 1113 return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
1110 } 1114 }
1111 1115
1112 static inline int present_section_nr(unsigned long nr) 1116 static inline int present_section_nr(unsigned long nr)
1113 { 1117 {
1114 return present_section(__nr_to_section(nr)); 1118 return present_section(__nr_to_section(nr));
1115 } 1119 }
1116 1120
1117 static inline int valid_section(struct mem_section *section) 1121 static inline int valid_section(struct mem_section *section)
1118 { 1122 {
1119 return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); 1123 return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
1120 } 1124 }
1121 1125
1122 static inline int valid_section_nr(unsigned long nr) 1126 static inline int valid_section_nr(unsigned long nr)
1123 { 1127 {
1124 return valid_section(__nr_to_section(nr)); 1128 return valid_section(__nr_to_section(nr));
1125 } 1129 }
1126 1130
1127 static inline struct mem_section *__pfn_to_section(unsigned long pfn) 1131 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
1128 { 1132 {
1129 return __nr_to_section(pfn_to_section_nr(pfn)); 1133 return __nr_to_section(pfn_to_section_nr(pfn));
1130 } 1134 }
1131 1135
1132 #ifndef CONFIG_HAVE_ARCH_PFN_VALID 1136 #ifndef CONFIG_HAVE_ARCH_PFN_VALID
1133 static inline int pfn_valid(unsigned long pfn) 1137 static inline int pfn_valid(unsigned long pfn)
1134 { 1138 {
1135 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) 1139 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
1136 return 0; 1140 return 0;
1137 return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); 1141 return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
1138 } 1142 }
1139 #endif 1143 #endif
1140 1144
1141 static inline int pfn_present(unsigned long pfn) 1145 static inline int pfn_present(unsigned long pfn)
1142 { 1146 {
1143 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) 1147 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
1144 return 0; 1148 return 0;
1145 return present_section(__nr_to_section(pfn_to_section_nr(pfn))); 1149 return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
1146 } 1150 }
1147 1151
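
A side note on how the SPARSEMEM helpers above fit together: the pfn-to-page translation is just a composition of them. The sketch below mirrors what the generic __pfn_to_page() macro does for classic (non-vmemmap) SPARSEMEM; the function name here is invented purely for illustration.

	/* Sketch only: pfn -> struct page under SPARSEMEM, built from the
	 * helpers declared above (essentially __pfn_to_page() from
	 * include/asm-generic/memory_model.h). */
	static inline struct page *sketch_pfn_to_page(unsigned long pfn)
	{
		struct mem_section *ms = __pfn_to_section(pfn);

		/* The mem_map pointer stored in section_mem_map is encoded so
		 * that indexing it by the full pfn lands on the right page
		 * (see sparse.c::sparse_init_one_section()). */
		return __section_mem_map_addr(ms) + pfn;
	}
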
1148 /* 1152 /*
1149 * These are _only_ used during initialisation, therefore they 1153 * These are _only_ used during initialisation, therefore they
1150 * can use __initdata ... They could have names to indicate 1154 * can use __initdata ... They could have names to indicate
1151 * this restriction. 1155 * this restriction.
1152 */ 1156 */
1153 #ifdef CONFIG_NUMA 1157 #ifdef CONFIG_NUMA
1154 #define pfn_to_nid(pfn) \ 1158 #define pfn_to_nid(pfn) \
1155 ({ \ 1159 ({ \
1156 unsigned long __pfn_to_nid_pfn = (pfn); \ 1160 unsigned long __pfn_to_nid_pfn = (pfn); \
1157 page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ 1161 page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \
1158 }) 1162 })
1159 #else 1163 #else
1160 #define pfn_to_nid(pfn) (0) 1164 #define pfn_to_nid(pfn) (0)
1161 #endif 1165 #endif
1162 1166
1163 #define early_pfn_valid(pfn) pfn_valid(pfn) 1167 #define early_pfn_valid(pfn) pfn_valid(pfn)
1164 void sparse_init(void); 1168 void sparse_init(void);
1165 #else 1169 #else
1166 #define sparse_init() do {} while (0) 1170 #define sparse_init() do {} while (0)
1167 #define sparse_index_init(_sec, _nid) do {} while (0) 1171 #define sparse_index_init(_sec, _nid) do {} while (0)
1168 #endif /* CONFIG_SPARSEMEM */ 1172 #endif /* CONFIG_SPARSEMEM */
1169 1173
1170 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 1174 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
1171 bool early_pfn_in_nid(unsigned long pfn, int nid); 1175 bool early_pfn_in_nid(unsigned long pfn, int nid);
1172 #else 1176 #else
1173 #define early_pfn_in_nid(pfn, nid) (1) 1177 #define early_pfn_in_nid(pfn, nid) (1)
1174 #endif 1178 #endif
1175 1179
1176 #ifndef early_pfn_valid 1180 #ifndef early_pfn_valid
1177 #define early_pfn_valid(pfn) (1) 1181 #define early_pfn_valid(pfn) (1)
1178 #endif 1182 #endif
1179 1183
1180 void memory_present(int nid, unsigned long start, unsigned long end); 1184 void memory_present(int nid, unsigned long start, unsigned long end);
1181 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); 1185 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
1182 1186
1183 /* 1187 /*
1184 * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we 1188 * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
1185 * need to check pfn validity within that MAX_ORDER_NR_PAGES block. 1189 * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
1186 * pfn_valid_within() should be used in this case; we optimise this away 1190 * pfn_valid_within() should be used in this case; we optimise this away
1187 * when we have no holes within a MAX_ORDER_NR_PAGES block. 1191 * when we have no holes within a MAX_ORDER_NR_PAGES block.
1188 */ 1192 */
1189 #ifdef CONFIG_HOLES_IN_ZONE 1193 #ifdef CONFIG_HOLES_IN_ZONE
1190 #define pfn_valid_within(pfn) pfn_valid(pfn) 1194 #define pfn_valid_within(pfn) pfn_valid(pfn)
1191 #else 1195 #else
1192 #define pfn_valid_within(pfn) (1) 1196 #define pfn_valid_within(pfn) (1)
1193 #endif 1197 #endif
1194 1198
1195 #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL 1199 #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
1196 /* 1200 /*
1197 * pfn_valid() is meant to be able to tell if a given PFN has valid memmap 1201 * pfn_valid() is meant to be able to tell if a given PFN has valid memmap
1198 * associated with it or not. In FLATMEM, it is expected that holes always 1202 * associated with it or not. In FLATMEM, it is expected that holes always
1199 * have valid memmap as long as there are valid PFNs on either side of the hole. 1203 * have valid memmap as long as there are valid PFNs on either side of the hole.
1200 * In SPARSEMEM, it is assumed that a valid section has a memmap for the 1204 * In SPARSEMEM, it is assumed that a valid section has a memmap for the
1201 * entire section. 1205 * entire section.
1202 * 1206 *
1203 * However, ARM, and maybe other embedded architectures in the future, 1207 * However, ARM, and maybe other embedded architectures in the future,
1204 * free memmap backing holes to save memory on the assumption the memmap is 1208 * free memmap backing holes to save memory on the assumption the memmap is
1205 * never used. The page_zone linkages are then broken even though pfn_valid() 1209 * never used. The page_zone linkages are then broken even though pfn_valid()
1206 * returns true. A walker of the full memmap must then do this additional 1210 * returns true. A walker of the full memmap must then do this additional
1207 * check to ensure the memmap they are looking at is sane by making sure 1211 * check to ensure the memmap they are looking at is sane by making sure
1208 * the zone and PFN linkages are still valid. This is expensive, but walkers 1212 * the zone and PFN linkages are still valid. This is expensive, but walkers
1209 * of the full memmap are extremely rare. 1213 * of the full memmap are extremely rare.
1210 */ 1214 */
1211 int memmap_valid_within(unsigned long pfn, 1215 int memmap_valid_within(unsigned long pfn,
1212 struct page *page, struct zone *zone); 1216 struct page *page, struct zone *zone);
1213 #else 1217 #else
1214 static inline int memmap_valid_within(unsigned long pfn, 1218 static inline int memmap_valid_within(unsigned long pfn,
1215 struct page *page, struct zone *zone) 1219 struct page *page, struct zone *zone)
1216 { 1220 {
1217 return 1; 1221 return 1;
1218 } 1222 }
1219 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 1223 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
1220 1224
1221 #endif /* !__GENERATING_BOUNDS.H */ 1225 #endif /* !__GENERATING_BOUNDS.H */
1222 #endif /* !__ASSEMBLY__ */ 1226 #endif /* !__ASSEMBLY__ */
1223 #endif /* _LINUX_MMZONE_H */ 1227 #endif /* _LINUX_MMZONE_H */
1224 1228
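For readers unfamiliar with the zonelist iterators declared in the header above, here is a minimal usage sketch. The allocation context is hypothetical; node_zonelist() and gfp_zone() are the usual gfp.h helpers, and the loop body is only a placeholder:

	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
	enum zone_type high_zoneidx = gfp_zone(GFP_KERNEL);
	struct zoneref *z;
	struct zone *zone;

	/* Walk every zone the allocation could use, highest zone first,
	 * optionally filtered by a nodemask (NULL means no filtering). */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, NULL) {
		if (!populated_zone(zone))
			continue;
		/* ... per-zone work ... */
	}
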
1 /* 1 /*
2 * linux/mm/compaction.c 2 * linux/mm/compaction.c
3 * 3 *
4 * Memory compaction for the reduction of external fragmentation. Note that 4 * Memory compaction for the reduction of external fragmentation. Note that
5 * this heavily depends upon page migration to do all the real heavy 5 * this heavily depends upon page migration to do all the real heavy
6 * lifting 6 * lifting
7 * 7 *
8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> 8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
9 */ 9 */
10 #include <linux/swap.h> 10 #include <linux/swap.h>
11 #include <linux/migrate.h> 11 #include <linux/migrate.h>
12 #include <linux/compaction.h> 12 #include <linux/compaction.h>
13 #include <linux/mm_inline.h> 13 #include <linux/mm_inline.h>
14 #include <linux/backing-dev.h> 14 #include <linux/backing-dev.h>
15 #include <linux/sysctl.h> 15 #include <linux/sysctl.h>
16 #include <linux/sysfs.h> 16 #include <linux/sysfs.h>
17 #include "internal.h" 17 #include "internal.h"
18 18
19 #if defined CONFIG_COMPACTION || defined CONFIG_CMA 19 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
20 20
21 #define CREATE_TRACE_POINTS 21 #define CREATE_TRACE_POINTS
22 #include <trace/events/compaction.h> 22 #include <trace/events/compaction.h>
23 23
24 static unsigned long release_freepages(struct list_head *freelist) 24 static unsigned long release_freepages(struct list_head *freelist)
25 { 25 {
26 struct page *page, *next; 26 struct page *page, *next;
27 unsigned long count = 0; 27 unsigned long count = 0;
28 28
29 list_for_each_entry_safe(page, next, freelist, lru) { 29 list_for_each_entry_safe(page, next, freelist, lru) {
30 list_del(&page->lru); 30 list_del(&page->lru);
31 __free_page(page); 31 __free_page(page);
32 count++; 32 count++;
33 } 33 }
34 34
35 return count; 35 return count;
36 } 36 }
37 37
38 static void map_pages(struct list_head *list) 38 static void map_pages(struct list_head *list)
39 { 39 {
40 struct page *page; 40 struct page *page;
41 41
42 list_for_each_entry(page, list, lru) { 42 list_for_each_entry(page, list, lru) {
43 arch_alloc_page(page, 0); 43 arch_alloc_page(page, 0);
44 kernel_map_pages(page, 1, 1); 44 kernel_map_pages(page, 1, 1);
45 } 45 }
46 } 46 }
47 47
48 static inline bool migrate_async_suitable(int migratetype) 48 static inline bool migrate_async_suitable(int migratetype)
49 { 49 {
50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; 50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
51 } 51 }
52 52
53 /* 53 /*
54 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 54 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
55 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 55 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
56 * pages inside of the pageblock (even though it may still end up isolating 56 * pages inside of the pageblock (even though it may still end up isolating
57 * some pages). 57 * some pages).
58 */ 58 */
59 static unsigned long isolate_freepages_block(unsigned long blockpfn, 59 static unsigned long isolate_freepages_block(unsigned long blockpfn,
60 unsigned long end_pfn, 60 unsigned long end_pfn,
61 struct list_head *freelist, 61 struct list_head *freelist,
62 bool strict) 62 bool strict)
63 { 63 {
64 int nr_scanned = 0, total_isolated = 0; 64 int nr_scanned = 0, total_isolated = 0;
65 struct page *cursor; 65 struct page *cursor;
66 66
67 cursor = pfn_to_page(blockpfn); 67 cursor = pfn_to_page(blockpfn);
68 68
69 /* Isolate free pages. This assumes the block is valid */ 69 /* Isolate free pages. This assumes the block is valid */
70 for (; blockpfn < end_pfn; blockpfn++, cursor++) { 70 for (; blockpfn < end_pfn; blockpfn++, cursor++) {
71 int isolated, i; 71 int isolated, i;
72 struct page *page = cursor; 72 struct page *page = cursor;
73 73
74 if (!pfn_valid_within(blockpfn)) { 74 if (!pfn_valid_within(blockpfn)) {
75 if (strict) 75 if (strict)
76 return 0; 76 return 0;
77 continue; 77 continue;
78 } 78 }
79 nr_scanned++; 79 nr_scanned++;
80 80
81 if (!PageBuddy(page)) { 81 if (!PageBuddy(page)) {
82 if (strict) 82 if (strict)
83 return 0; 83 return 0;
84 continue; 84 continue;
85 } 85 }
86 86
87 /* Found a free page, break it into order-0 pages */ 87 /* Found a free page, break it into order-0 pages */
88 isolated = split_free_page(page); 88 isolated = split_free_page(page);
89 if (!isolated && strict) 89 if (!isolated && strict)
90 return 0; 90 return 0;
91 total_isolated += isolated; 91 total_isolated += isolated;
92 for (i = 0; i < isolated; i++) { 92 for (i = 0; i < isolated; i++) {
93 list_add(&page->lru, freelist); 93 list_add(&page->lru, freelist);
94 page++; 94 page++;
95 } 95 }
96 96
97 /* If a page was split, advance to the end of it */ 97 /* If a page was split, advance to the end of it */
98 if (isolated) { 98 if (isolated) {
99 blockpfn += isolated - 1; 99 blockpfn += isolated - 1;
100 cursor += isolated - 1; 100 cursor += isolated - 1;
101 } 101 }
102 } 102 }
103 103
104 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); 104 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
105 return total_isolated; 105 return total_isolated;
106 } 106 }
107 107
108 /** 108 /**
109 * isolate_freepages_range() - isolate free pages. 109 * isolate_freepages_range() - isolate free pages.
110 * @start_pfn: The first PFN to start isolating. 110 * @start_pfn: The first PFN to start isolating.
111 * @end_pfn: The one-past-last PFN. 111 * @end_pfn: The one-past-last PFN.
112 * 112 *
113 * Non-free pages, invalid PFNs, or zone boundaries within the 113 * Non-free pages, invalid PFNs, or zone boundaries within the
114 * [start_pfn, end_pfn) range are considered errors, cause function to 114 * [start_pfn, end_pfn) range are considered errors, cause function to
115 * undo its actions and return zero. 115 * undo its actions and return zero.
116 * 116 *
117 * Otherwise, function returns one-past-the-last PFN of isolated page 117 * Otherwise, function returns one-past-the-last PFN of isolated page
118 * (which may be greater than end_pfn if end fell in the middle of 118 * (which may be greater than end_pfn if end fell in the middle of
119 * a free page). 119 * a free page).
120 */ 120 */
121 unsigned long 121 unsigned long
122 isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) 122 isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
123 { 123 {
124 unsigned long isolated, pfn, block_end_pfn, flags; 124 unsigned long isolated, pfn, block_end_pfn, flags;
125 struct zone *zone = NULL; 125 struct zone *zone = NULL;
126 LIST_HEAD(freelist); 126 LIST_HEAD(freelist);
127 127
128 if (pfn_valid(start_pfn)) 128 if (pfn_valid(start_pfn))
129 zone = page_zone(pfn_to_page(start_pfn)); 129 zone = page_zone(pfn_to_page(start_pfn));
130 130
131 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { 131 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
132 if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) 132 if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
133 break; 133 break;
134 134
135 /* 135 /*
136 * On subsequent iterations ALIGN() is actually not needed, 136 * On subsequent iterations ALIGN() is actually not needed,
137 * but we keep it so as not to complicate the code. 137 * but we keep it so as not to complicate the code.
138 */ 138 */
139 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 139 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
140 block_end_pfn = min(block_end_pfn, end_pfn); 140 block_end_pfn = min(block_end_pfn, end_pfn);
141 141
142 spin_lock_irqsave(&zone->lock, flags); 142 spin_lock_irqsave(&zone->lock, flags);
143 isolated = isolate_freepages_block(pfn, block_end_pfn, 143 isolated = isolate_freepages_block(pfn, block_end_pfn,
144 &freelist, true); 144 &freelist, true);
145 spin_unlock_irqrestore(&zone->lock, flags); 145 spin_unlock_irqrestore(&zone->lock, flags);
146 146
147 /* 147 /*
148 * In strict mode, isolate_freepages_block() returns 0 if 148 * In strict mode, isolate_freepages_block() returns 0 if
149 * there are any holes in the block (ie. invalid PFNs or 149 * there are any holes in the block (ie. invalid PFNs or
150 * non-free pages). 150 * non-free pages).
151 */ 151 */
152 if (!isolated) 152 if (!isolated)
153 break; 153 break;
154 154
155 /* 155 /*
156 * If we managed to isolate pages, it is always (1 << n) * 156 * If we managed to isolate pages, it is always (1 << n) *
157 * pageblock_nr_pages for some non-negative n. (Max order 157 * pageblock_nr_pages for some non-negative n. (Max order
158 * page may span two pageblocks). 158 * page may span two pageblocks).
159 */ 159 */
160 } 160 }
161 161
162 /* split_free_page does not map the pages */ 162 /* split_free_page does not map the pages */
163 map_pages(&freelist); 163 map_pages(&freelist);
164 164
165 if (pfn < end_pfn) { 165 if (pfn < end_pfn) {
166 /* Loop terminated early, cleanup. */ 166 /* Loop terminated early, cleanup. */
167 release_freepages(&freelist); 167 release_freepages(&freelist);
168 return 0; 168 return 0;
169 } 169 }
170 170
171 /* We don't use freelists for anything. */ 171 /* We don't use freelists for anything. */
172 return pfn; 172 return pfn;
173 } 173 }
174 174
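To make the contract in the isolate_freepages_range() kerneldoc concrete, a hedged sketch of a caller follows; the range variables and the error handling are illustrative assumptions, not taken from this patch:

	unsigned long outer_end;

	/* Try to pull every free page in [start, end) onto a private list.
	 * A return of 0 means the range contained an invalid pfn or an
	 * in-use page, and anything already isolated was released again. */
	outer_end = isolate_freepages_range(start, end);
	if (!outer_end)
		return -EBUSY;

	/* Success: outer_end is one past the last isolated pfn and may lie
	 * beyond 'end' if the final buddy page straddled the boundary. */
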
175 /* Update the number of anon and file isolated pages in the zone */ 175 /* Update the number of anon and file isolated pages in the zone */
176 static void acct_isolated(struct zone *zone, struct compact_control *cc) 176 static void acct_isolated(struct zone *zone, struct compact_control *cc)
177 { 177 {
178 struct page *page; 178 struct page *page;
179 unsigned int count[2] = { 0, }; 179 unsigned int count[2] = { 0, };
180 180
181 list_for_each_entry(page, &cc->migratepages, lru) 181 list_for_each_entry(page, &cc->migratepages, lru)
182 count[!!page_is_file_cache(page)]++; 182 count[!!page_is_file_cache(page)]++;
183 183
184 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); 184 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
185 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); 185 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
186 } 186 }
187 187
188 /* Similar to reclaim, but different enough that they don't share logic */ 188 /* Similar to reclaim, but different enough that they don't share logic */
189 static bool too_many_isolated(struct zone *zone) 189 static bool too_many_isolated(struct zone *zone)
190 { 190 {
191 unsigned long active, inactive, isolated; 191 unsigned long active, inactive, isolated;
192 192
193 inactive = zone_page_state(zone, NR_INACTIVE_FILE) + 193 inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
194 zone_page_state(zone, NR_INACTIVE_ANON); 194 zone_page_state(zone, NR_INACTIVE_ANON);
195 active = zone_page_state(zone, NR_ACTIVE_FILE) + 195 active = zone_page_state(zone, NR_ACTIVE_FILE) +
196 zone_page_state(zone, NR_ACTIVE_ANON); 196 zone_page_state(zone, NR_ACTIVE_ANON);
197 isolated = zone_page_state(zone, NR_ISOLATED_FILE) + 197 isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
198 zone_page_state(zone, NR_ISOLATED_ANON); 198 zone_page_state(zone, NR_ISOLATED_ANON);
199 199
200 return isolated > (inactive + active) / 2; 200 return isolated > (inactive + active) / 2;
201 } 201 }
202 202
203 /** 203 /**
204 * isolate_migratepages_range() - isolate all migrate-able pages in range. 204 * isolate_migratepages_range() - isolate all migrate-able pages in range.
205 * @zone: Zone pages are in. 205 * @zone: Zone pages are in.
206 * @cc: Compaction control structure. 206 * @cc: Compaction control structure.
207 * @low_pfn: The first PFN of the range. 207 * @low_pfn: The first PFN of the range.
208 * @end_pfn: The one-past-the-last PFN of the range. 208 * @end_pfn: The one-past-the-last PFN of the range.
209 * 209 *
210 * Isolate all pages that can be migrated from the range specified by 210 * Isolate all pages that can be migrated from the range specified by
211 * [low_pfn, end_pfn). Returns zero if there is a fatal signal 211 * [low_pfn, end_pfn). Returns zero if there is a fatal signal
212 * pending, otherwise PFN of the first page that was not scanned 212 * pending, otherwise PFN of the first page that was not scanned
213 * (which may be less than, equal to or greater than end_pfn). 213 * (which may be less than, equal to or greater than end_pfn).
214 * 214 *
215 * Assumes that cc->migratepages is empty and cc->nr_migratepages is 215 * Assumes that cc->migratepages is empty and cc->nr_migratepages is
216 * zero. 216 * zero.
217 * 217 *
218 * Apart from cc->migratepages and cc->nr_migratepages this function 218 * Apart from cc->migratepages and cc->nr_migratepages this function
219 * does not modify any cc's fields, in particular it does not modify 219 * does not modify any cc's fields, in particular it does not modify
220 * (or read for that matter) cc->migrate_pfn. 220 * (or read for that matter) cc->migrate_pfn.
221 */ 221 */
222 unsigned long 222 unsigned long
223 isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 223 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
224 unsigned long low_pfn, unsigned long end_pfn) 224 unsigned long low_pfn, unsigned long end_pfn)
225 { 225 {
226 unsigned long last_pageblock_nr = 0, pageblock_nr; 226 unsigned long last_pageblock_nr = 0, pageblock_nr;
227 unsigned long nr_scanned = 0, nr_isolated = 0; 227 unsigned long nr_scanned = 0, nr_isolated = 0;
228 struct list_head *migratelist = &cc->migratepages; 228 struct list_head *migratelist = &cc->migratepages;
229 isolate_mode_t mode = 0; 229 isolate_mode_t mode = 0;
230 struct lruvec *lruvec; 230 struct lruvec *lruvec;
231 231
232 /* 232 /*
233 * Ensure that there are not too many pages isolated from the LRU 233 * Ensure that there are not too many pages isolated from the LRU
234 * list by either parallel reclaimers or compaction. If there are, 234 * list by either parallel reclaimers or compaction. If there are,
235 * delay for some time until fewer pages are isolated 235 * delay for some time until fewer pages are isolated
236 */ 236 */
237 while (unlikely(too_many_isolated(zone))) { 237 while (unlikely(too_many_isolated(zone))) {
238 /* async migration should just abort */ 238 /* async migration should just abort */
239 if (!cc->sync) 239 if (!cc->sync)
240 return 0; 240 return 0;
241 241
242 congestion_wait(BLK_RW_ASYNC, HZ/10); 242 congestion_wait(BLK_RW_ASYNC, HZ/10);
243 243
244 if (fatal_signal_pending(current)) 244 if (fatal_signal_pending(current))
245 return 0; 245 return 0;
246 } 246 }
247 247
248 /* Time to isolate some pages for migration */ 248 /* Time to isolate some pages for migration */
249 cond_resched(); 249 cond_resched();
250 spin_lock_irq(&zone->lru_lock); 250 spin_lock_irq(&zone->lru_lock);
251 for (; low_pfn < end_pfn; low_pfn++) { 251 for (; low_pfn < end_pfn; low_pfn++) {
252 struct page *page; 252 struct page *page;
253 bool locked = true; 253 bool locked = true;
254 254
255 /* give a chance to irqs before checking need_resched() */ 255 /* give a chance to irqs before checking need_resched() */
256 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { 256 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
257 spin_unlock_irq(&zone->lru_lock); 257 spin_unlock_irq(&zone->lru_lock);
258 locked = false; 258 locked = false;
259 } 259 }
260 if (need_resched() || spin_is_contended(&zone->lru_lock)) { 260 if (need_resched() || spin_is_contended(&zone->lru_lock)) {
261 if (locked) 261 if (locked)
262 spin_unlock_irq(&zone->lru_lock); 262 spin_unlock_irq(&zone->lru_lock);
263 cond_resched(); 263 cond_resched();
264 spin_lock_irq(&zone->lru_lock); 264 spin_lock_irq(&zone->lru_lock);
265 if (fatal_signal_pending(current)) 265 if (fatal_signal_pending(current))
266 break; 266 break;
267 } else if (!locked) 267 } else if (!locked)
268 spin_lock_irq(&zone->lru_lock); 268 spin_lock_irq(&zone->lru_lock);
269 269
270 /* 270 /*
271 * migrate_pfn does not necessarily start aligned to a 271 * migrate_pfn does not necessarily start aligned to a
272 * pageblock. Ensure that pfn_valid is called when moving 272 * pageblock. Ensure that pfn_valid is called when moving
273 * into a new MAX_ORDER_NR_PAGES range in case of large 273 * into a new MAX_ORDER_NR_PAGES range in case of large
274 * memory holes within the zone 274 * memory holes within the zone
275 */ 275 */
276 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { 276 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
277 if (!pfn_valid(low_pfn)) { 277 if (!pfn_valid(low_pfn)) {
278 low_pfn += MAX_ORDER_NR_PAGES - 1; 278 low_pfn += MAX_ORDER_NR_PAGES - 1;
279 continue; 279 continue;
280 } 280 }
281 } 281 }
282 282
283 if (!pfn_valid_within(low_pfn)) 283 if (!pfn_valid_within(low_pfn))
284 continue; 284 continue;
285 nr_scanned++; 285 nr_scanned++;
286 286
287 /* 287 /*
288 * Get the page and ensure the page is within the same zone. 288 * Get the page and ensure the page is within the same zone.
289 * See the comment in isolate_freepages about overlapping 289 * See the comment in isolate_freepages about overlapping
290 * nodes. It is deliberate that the new zone lock is not taken 290 * nodes. It is deliberate that the new zone lock is not taken
291 * as memory compaction should not move pages between nodes. 291 * as memory compaction should not move pages between nodes.
292 */ 292 */
293 page = pfn_to_page(low_pfn); 293 page = pfn_to_page(low_pfn);
294 if (page_zone(page) != zone) 294 if (page_zone(page) != zone)
295 continue; 295 continue;
296 296
297 /* Skip if free */ 297 /* Skip if free */
298 if (PageBuddy(page)) 298 if (PageBuddy(page))
299 continue; 299 continue;
300 300
301 /* 301 /*
302 * For async migration, also only scan in MOVABLE blocks. Async 302 * For async migration, also only scan in MOVABLE blocks. Async
303 * migration is optimistic to see if the minimum amount of work 303 * migration is optimistic to see if the minimum amount of work
304 * satisfies the allocation 304 * satisfies the allocation
305 */ 305 */
306 pageblock_nr = low_pfn >> pageblock_order; 306 pageblock_nr = low_pfn >> pageblock_order;
307 if (!cc->sync && last_pageblock_nr != pageblock_nr && 307 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
308 !migrate_async_suitable(get_pageblock_migratetype(page))) { 308 !migrate_async_suitable(get_pageblock_migratetype(page))) {
309 low_pfn += pageblock_nr_pages; 309 low_pfn += pageblock_nr_pages;
310 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; 310 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
311 last_pageblock_nr = pageblock_nr; 311 last_pageblock_nr = pageblock_nr;
312 continue; 312 continue;
313 } 313 }
314 314
315 if (!PageLRU(page)) 315 if (!PageLRU(page))
316 continue; 316 continue;
317 317
318 /* 318 /*
319 * PageLRU is set, and lru_lock excludes isolation, 319 * PageLRU is set, and lru_lock excludes isolation,
320 * splitting and collapsing (collapsing has already 320 * splitting and collapsing (collapsing has already
321 * happened if PageLRU is set). 321 * happened if PageLRU is set).
322 */ 322 */
323 if (PageTransHuge(page)) { 323 if (PageTransHuge(page)) {
324 low_pfn += (1 << compound_order(page)) - 1; 324 low_pfn += (1 << compound_order(page)) - 1;
325 continue; 325 continue;
326 } 326 }
327 327
328 if (!cc->sync) 328 if (!cc->sync)
329 mode |= ISOLATE_ASYNC_MIGRATE; 329 mode |= ISOLATE_ASYNC_MIGRATE;
330 330
331 lruvec = mem_cgroup_page_lruvec(page, zone); 331 lruvec = mem_cgroup_page_lruvec(page, zone);
332 332
333 /* Try isolate the page */ 333 /* Try isolate the page */
334 if (__isolate_lru_page(page, mode) != 0) 334 if (__isolate_lru_page(page, mode) != 0)
335 continue; 335 continue;
336 336
337 VM_BUG_ON(PageTransCompound(page)); 337 VM_BUG_ON(PageTransCompound(page));
338 338
339 /* Successfully isolated */ 339 /* Successfully isolated */
340 del_page_from_lru_list(page, lruvec, page_lru(page)); 340 del_page_from_lru_list(page, lruvec, page_lru(page));
341 list_add(&page->lru, migratelist); 341 list_add(&page->lru, migratelist);
342 cc->nr_migratepages++; 342 cc->nr_migratepages++;
343 nr_isolated++; 343 nr_isolated++;
344 344
345 /* Avoid isolating too much */ 345 /* Avoid isolating too much */
346 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 346 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
347 ++low_pfn; 347 ++low_pfn;
348 break; 348 break;
349 } 349 }
350 } 350 }
351 351
352 acct_isolated(zone, cc); 352 acct_isolated(zone, cc);
353 353
354 spin_unlock_irq(&zone->lru_lock); 354 spin_unlock_irq(&zone->lru_lock);
355 355
356 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 356 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
357 357
358 return low_pfn; 358 return low_pfn;
359 } 359 }
360 360
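A similarly hedged sketch of how isolate_migratepages_range() is meant to be called, relying only on the pre- and post-conditions spelled out in its kerneldoc; the surrounding compact_control setup is abbreviated and the return value of the caller is illustrative:

	unsigned long next_pfn;

	/* Kerneldoc precondition: nothing already queued for migration. */
	VM_BUG_ON(!list_empty(&cc->migratepages));
	cc->nr_migratepages = 0;

	next_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
	if (!next_pfn)
		return 0;	/* fatal signal pending, abort */

	/* Pages to migrate are now on cc->migratepages; next_pfn records
	 * where scanning stopped (it may be below or above end_pfn). */
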
361 #endif /* CONFIG_COMPACTION || CONFIG_CMA */ 361 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
362 #ifdef CONFIG_COMPACTION 362 #ifdef CONFIG_COMPACTION
363 363
364 /* Returns true if the page is within a block suitable for migration to */ 364 /* Returns true if the page is within a block suitable for migration to */
365 static bool suitable_migration_target(struct page *page) 365 static bool suitable_migration_target(struct page *page)
366 { 366 {
367 367
368 int migratetype = get_pageblock_migratetype(page); 368 int migratetype = get_pageblock_migratetype(page);
369 369
370 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ 370 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
371 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) 371 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
372 return false; 372 return false;
373 373
374 /* If the page is a large free page, then allow migration */ 374 /* If the page is a large free page, then allow migration */
375 if (PageBuddy(page) && page_order(page) >= pageblock_order) 375 if (PageBuddy(page) && page_order(page) >= pageblock_order)
376 return true; 376 return true;
377 377
378 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ 378 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
379 if (migrate_async_suitable(migratetype)) 379 if (migrate_async_suitable(migratetype))
380 return true; 380 return true;
381 381
382 /* Otherwise skip the block */ 382 /* Otherwise skip the block */
383 return false; 383 return false;
384 } 384 }
385 385
386 /* 386 /*
387 * Based on information in the current compact_control, find blocks 387 * Based on information in the current compact_control, find blocks
388 * suitable for isolating free pages from and then isolate them. 388 * suitable for isolating free pages from and then isolate them.
389 */ 389 */
390 static void isolate_freepages(struct zone *zone, 390 static void isolate_freepages(struct zone *zone,
391 struct compact_control *cc) 391 struct compact_control *cc)
392 { 392 {
393 struct page *page; 393 struct page *page;
394 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; 394 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
395 unsigned long flags; 395 unsigned long flags;
396 int nr_freepages = cc->nr_freepages; 396 int nr_freepages = cc->nr_freepages;
397 struct list_head *freelist = &cc->freepages; 397 struct list_head *freelist = &cc->freepages;
398 398
399 /* 399 /*
400 * Initialise the free scanner. The starting point is where we last 400 * Initialise the free scanner. The starting point is where we last
401 * scanned from (or the end of the zone if starting). The low point 401 * scanned from (or the end of the zone if starting). The low point
402 * is the end of the pageblock the migration scanner is using. 402 * is the end of the pageblock the migration scanner is using.
403 */ 403 */
404 pfn = cc->free_pfn; 404 pfn = cc->free_pfn;
405 low_pfn = cc->migrate_pfn + pageblock_nr_pages; 405 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
406 406
407 /* 407 /*
408 * Take care that if the migration scanner is at the end of the zone 408 * Take care that if the migration scanner is at the end of the zone
409 * that the free scanner does not accidentally move to the next zone 409 * that the free scanner does not accidentally move to the next zone
410 * in the next isolation cycle. 410 * in the next isolation cycle.
411 */ 411 */
412 high_pfn = min(low_pfn, pfn); 412 high_pfn = min(low_pfn, pfn);
413 413
414 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 414 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
415 415
416 /* 416 /*
417 * Isolate free pages until enough are available to migrate the 417 * Isolate free pages until enough are available to migrate the
418 * pages on cc->migratepages. We stop searching if the migrate 418 * pages on cc->migratepages. We stop searching if the migrate
419 * and free page scanners meet or enough free pages are isolated. 419 * and free page scanners meet or enough free pages are isolated.
420 */ 420 */
421 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 421 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
422 pfn -= pageblock_nr_pages) { 422 pfn -= pageblock_nr_pages) {
423 unsigned long isolated; 423 unsigned long isolated;
424 424
425 /*
426 * Skip ahead if another thread is compacting in the area
427 * simultaneously. If we wrapped around, we can only skip
428 * ahead if zone->compact_cached_free_pfn also wrapped to
429 * above our starting point.
430 */
431 if (cc->order > 0 && (!cc->wrapped ||
432 zone->compact_cached_free_pfn >
433 cc->start_free_pfn))
434 pfn = min(pfn, zone->compact_cached_free_pfn);
435
425 if (!pfn_valid(pfn)) 436 if (!pfn_valid(pfn))
426 continue; 437 continue;
427 438
428 /* 439 /*
429 * Check for overlapping nodes/zones. It's possible on some 440 * Check for overlapping nodes/zones. It's possible on some
430 * configurations to have a setup like 441 * configurations to have a setup like
431 * node0 node1 node0 442 * node0 node1 node0
432 * i.e. it's possible that all pages within a zones range of 443 * i.e. it's possible that all pages within a zones range of
433 * pages do not belong to a single zone. 444 * pages do not belong to a single zone.
434 */ 445 */
435 page = pfn_to_page(pfn); 446 page = pfn_to_page(pfn);
436 if (page_zone(page) != zone) 447 if (page_zone(page) != zone)
437 continue; 448 continue;
438 449
439 /* Check the block is suitable for migration */ 450 /* Check the block is suitable for migration */
440 if (!suitable_migration_target(page)) 451 if (!suitable_migration_target(page))
441 continue; 452 continue;
442 453
443 /* 454 /*
444 * Found a block suitable for isolating free pages from. Now 455 * Found a block suitable for isolating free pages from. Now
445 * we disabled interrupts, double check things are ok and 456 * we disabled interrupts, double check things are ok and
446 * isolate the pages. This is to minimise the time IRQs 457 * isolate the pages. This is to minimise the time IRQs
447 * are disabled 458 * are disabled
448 */ 459 */
449 isolated = 0; 460 isolated = 0;
450 spin_lock_irqsave(&zone->lock, flags); 461 spin_lock_irqsave(&zone->lock, flags);
451 if (suitable_migration_target(page)) { 462 if (suitable_migration_target(page)) {
452 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); 463 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
453 isolated = isolate_freepages_block(pfn, end_pfn, 464 isolated = isolate_freepages_block(pfn, end_pfn,
454 freelist, false); 465 freelist, false);
455 nr_freepages += isolated; 466 nr_freepages += isolated;
456 } 467 }
457 spin_unlock_irqrestore(&zone->lock, flags); 468 spin_unlock_irqrestore(&zone->lock, flags);
458 469
459 /* 470 /*
460 * Record the highest PFN we isolated pages from. When next 471 * Record the highest PFN we isolated pages from. When next
461 * looking for free pages, the search will restart here as 472 * looking for free pages, the search will restart here as
462 * page migration may have returned some pages to the allocator 473 * page migration may have returned some pages to the allocator
463 */ 474 */
464 if (isolated) 475 if (isolated) {
465 high_pfn = max(high_pfn, pfn); 476 high_pfn = max(high_pfn, pfn);
477 if (cc->order > 0)
478 zone->compact_cached_free_pfn = high_pfn;
479 }
466 } 480 }
467 481
468 /* split_free_page does not map the pages */ 482 /* split_free_page does not map the pages */
469 map_pages(freelist); 483 map_pages(freelist);
470 484
471 cc->free_pfn = high_pfn; 485 cc->free_pfn = high_pfn;
472 cc->nr_freepages = nr_freepages; 486 cc->nr_freepages = nr_freepages;
473 } 487 }
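The hunk above is the heart of the change in isolate_freepages(): an order > 0 run first jumps down to zone->compact_cached_free_pfn (where a previous run stopped) unless it has wrapped and the cache still points above its own starting pfn, and it pushes the cache down each time it isolates pages. A minimal userspace sketch of that skip-ahead rule, with illustrative pfn values (all names and numbers here exist only for the example, not in the kernel):

/* Plain C sketch of the skip-ahead rule; not kernel code. */
#include <stdio.h>

static unsigned long next_scan_pfn(unsigned long pfn,
                                   unsigned long cached_free_pfn,
                                   unsigned long start_free_pfn,
                                   int order, int wrapped)
{
        /*
         * Order > 0: jump to the cached position unless we wrapped and the
         * cache still points above where this run started.
         */
        if (order > 0 && (!wrapped || cached_free_pfn > start_free_pfn))
                return pfn < cached_free_pfn ? pfn : cached_free_pfn;
        return pfn;
}

int main(void)
{
        /* A previous run already consumed everything above pfn 0x9000. */
        printf("0x%lx\n", next_scan_pfn(0xa000, 0x9000, 0xa000, 3, 0)); /* 0x9000 */
        /* After wrapping, a cache below our start must not drag us down. */
        printf("0x%lx\n", next_scan_pfn(0xa000, 0x9000, 0xa000, 3, 1)); /* 0xa000 */
        return 0;
}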
474 488
475 /* 489 /*
476 * This is a migrate-callback that "allocates" freepages by taking pages 490 * This is a migrate-callback that "allocates" freepages by taking pages
477 * from the isolated freelists in the block we are migrating to. 491 * from the isolated freelists in the block we are migrating to.
478 */ 492 */
479 static struct page *compaction_alloc(struct page *migratepage, 493 static struct page *compaction_alloc(struct page *migratepage,
480 unsigned long data, 494 unsigned long data,
481 int **result) 495 int **result)
482 { 496 {
483 struct compact_control *cc = (struct compact_control *)data; 497 struct compact_control *cc = (struct compact_control *)data;
484 struct page *freepage; 498 struct page *freepage;
485 499
486 /* Isolate free pages if necessary */ 500 /* Isolate free pages if necessary */
487 if (list_empty(&cc->freepages)) { 501 if (list_empty(&cc->freepages)) {
488 isolate_freepages(cc->zone, cc); 502 isolate_freepages(cc->zone, cc);
489 503
490 if (list_empty(&cc->freepages)) 504 if (list_empty(&cc->freepages))
491 return NULL; 505 return NULL;
492 } 506 }
493 507
494 freepage = list_entry(cc->freepages.next, struct page, lru); 508 freepage = list_entry(cc->freepages.next, struct page, lru);
495 list_del(&freepage->lru); 509 list_del(&freepage->lru);
496 cc->nr_freepages--; 510 cc->nr_freepages--;
497 511
498 return freepage; 512 return freepage;
499 } 513 }
500 514
501 /* 515 /*
502 * We cannot control nr_migratepages and nr_freepages fully when migration is 516 * We cannot control nr_migratepages and nr_freepages fully when migration is
503 * running as migrate_pages() has no knowledge of compact_control. When 517 * running as migrate_pages() has no knowledge of compact_control. When
504 * migration is complete, we count the number of pages on the lists by hand. 518 * migration is complete, we count the number of pages on the lists by hand.
505 */ 519 */
506 static void update_nr_listpages(struct compact_control *cc) 520 static void update_nr_listpages(struct compact_control *cc)
507 { 521 {
508 int nr_migratepages = 0; 522 int nr_migratepages = 0;
509 int nr_freepages = 0; 523 int nr_freepages = 0;
510 struct page *page; 524 struct page *page;
511 525
512 list_for_each_entry(page, &cc->migratepages, lru) 526 list_for_each_entry(page, &cc->migratepages, lru)
513 nr_migratepages++; 527 nr_migratepages++;
514 list_for_each_entry(page, &cc->freepages, lru) 528 list_for_each_entry(page, &cc->freepages, lru)
515 nr_freepages++; 529 nr_freepages++;
516 530
517 cc->nr_migratepages = nr_migratepages; 531 cc->nr_migratepages = nr_migratepages;
518 cc->nr_freepages = nr_freepages; 532 cc->nr_freepages = nr_freepages;
519 } 533 }
520 534
521 /* possible outcome of isolate_migratepages */ 535 /* possible outcome of isolate_migratepages */
522 typedef enum { 536 typedef enum {
523 ISOLATE_ABORT, /* Abort compaction now */ 537 ISOLATE_ABORT, /* Abort compaction now */
524 ISOLATE_NONE, /* No pages isolated, continue scanning */ 538 ISOLATE_NONE, /* No pages isolated, continue scanning */
525 ISOLATE_SUCCESS, /* Pages isolated, migrate */ 539 ISOLATE_SUCCESS, /* Pages isolated, migrate */
526 } isolate_migrate_t; 540 } isolate_migrate_t;
527 541
528 /* 542 /*
529 * Isolate all pages that can be migrated from the block pointed to by 543 * Isolate all pages that can be migrated from the block pointed to by
530 * the migrate scanner within compact_control. 544 * the migrate scanner within compact_control.
531 */ 545 */
532 static isolate_migrate_t isolate_migratepages(struct zone *zone, 546 static isolate_migrate_t isolate_migratepages(struct zone *zone,
533 struct compact_control *cc) 547 struct compact_control *cc)
534 { 548 {
535 unsigned long low_pfn, end_pfn; 549 unsigned long low_pfn, end_pfn;
536 550
537 /* Do not scan outside zone boundaries */ 551 /* Do not scan outside zone boundaries */
538 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 552 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
539 553
540 /* Only scan within a pageblock boundary */ 554 /* Only scan within a pageblock boundary */
541 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); 555 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
542 556
543 /* Do not cross the free scanner or scan within a memory hole */ 557 /* Do not cross the free scanner or scan within a memory hole */
544 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 558 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
545 cc->migrate_pfn = end_pfn; 559 cc->migrate_pfn = end_pfn;
546 return ISOLATE_NONE; 560 return ISOLATE_NONE;
547 } 561 }
548 562
549 /* Perform the isolation */ 563 /* Perform the isolation */
550 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); 564 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
551 if (!low_pfn) 565 if (!low_pfn)
552 return ISOLATE_ABORT; 566 return ISOLATE_ABORT;
553 567
554 cc->migrate_pfn = low_pfn; 568 cc->migrate_pfn = low_pfn;
555 569
556 return ISOLATE_SUCCESS; 570 return ISOLATE_SUCCESS;
557 } 571 }
558 572
573 /*
574 * Returns the start pfn of the last page block in a zone. This is the starting
575 * point for full compaction of a zone. Compaction searches for free pages from
576 * the end of each zone, while isolate_freepages_block scans forward inside each
577 * page block.
578 */
579 static unsigned long start_free_pfn(struct zone *zone)
580 {
581 unsigned long free_pfn;
582 free_pfn = zone->zone_start_pfn + zone->spanned_pages;
583 free_pfn &= ~(pageblock_nr_pages-1);
584 return free_pfn;
585 }
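start_free_pfn() rounds the zone's end pfn down to a pageblock boundary so the free scanner always begins at the start of the last whole pageblock. A small standalone sketch of the same arithmetic; the pageblock size and zone geometry below are example values only:

/* Plain C sketch of the round-down arithmetic; example numbers only. */
#include <stdio.h>

int main(void)
{
        unsigned long zone_start_pfn = 0x10000;
        unsigned long spanned_pages = 0x12345;     /* zone ends mid-pageblock */
        unsigned long pageblock_nr_pages = 512;    /* assumed for the example */
        unsigned long free_pfn;

        free_pfn = zone_start_pfn + spanned_pages; /* 0x22345 */
        free_pfn &= ~(pageblock_nr_pages - 1);     /* 0x22200 */
        printf("free scanner starts at pfn 0x%lx\n", free_pfn);
        return 0;
}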
586
559 static int compact_finished(struct zone *zone, 587 static int compact_finished(struct zone *zone,
560 struct compact_control *cc) 588 struct compact_control *cc)
561 { 589 {
562 unsigned int order; 590 unsigned int order;
563 unsigned long watermark; 591 unsigned long watermark;
564 592
565 if (fatal_signal_pending(current)) 593 if (fatal_signal_pending(current))
566 return COMPACT_PARTIAL; 594 return COMPACT_PARTIAL;
567 595
568 /* Compaction run completes if the migrate and free scanner meet */ 596 /*
569 if (cc->free_pfn <= cc->migrate_pfn) 597 * A full (order == -1) compaction run starts at the beginning and
598 * end of a zone; it completes when the migrate and free scanner meet.
599 * A partial (order > 0) compaction can start with the free scanner
600 * at a random point in the zone, and may have to restart.
601 */
602 if (cc->free_pfn <= cc->migrate_pfn) {
603 if (cc->order > 0 && !cc->wrapped) {
604 /* We started partway through; restart at the end. */
605 unsigned long free_pfn = start_free_pfn(zone);
606 zone->compact_cached_free_pfn = free_pfn;
607 cc->free_pfn = free_pfn;
608 cc->wrapped = 1;
609 return COMPACT_CONTINUE;
610 }
570 return COMPACT_COMPLETE; 611 return COMPACT_COMPLETE;
612 }
571 613
614 /* We wrapped around and ended up where we started. */
615 if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
616 return COMPACT_COMPLETE;
617
572 /* 618 /*
573 * order == -1 is expected when compacting via 619 * order == -1 is expected when compacting via
574 * /proc/sys/vm/compact_memory 620 * /proc/sys/vm/compact_memory
575 */ 621 */
576 if (cc->order == -1) 622 if (cc->order == -1)
577 return COMPACT_CONTINUE; 623 return COMPACT_CONTINUE;
578 624
579 /* Compaction run is not finished if the watermark is not met */ 625 /* Compaction run is not finished if the watermark is not met */
580 watermark = low_wmark_pages(zone); 626 watermark = low_wmark_pages(zone);
581 watermark += (1 << cc->order); 627 watermark += (1 << cc->order);
582 628
583 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 629 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
584 return COMPACT_CONTINUE; 630 return COMPACT_CONTINUE;
585 631
586 /* Direct compactor: Is a suitable page free? */ 632 /* Direct compactor: Is a suitable page free? */
587 for (order = cc->order; order < MAX_ORDER; order++) { 633 for (order = cc->order; order < MAX_ORDER; order++) {
588 /* Job done if page is free of the right migratetype */ 634 /* Job done if page is free of the right migratetype */
589 if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) 635 if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
590 return COMPACT_PARTIAL; 636 return COMPACT_PARTIAL;
591 637
592 /* Job done if allocation would set block type */ 638 /* Job done if allocation would set block type */
593 if (order >= pageblock_order && zone->free_area[order].nr_free) 639 if (order >= pageblock_order && zone->free_area[order].nr_free)
594 return COMPACT_PARTIAL; 640 return COMPACT_PARTIAL;
595 } 641 }
596 642
597 return COMPACT_CONTINUE; 643 return COMPACT_CONTINUE;
598 } 644 }
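To summarize the new termination rules in compact_finished(): when the scanners meet, an order > 0 run that began mid-zone gets one restart from the zone end (and sets wrapped), and a wrapped run finishes once the free scanner reaches the pfn it originally started from. A compilable sketch of just that decision, with all names local to the example:

/* Plain C sketch of the wrap/restart decision; not kernel code. */
enum { EX_CONTINUE, EX_COMPLETE };

struct ex_state {
        unsigned long free_pfn, migrate_pfn, start_free_pfn, zone_end_pfn;
        int order, wrapped;
};

static int ex_scanners_check(struct ex_state *s)
{
        if (s->free_pfn <= s->migrate_pfn) {
                if (s->order > 0 && !s->wrapped) {
                        /* Started partway through: retry once from the end. */
                        s->free_pfn = s->zone_end_pfn;  /* pageblock aligned */
                        s->wrapped = 1;
                        return EX_CONTINUE;
                }
                return EX_COMPLETE;
        }
        /* Wrapped around and came back to the original starting point. */
        if (s->wrapped && s->free_pfn <= s->start_free_pfn)
                return EX_COMPLETE;
        return EX_CONTINUE;
}

int main(void)
{
        struct ex_state s = { .free_pfn = 0x8000, .migrate_pfn = 0x8100,
                              .start_free_pfn = 0x9000, .zone_end_pfn = 0xa000,
                              .order = 3, .wrapped = 0 };
        /* Scanners met before wrapping: restart from the zone end. */
        return ex_scanners_check(&s);   /* EX_CONTINUE, s.wrapped is now 1 */
}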
599 645
600 /* 646 /*
601 * compaction_suitable: Is this suitable to run compaction on this zone now? 647 * compaction_suitable: Is this suitable to run compaction on this zone now?
602 * Returns 648 * Returns
603 * COMPACT_SKIPPED - If there are too few free pages for compaction 649 * COMPACT_SKIPPED - If there are too few free pages for compaction
604 * COMPACT_PARTIAL - If the allocation would succeed without compaction 650 * COMPACT_PARTIAL - If the allocation would succeed without compaction
605 * COMPACT_CONTINUE - If compaction should run now 651 * COMPACT_CONTINUE - If compaction should run now
606 */ 652 */
607 unsigned long compaction_suitable(struct zone *zone, int order) 653 unsigned long compaction_suitable(struct zone *zone, int order)
608 { 654 {
609 int fragindex; 655 int fragindex;
610 unsigned long watermark; 656 unsigned long watermark;
611 657
612 /* 658 /*
613 * order == -1 is expected when compacting via 659 * order == -1 is expected when compacting via
614 * /proc/sys/vm/compact_memory 660 * /proc/sys/vm/compact_memory
615 */ 661 */
616 if (order == -1) 662 if (order == -1)
617 return COMPACT_CONTINUE; 663 return COMPACT_CONTINUE;
618 664
619 /* 665 /*
620 * Watermarks for order-0 must be met for compaction. Note the 2UL. 666 * Watermarks for order-0 must be met for compaction. Note the 2UL.
621 * This is because during migration, copies of pages need to be 667 * This is because during migration, copies of pages need to be
622 * allocated and for a short time, the footprint is higher 668 * allocated and for a short time, the footprint is higher
623 */ 669 */
624 watermark = low_wmark_pages(zone) + (2UL << order); 670 watermark = low_wmark_pages(zone) + (2UL << order);
625 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 671 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
626 return COMPACT_SKIPPED; 672 return COMPACT_SKIPPED;
627 673
628 /* 674 /*
629 * fragmentation index determines if allocation failures are due to 675 * fragmentation index determines if allocation failures are due to
630 * low memory or external fragmentation 676 * low memory or external fragmentation
631 * 677 *
632 * index of -1000 implies allocations might succeed depending on 678 * index of -1000 implies allocations might succeed depending on
633 * watermarks 679 * watermarks
634 * index towards 0 implies failure is due to lack of memory 680 * index towards 0 implies failure is due to lack of memory
635 * index towards 1000 implies failure is due to fragmentation 681 * index towards 1000 implies failure is due to fragmentation
636 * 682 *
637 * Only compact if a failure would be due to fragmentation. 683 * Only compact if a failure would be due to fragmentation.
638 */ 684 */
639 fragindex = fragmentation_index(zone, order); 685 fragindex = fragmentation_index(zone, order);
640 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) 686 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
641 return COMPACT_SKIPPED; 687 return COMPACT_SKIPPED;
642 688
643 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, 689 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
644 0, 0)) 690 0, 0))
645 return COMPACT_PARTIAL; 691 return COMPACT_PARTIAL;
646 692
647 return COMPACT_CONTINUE; 693 return COMPACT_CONTINUE;
648 } 694 }
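compaction_suitable() makes two cheap early decisions: the zone must have enough order-0 pages above the low watermark (plus a 2UL << order buffer for the migration copies), and the fragmentation index must point at fragmentation rather than plain lack of memory. A worked example with made-up watermark and index values (500 is the sysctl_extfrag_threshold default shown in this file):

/* Plain C sketch of the two early exits; example numbers only. */
#include <stdio.h>

int main(void)
{
        int order = 4;
        unsigned long low_wmark = 5000;      /* example value */
        unsigned long free_pages = 5100;     /* order-0 free pages in the zone */
        int fragindex = 420;                 /* example fragmentation index */
        int extfrag_threshold = 500;         /* default sysctl value */

        unsigned long watermark = low_wmark + (2UL << order);  /* 5000 + 32 */

        if (free_pages < watermark)
                printf("COMPACT_SKIPPED: too few free pages to migrate into\n");
        else if (fragindex >= 0 && fragindex <= extfrag_threshold)
                printf("COMPACT_SKIPPED: failure looks like low memory, not fragmentation\n");
        else
                printf("COMPACT_CONTINUE\n");
        return 0;
}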
649 695
650 static int compact_zone(struct zone *zone, struct compact_control *cc) 696 static int compact_zone(struct zone *zone, struct compact_control *cc)
651 { 697 {
652 int ret; 698 int ret;
653 699
654 ret = compaction_suitable(zone, cc->order); 700 ret = compaction_suitable(zone, cc->order);
655 switch (ret) { 701 switch (ret) {
656 case COMPACT_PARTIAL: 702 case COMPACT_PARTIAL:
657 case COMPACT_SKIPPED: 703 case COMPACT_SKIPPED:
658 /* Compaction is likely to fail */ 704 /* Compaction is likely to fail */
659 return ret; 705 return ret;
660 case COMPACT_CONTINUE: 706 case COMPACT_CONTINUE:
661 /* Fall through to compaction */ 707 /* Fall through to compaction */
662 ; 708 ;
663 } 709 }
664 710
665 /* Setup to move all movable pages to the end of the zone */ 711 /* Setup to move all movable pages to the end of the zone */
666 cc->migrate_pfn = zone->zone_start_pfn; 712 cc->migrate_pfn = zone->zone_start_pfn;
667 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 713
668 cc->free_pfn &= ~(pageblock_nr_pages-1); 714 if (cc->order > 0) {
715 /* Incremental compaction. Start where the last one stopped. */
716 cc->free_pfn = zone->compact_cached_free_pfn;
717 cc->start_free_pfn = cc->free_pfn;
718 } else {
719 /* Order == -1 starts at the end of the zone. */
720 cc->free_pfn = start_free_pfn(zone);
721 }
669 722
670 migrate_prep_local(); 723 migrate_prep_local();
671 724
672 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 725 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
673 unsigned long nr_migrate, nr_remaining; 726 unsigned long nr_migrate, nr_remaining;
674 int err; 727 int err;
675 728
676 switch (isolate_migratepages(zone, cc)) { 729 switch (isolate_migratepages(zone, cc)) {
677 case ISOLATE_ABORT: 730 case ISOLATE_ABORT:
678 ret = COMPACT_PARTIAL; 731 ret = COMPACT_PARTIAL;
679 goto out; 732 goto out;
680 case ISOLATE_NONE: 733 case ISOLATE_NONE:
681 continue; 734 continue;
682 case ISOLATE_SUCCESS: 735 case ISOLATE_SUCCESS:
683 ; 736 ;
684 } 737 }
685 738
686 nr_migrate = cc->nr_migratepages; 739 nr_migrate = cc->nr_migratepages;
687 err = migrate_pages(&cc->migratepages, compaction_alloc, 740 err = migrate_pages(&cc->migratepages, compaction_alloc,
688 (unsigned long)cc, false, 741 (unsigned long)cc, false,
689 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); 742 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
690 update_nr_listpages(cc); 743 update_nr_listpages(cc);
691 nr_remaining = cc->nr_migratepages; 744 nr_remaining = cc->nr_migratepages;
692 745
693 count_vm_event(COMPACTBLOCKS); 746 count_vm_event(COMPACTBLOCKS);
694 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); 747 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
695 if (nr_remaining) 748 if (nr_remaining)
696 count_vm_events(COMPACTPAGEFAILED, nr_remaining); 749 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
697 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 750 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
698 nr_remaining); 751 nr_remaining);
699 752
700 /* Release LRU pages not migrated */ 753 /* Release LRU pages not migrated */
701 if (err) { 754 if (err) {
702 putback_lru_pages(&cc->migratepages); 755 putback_lru_pages(&cc->migratepages);
703 cc->nr_migratepages = 0; 756 cc->nr_migratepages = 0;
704 if (err == -ENOMEM) { 757 if (err == -ENOMEM) {
705 ret = COMPACT_PARTIAL; 758 ret = COMPACT_PARTIAL;
706 goto out; 759 goto out;
707 } 760 }
708 } 761 }
709 } 762 }
710 763
711 out: 764 out:
712 /* Release free pages and check accounting */ 765 /* Release free pages and check accounting */
713 cc->nr_freepages -= release_freepages(&cc->freepages); 766 cc->nr_freepages -= release_freepages(&cc->freepages);
714 VM_BUG_ON(cc->nr_freepages != 0); 767 VM_BUG_ON(cc->nr_freepages != 0);
715 768
716 return ret; 769 return ret;
717 } 770 }
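The setup at the top of compact_zone() is where the incremental behaviour is wired in: an order > 0 run resumes the free scanner at zone->compact_cached_free_pfn and remembers that pfn in cc->start_free_pfn, while a full order == -1 run still starts from the end of the zone. A small sketch of how two back-to-back order > 0 runs share the cached pfn, so the second run does not rescan the already-compacted top of the zone (all numbers are illustrative):

/* Plain C sketch of consecutive incremental runs; example numbers only. */
#include <stdio.h>

int main(void)
{
        unsigned long zone_end_aligned = 0x20000;
        unsigned long cached_free_pfn = zone_end_aligned;  /* fresh zone */

        /* First order > 0 run: starts at the end, stops at 0x1c000. */
        unsigned long run1_start = cached_free_pfn;        /* 0x20000 */
        cached_free_pfn = 0x1c000;                         /* where it stopped */

        /* Second order > 0 run: resumes at the cached pfn, not the end. */
        unsigned long run2_start = cached_free_pfn;        /* 0x1c000 */

        printf("run1 start 0x%lx, run2 start 0x%lx\n", run1_start, run2_start);
        return 0;
}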
718 771
719 static unsigned long compact_zone_order(struct zone *zone, 772 static unsigned long compact_zone_order(struct zone *zone,
720 int order, gfp_t gfp_mask, 773 int order, gfp_t gfp_mask,
721 bool sync) 774 bool sync)
722 { 775 {
723 struct compact_control cc = { 776 struct compact_control cc = {
724 .nr_freepages = 0, 777 .nr_freepages = 0,
725 .nr_migratepages = 0, 778 .nr_migratepages = 0,
726 .order = order, 779 .order = order,
727 .migratetype = allocflags_to_migratetype(gfp_mask), 780 .migratetype = allocflags_to_migratetype(gfp_mask),
728 .zone = zone, 781 .zone = zone,
729 .sync = sync, 782 .sync = sync,
730 }; 783 };
731 INIT_LIST_HEAD(&cc.freepages); 784 INIT_LIST_HEAD(&cc.freepages);
732 INIT_LIST_HEAD(&cc.migratepages); 785 INIT_LIST_HEAD(&cc.migratepages);
733 786
734 return compact_zone(zone, &cc); 787 return compact_zone(zone, &cc);
735 } 788 }
736 789
737 int sysctl_extfrag_threshold = 500; 790 int sysctl_extfrag_threshold = 500;
738 791
739 /** 792 /**
740 * try_to_compact_pages - Direct compact to satisfy a high-order allocation 793 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
741 * @zonelist: The zonelist used for the current allocation 794 * @zonelist: The zonelist used for the current allocation
742 * @order: The order of the current allocation 795 * @order: The order of the current allocation
743 * @gfp_mask: The GFP mask of the current allocation 796 * @gfp_mask: The GFP mask of the current allocation
744 * @nodemask: The allowed nodes to allocate from 797 * @nodemask: The allowed nodes to allocate from
745 * @sync: Whether migration is synchronous or not 798 * @sync: Whether migration is synchronous or not
746 * 799 *
747 * This is the main entry point for direct page compaction. 800 * This is the main entry point for direct page compaction.
748 */ 801 */
749 unsigned long try_to_compact_pages(struct zonelist *zonelist, 802 unsigned long try_to_compact_pages(struct zonelist *zonelist,
750 int order, gfp_t gfp_mask, nodemask_t *nodemask, 803 int order, gfp_t gfp_mask, nodemask_t *nodemask,
751 bool sync) 804 bool sync)
752 { 805 {
753 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 806 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
754 int may_enter_fs = gfp_mask & __GFP_FS; 807 int may_enter_fs = gfp_mask & __GFP_FS;
755 int may_perform_io = gfp_mask & __GFP_IO; 808 int may_perform_io = gfp_mask & __GFP_IO;
756 struct zoneref *z; 809 struct zoneref *z;
757 struct zone *zone; 810 struct zone *zone;
758 int rc = COMPACT_SKIPPED; 811 int rc = COMPACT_SKIPPED;
759 812
760 /* 813 /*
761 * Check whether it is worth even starting compaction. The order check is 814 * Check whether it is worth even starting compaction. The order check is
762 * made because an assumption is made that the page allocator can satisfy 815 * made because an assumption is made that the page allocator can satisfy
763 * the "cheaper" orders without taking special steps 816 * the "cheaper" orders without taking special steps
764 */ 817 */
765 if (!order || !may_enter_fs || !may_perform_io) 818 if (!order || !may_enter_fs || !may_perform_io)
766 return rc; 819 return rc;
767 820
768 count_vm_event(COMPACTSTALL); 821 count_vm_event(COMPACTSTALL);
769 822
770 /* Compact each zone in the list */ 823 /* Compact each zone in the list */
771 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 824 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
772 nodemask) { 825 nodemask) {
773 int status; 826 int status;
774 827
775 status = compact_zone_order(zone, order, gfp_mask, sync); 828 status = compact_zone_order(zone, order, gfp_mask, sync);
776 rc = max(status, rc); 829 rc = max(status, rc);
777 830
778 /* If a normal allocation would succeed, stop compacting */ 831 /* If a normal allocation would succeed, stop compacting */
779 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) 832 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
780 break; 833 break;
781 } 834 }
782 835
783 return rc; 836 return rc;
784 } 837 }
785 838
786 839
787 /* Compact all zones within a node */ 840 /* Compact all zones within a node */
788 static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) 841 static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
789 { 842 {
790 int zoneid; 843 int zoneid;
791 struct zone *zone; 844 struct zone *zone;
792 845
793 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 846 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
794 847
795 zone = &pgdat->node_zones[zoneid]; 848 zone = &pgdat->node_zones[zoneid];
796 if (!populated_zone(zone)) 849 if (!populated_zone(zone))
797 continue; 850 continue;
798 851
799 cc->nr_freepages = 0; 852 cc->nr_freepages = 0;
800 cc->nr_migratepages = 0; 853 cc->nr_migratepages = 0;
801 cc->zone = zone; 854 cc->zone = zone;
802 INIT_LIST_HEAD(&cc->freepages); 855 INIT_LIST_HEAD(&cc->freepages);
803 INIT_LIST_HEAD(&cc->migratepages); 856 INIT_LIST_HEAD(&cc->migratepages);
804 857
805 if (cc->order == -1 || !compaction_deferred(zone, cc->order)) 858 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
806 compact_zone(zone, cc); 859 compact_zone(zone, cc);
807 860
808 if (cc->order > 0) { 861 if (cc->order > 0) {
809 int ok = zone_watermark_ok(zone, cc->order, 862 int ok = zone_watermark_ok(zone, cc->order,
810 low_wmark_pages(zone), 0, 0); 863 low_wmark_pages(zone), 0, 0);
811 if (ok && cc->order > zone->compact_order_failed) 864 if (ok && cc->order > zone->compact_order_failed)
812 zone->compact_order_failed = cc->order + 1; 865 zone->compact_order_failed = cc->order + 1;
813 /* Currently async compaction is never deferred. */ 866 /* Currently async compaction is never deferred. */
814 else if (!ok && cc->sync) 867 else if (!ok && cc->sync)
815 defer_compaction(zone, cc->order); 868 defer_compaction(zone, cc->order);
816 } 869 }
817 870
818 VM_BUG_ON(!list_empty(&cc->freepages)); 871 VM_BUG_ON(!list_empty(&cc->freepages));
819 VM_BUG_ON(!list_empty(&cc->migratepages)); 872 VM_BUG_ON(!list_empty(&cc->migratepages));
820 } 873 }
821 874
822 return 0; 875 return 0;
823 } 876 }
824 877
825 int compact_pgdat(pg_data_t *pgdat, int order) 878 int compact_pgdat(pg_data_t *pgdat, int order)
826 { 879 {
827 struct compact_control cc = { 880 struct compact_control cc = {
828 .order = order, 881 .order = order,
829 .sync = false, 882 .sync = false,
830 }; 883 };
831 884
832 return __compact_pgdat(pgdat, &cc); 885 return __compact_pgdat(pgdat, &cc);
833 } 886 }
834 887
835 static int compact_node(int nid) 888 static int compact_node(int nid)
836 { 889 {
837 struct compact_control cc = { 890 struct compact_control cc = {
838 .order = -1, 891 .order = -1,
839 .sync = true, 892 .sync = true,
840 }; 893 };
841 894
842 return __compact_pgdat(NODE_DATA(nid), &cc); 895 return __compact_pgdat(NODE_DATA(nid), &cc);
843 } 896 }
844 897
845 /* Compact all nodes in the system */ 898 /* Compact all nodes in the system */
846 static int compact_nodes(void) 899 static int compact_nodes(void)
847 { 900 {
848 int nid; 901 int nid;
849 902
850 /* Flush pending updates to the LRU lists */ 903 /* Flush pending updates to the LRU lists */
851 lru_add_drain_all(); 904 lru_add_drain_all();
852 905
853 for_each_online_node(nid) 906 for_each_online_node(nid)
854 compact_node(nid); 907 compact_node(nid);
855 908
856 return COMPACT_COMPLETE; 909 return COMPACT_COMPLETE;
857 } 910 }
858 911
859 /* The written value is actually unused, all memory is compacted */ 912 /* The written value is actually unused, all memory is compacted */
860 int sysctl_compact_memory; 913 int sysctl_compact_memory;
861 914
862 /* This is the entry point for compacting all nodes via /proc/sys/vm */ 915 /* This is the entry point for compacting all nodes via /proc/sys/vm */
863 int sysctl_compaction_handler(struct ctl_table *table, int write, 916 int sysctl_compaction_handler(struct ctl_table *table, int write,
864 void __user *buffer, size_t *length, loff_t *ppos) 917 void __user *buffer, size_t *length, loff_t *ppos)
865 { 918 {
866 if (write) 919 if (write)
867 return compact_nodes(); 920 return compact_nodes();
868 921
869 return 0; 922 return 0;
870 } 923 }
871 924
872 int sysctl_extfrag_handler(struct ctl_table *table, int write, 925 int sysctl_extfrag_handler(struct ctl_table *table, int write,
873 void __user *buffer, size_t *length, loff_t *ppos) 926 void __user *buffer, size_t *length, loff_t *ppos)
874 { 927 {
875 proc_dointvec_minmax(table, write, buffer, length, ppos); 928 proc_dointvec_minmax(table, write, buffer, length, ppos);
876 929
877 return 0; 930 return 0;
878 } 931 }
879 932
880 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) 933 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
881 ssize_t sysfs_compact_node(struct device *dev, 934 ssize_t sysfs_compact_node(struct device *dev,
882 struct device_attribute *attr, 935 struct device_attribute *attr,
883 const char *buf, size_t count) 936 const char *buf, size_t count)
884 { 937 {
885 int nid = dev->id; 938 int nid = dev->id;
886 939
887 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { 940 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
888 /* Flush pending updates to the LRU lists */ 941 /* Flush pending updates to the LRU lists */
889 lru_add_drain_all(); 942 lru_add_drain_all();
890 943
891 compact_node(nid); 944 compact_node(nid);
892 } 945 }
893 946
894 return count; 947 return count;
895 } 948 }
896 static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); 949 static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
897 950
898 int compaction_register_node(struct node *node) 951 int compaction_register_node(struct node *node)
899 { 952 {
900 return device_create_file(&node->dev, &dev_attr_compact); 953 return device_create_file(&node->dev, &dev_attr_compact);
901 } 954 }
902 955
903 void compaction_unregister_node(struct node *node) 956 void compaction_unregister_node(struct node *node)
904 { 957 {
905 return device_remove_file(&node->dev, &dev_attr_compact); 958 return device_remove_file(&node->dev, &dev_attr_compact);
906 } 959 }
907 #endif /* CONFIG_SYSFS && CONFIG_NUMA */ 960 #endif /* CONFIG_SYSFS && CONFIG_NUMA */
908 961
909 #endif /* CONFIG_COMPACTION */ 962 #endif /* CONFIG_COMPACTION */
910 963
1 /* internal.h: mm/ internal definitions 1 /* internal.h: mm/ internal definitions
2 * 2 *
3 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License 7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 #ifndef __MM_INTERNAL_H 11 #ifndef __MM_INTERNAL_H
12 #define __MM_INTERNAL_H 12 #define __MM_INTERNAL_H
13 13
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 15
16 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 16 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling); 17 unsigned long floor, unsigned long ceiling);
18 18
19 static inline void set_page_count(struct page *page, int v) 19 static inline void set_page_count(struct page *page, int v)
20 { 20 {
21 atomic_set(&page->_count, v); 21 atomic_set(&page->_count, v);
22 } 22 }
23 23
24 /* 24 /*
25 * Turn a non-refcounted page (->_count == 0) into refcounted with 25 * Turn a non-refcounted page (->_count == 0) into refcounted with
26 * a count of one. 26 * a count of one.
27 */ 27 */
28 static inline void set_page_refcounted(struct page *page) 28 static inline void set_page_refcounted(struct page *page)
29 { 29 {
30 VM_BUG_ON(PageTail(page)); 30 VM_BUG_ON(PageTail(page));
31 VM_BUG_ON(atomic_read(&page->_count)); 31 VM_BUG_ON(atomic_read(&page->_count));
32 set_page_count(page, 1); 32 set_page_count(page, 1);
33 } 33 }
34 34
35 static inline void __put_page(struct page *page) 35 static inline void __put_page(struct page *page)
36 { 36 {
37 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
38 } 38 }
39 39
40 static inline void __get_page_tail_foll(struct page *page, 40 static inline void __get_page_tail_foll(struct page *page,
41 bool get_page_head) 41 bool get_page_head)
42 { 42 {
43 /* 43 /*
44 * If we're getting a tail page, the elevated page->_count is 44 * If we're getting a tail page, the elevated page->_count is
45 * required only in the head page and we will elevate the head 45 * required only in the head page and we will elevate the head
46 * page->_count and tail page->_mapcount. 46 * page->_count and tail page->_mapcount.
47 * 47 *
48 * We elevate page_tail->_mapcount for tail pages to force 48 * We elevate page_tail->_mapcount for tail pages to force
49 * page_tail->_count to be zero at all times to avoid getting 49 * page_tail->_count to be zero at all times to avoid getting
50 * false positives from get_page_unless_zero() with 50 * false positives from get_page_unless_zero() with
51 * speculative page access (like in 51 * speculative page access (like in
52 * page_cache_get_speculative()) on tail pages. 52 * page_cache_get_speculative()) on tail pages.
53 */ 53 */
54 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); 54 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
55 VM_BUG_ON(atomic_read(&page->_count) != 0); 55 VM_BUG_ON(atomic_read(&page->_count) != 0);
56 VM_BUG_ON(page_mapcount(page) < 0); 56 VM_BUG_ON(page_mapcount(page) < 0);
57 if (get_page_head) 57 if (get_page_head)
58 atomic_inc(&page->first_page->_count); 58 atomic_inc(&page->first_page->_count);
59 atomic_inc(&page->_mapcount); 59 atomic_inc(&page->_mapcount);
60 } 60 }
61 61
62 /* 62 /*
63 * This is meant to be called as the FOLL_GET operation of 63 * This is meant to be called as the FOLL_GET operation of
64 * follow_page() and it must be called while holding the proper PT 64 * follow_page() and it must be called while holding the proper PT
65 * lock while the pte (or pmd_trans_huge) is still mapping the page. 65 * lock while the pte (or pmd_trans_huge) is still mapping the page.
66 */ 66 */
67 static inline void get_page_foll(struct page *page) 67 static inline void get_page_foll(struct page *page)
68 { 68 {
69 if (unlikely(PageTail(page))) 69 if (unlikely(PageTail(page)))
70 /* 70 /*
71 * This is safe only because 71 * This is safe only because
72 * __split_huge_page_refcount() can't run under 72 * __split_huge_page_refcount() can't run under
73 * get_page_foll() because we hold the proper PT lock. 73 * get_page_foll() because we hold the proper PT lock.
74 */ 74 */
75 __get_page_tail_foll(page, true); 75 __get_page_tail_foll(page, true);
76 else { 76 else {
77 /* 77 /*
78 * Getting a normal page or the head of a compound page 78 * Getting a normal page or the head of a compound page
79 * requires to already have an elevated page->_count. 79 * requires to already have an elevated page->_count.
80 */ 80 */
81 VM_BUG_ON(atomic_read(&page->_count) <= 0); 81 VM_BUG_ON(atomic_read(&page->_count) <= 0);
82 atomic_inc(&page->_count); 82 atomic_inc(&page->_count);
83 } 83 }
84 } 84 }
85 85
86 extern unsigned long highest_memmap_pfn; 86 extern unsigned long highest_memmap_pfn;
87 87
88 /* 88 /*
89 * in mm/vmscan.c: 89 * in mm/vmscan.c:
90 */ 90 */
91 extern int isolate_lru_page(struct page *page); 91 extern int isolate_lru_page(struct page *page);
92 extern void putback_lru_page(struct page *page); 92 extern void putback_lru_page(struct page *page);
93 93
94 /* 94 /*
95 * in mm/page_alloc.c 95 * in mm/page_alloc.c
96 */ 96 */
97 extern void __free_pages_bootmem(struct page *page, unsigned int order); 97 extern void __free_pages_bootmem(struct page *page, unsigned int order);
98 extern void prep_compound_page(struct page *page, unsigned long order); 98 extern void prep_compound_page(struct page *page, unsigned long order);
99 #ifdef CONFIG_MEMORY_FAILURE 99 #ifdef CONFIG_MEMORY_FAILURE
100 extern bool is_free_buddy_page(struct page *page); 100 extern bool is_free_buddy_page(struct page *page);
101 #endif 101 #endif
102 102
103 #if defined CONFIG_COMPACTION || defined CONFIG_CMA 103 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
104 104
105 /* 105 /*
106 * in mm/compaction.c 106 * in mm/compaction.c
107 */ 107 */
108 /* 108 /*
109 * compact_control is used to track pages being migrated and the free pages 109 * compact_control is used to track pages being migrated and the free pages
110 * they are being migrated to during memory compaction. The free_pfn starts 110 * they are being migrated to during memory compaction. The free_pfn starts
111 * at the end of a zone and migrate_pfn begins at the start. Movable pages 111 * at the end of a zone and migrate_pfn begins at the start. Movable pages
112 * are moved to the end of a zone during a compaction run and the run 112 * are moved to the end of a zone during a compaction run and the run
113 * completes when free_pfn <= migrate_pfn 113 * completes when free_pfn <= migrate_pfn
114 */ 114 */
115 struct compact_control { 115 struct compact_control {
116 struct list_head freepages; /* List of free pages to migrate to */ 116 struct list_head freepages; /* List of free pages to migrate to */
117 struct list_head migratepages; /* List of pages being migrated */ 117 struct list_head migratepages; /* List of pages being migrated */
118 unsigned long nr_freepages; /* Number of isolated free pages */ 118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
121 unsigned long migrate_pfn; /* isolate_migratepages search base */ 122 unsigned long migrate_pfn; /* isolate_migratepages search base */
122 bool sync; /* Synchronous migration */ 123 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are
125 incremental, once free_pfn
126 and migrate_pfn meet, we restart
127 from the top of the zone;
128 remember we wrapped around. */
123 129
124 int order; /* order a direct compactor needs */ 130 int order; /* order a direct compactor needs */
125 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 131 int migratetype; /* MOVABLE, RECLAIMABLE etc */
126 struct zone *zone; 132 struct zone *zone;
127 }; 133 };
128 134
129 unsigned long 135 unsigned long
130 isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); 136 isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn);
131 unsigned long 137 unsigned long
132 isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 138 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
133 unsigned long low_pfn, unsigned long end_pfn); 139 unsigned long low_pfn, unsigned long end_pfn);
134 140
135 #endif 141 #endif
136 142
137 /* 143 /*
138 * function for dealing with page's order in buddy system. 144 * function for dealing with page's order in buddy system.
139 * zone->lock is already acquired when we use these. 145 * zone->lock is already acquired when we use these.
140 * So, we don't need atomic page->flags operations here. 146 * So, we don't need atomic page->flags operations here.
141 */ 147 */
142 static inline unsigned long page_order(struct page *page) 148 static inline unsigned long page_order(struct page *page)
143 { 149 {
144 /* PageBuddy() must be checked by the caller */ 150 /* PageBuddy() must be checked by the caller */
145 return page_private(page); 151 return page_private(page);
146 } 152 }
147 153
148 /* mm/util.c */ 154 /* mm/util.c */
149 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, 155 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
150 struct vm_area_struct *prev, struct rb_node *rb_parent); 156 struct vm_area_struct *prev, struct rb_node *rb_parent);
151 157
152 #ifdef CONFIG_MMU 158 #ifdef CONFIG_MMU
153 extern long mlock_vma_pages_range(struct vm_area_struct *vma, 159 extern long mlock_vma_pages_range(struct vm_area_struct *vma,
154 unsigned long start, unsigned long end); 160 unsigned long start, unsigned long end);
155 extern void munlock_vma_pages_range(struct vm_area_struct *vma, 161 extern void munlock_vma_pages_range(struct vm_area_struct *vma,
156 unsigned long start, unsigned long end); 162 unsigned long start, unsigned long end);
157 static inline void munlock_vma_pages_all(struct vm_area_struct *vma) 163 static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
158 { 164 {
159 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); 165 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
160 } 166 }
161 167
162 /* 168 /*
163 * Called only in fault path via page_evictable() for a new page 169 * Called only in fault path via page_evictable() for a new page
164 * to determine if it's being mapped into a LOCKED vma. 170 * to determine if it's being mapped into a LOCKED vma.
165 * If so, mark page as mlocked. 171 * If so, mark page as mlocked.
166 */ 172 */
167 static inline int mlocked_vma_newpage(struct vm_area_struct *vma, 173 static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
168 struct page *page) 174 struct page *page)
169 { 175 {
170 VM_BUG_ON(PageLRU(page)); 176 VM_BUG_ON(PageLRU(page));
171 177
172 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) 178 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
173 return 0; 179 return 0;
174 180
175 if (!TestSetPageMlocked(page)) { 181 if (!TestSetPageMlocked(page)) {
176 inc_zone_page_state(page, NR_MLOCK); 182 inc_zone_page_state(page, NR_MLOCK);
177 count_vm_event(UNEVICTABLE_PGMLOCKED); 183 count_vm_event(UNEVICTABLE_PGMLOCKED);
178 } 184 }
179 return 1; 185 return 1;
180 } 186 }
181 187
182 /* 188 /*
183 * must be called with vma's mmap_sem held for read or write, and page locked. 189 * must be called with vma's mmap_sem held for read or write, and page locked.
184 */ 190 */
185 extern void mlock_vma_page(struct page *page); 191 extern void mlock_vma_page(struct page *page);
186 extern void munlock_vma_page(struct page *page); 192 extern void munlock_vma_page(struct page *page);
187 193
188 /* 194 /*
189 * Clear the page's PageMlocked(). This can be useful in a situation where 195 * Clear the page's PageMlocked(). This can be useful in a situation where
190 * we want to unconditionally remove a page from the pagecache -- e.g., 196 * we want to unconditionally remove a page from the pagecache -- e.g.,
191 * on truncation or freeing. 197 * on truncation or freeing.
192 * 198 *
193 * It is legal to call this function for any page, mlocked or not. 199 * It is legal to call this function for any page, mlocked or not.
194 * If called for a page that is still mapped by mlocked vmas, all we do 200 * If called for a page that is still mapped by mlocked vmas, all we do
195 * is revert to lazy LRU behaviour -- semantics are not broken. 201 * is revert to lazy LRU behaviour -- semantics are not broken.
196 */ 202 */
197 extern void __clear_page_mlock(struct page *page); 203 extern void __clear_page_mlock(struct page *page);
198 static inline void clear_page_mlock(struct page *page) 204 static inline void clear_page_mlock(struct page *page)
199 { 205 {
200 if (unlikely(TestClearPageMlocked(page))) 206 if (unlikely(TestClearPageMlocked(page)))
201 __clear_page_mlock(page); 207 __clear_page_mlock(page);
202 } 208 }
203 209
204 /* 210 /*
205 * mlock_migrate_page - called only from migrate_page_copy() to 211 * mlock_migrate_page - called only from migrate_page_copy() to
206 * migrate the Mlocked page flag; update statistics. 212 * migrate the Mlocked page flag; update statistics.
207 */ 213 */
208 static inline void mlock_migrate_page(struct page *newpage, struct page *page) 214 static inline void mlock_migrate_page(struct page *newpage, struct page *page)
209 { 215 {
210 if (TestClearPageMlocked(page)) { 216 if (TestClearPageMlocked(page)) {
211 unsigned long flags; 217 unsigned long flags;
212 218
213 local_irq_save(flags); 219 local_irq_save(flags);
214 __dec_zone_page_state(page, NR_MLOCK); 220 __dec_zone_page_state(page, NR_MLOCK);
215 SetPageMlocked(newpage); 221 SetPageMlocked(newpage);
216 __inc_zone_page_state(newpage, NR_MLOCK); 222 __inc_zone_page_state(newpage, NR_MLOCK);
217 local_irq_restore(flags); 223 local_irq_restore(flags);
218 } 224 }
219 } 225 }
220 226
221 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 227 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
222 extern unsigned long vma_address(struct page *page, 228 extern unsigned long vma_address(struct page *page,
223 struct vm_area_struct *vma); 229 struct vm_area_struct *vma);
224 #endif 230 #endif
225 #else /* !CONFIG_MMU */ 231 #else /* !CONFIG_MMU */
226 static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) 232 static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
227 { 233 {
228 return 0; 234 return 0;
229 } 235 }
230 static inline void clear_page_mlock(struct page *page) { } 236 static inline void clear_page_mlock(struct page *page) { }
231 static inline void mlock_vma_page(struct page *page) { } 237 static inline void mlock_vma_page(struct page *page) { }
232 static inline void mlock_migrate_page(struct page *new, struct page *old) { } 238 static inline void mlock_migrate_page(struct page *new, struct page *old) { }
233 239
234 #endif /* !CONFIG_MMU */ 240 #endif /* !CONFIG_MMU */
235 241
236 /* 242 /*
237 * Return the mem_map entry representing the 'offset' subpage within 243 * Return the mem_map entry representing the 'offset' subpage within
238 * the maximally aligned gigantic page 'base'. Handle any discontiguity 244 * the maximally aligned gigantic page 'base'. Handle any discontiguity
239 * in the mem_map at MAX_ORDER_NR_PAGES boundaries. 245 * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
240 */ 246 */
241 static inline struct page *mem_map_offset(struct page *base, int offset) 247 static inline struct page *mem_map_offset(struct page *base, int offset)
242 { 248 {
243 if (unlikely(offset >= MAX_ORDER_NR_PAGES)) 249 if (unlikely(offset >= MAX_ORDER_NR_PAGES))
244 return pfn_to_page(page_to_pfn(base) + offset); 250 return pfn_to_page(page_to_pfn(base) + offset);
245 return base + offset; 251 return base + offset;
246 } 252 }
247 253
248 /* 254 /*
249 * Iterator over all subpages within the maximally aligned gigantic 255 * Iterator over all subpages within the maximally aligned gigantic
250 * page 'base'. Handle any discontiguity in the mem_map. 256 * page 'base'. Handle any discontiguity in the mem_map.
251 */ 257 */
252 static inline struct page *mem_map_next(struct page *iter, 258 static inline struct page *mem_map_next(struct page *iter,
253 struct page *base, int offset) 259 struct page *base, int offset)
254 { 260 {
255 if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { 261 if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
256 unsigned long pfn = page_to_pfn(base) + offset; 262 unsigned long pfn = page_to_pfn(base) + offset;
257 if (!pfn_valid(pfn)) 263 if (!pfn_valid(pfn))
258 return NULL; 264 return NULL;
259 return pfn_to_page(pfn); 265 return pfn_to_page(pfn);
260 } 266 }
261 return iter + 1; 267 return iter + 1;
262 } 268 }
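mem_map_offset() and mem_map_next() exist because the mem_map backing a gigantic page may be discontiguous across MAX_ORDER_NR_PAGES boundaries, so plain pointer arithmetic on struct page is not always safe. A hedged sketch of the walking pattern a caller might use, relying only on the definitions above; the visit() callback is hypothetical:

/* Sketch of walking every subpage of a gigantic page with the helpers
 * above; visit() is a hypothetical per-subpage callback. */
static void walk_gigantic_page(struct page *base, unsigned long nr_pages,
                               void (*visit)(struct page *))
{
        struct page *p = base;
        unsigned long i;

        for (i = 0; i < nr_pages; i++) {
                visit(p);
                /* Recompute the pointer across MAX_ORDER_NR_PAGES boundaries. */
                p = mem_map_next(p, base, i + 1);
                if (!p)
                        break;  /* hit an invalid pfn in a sparse mem_map */
        }
}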
263 269
264 /* 270 /*
265 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, 271 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
266 * so all functions starting at paging_init should be marked __init 272 * so all functions starting at paging_init should be marked __init
267 * in those cases. SPARSEMEM, however, allows for memory hotplug, 273 * in those cases. SPARSEMEM, however, allows for memory hotplug,
268 * and alloc_bootmem_node is not used. 274 * and alloc_bootmem_node is not used.
269 */ 275 */
270 #ifdef CONFIG_SPARSEMEM 276 #ifdef CONFIG_SPARSEMEM
271 #define __paginginit __meminit 277 #define __paginginit __meminit
272 #else 278 #else
273 #define __paginginit __init 279 #define __paginginit __init
274 #endif 280 #endif
275 281
276 /* Memory initialisation debug and verification */ 282 /* Memory initialisation debug and verification */
277 enum mminit_level { 283 enum mminit_level {
278 MMINIT_WARNING, 284 MMINIT_WARNING,
279 MMINIT_VERIFY, 285 MMINIT_VERIFY,
280 MMINIT_TRACE 286 MMINIT_TRACE
281 }; 287 };
282 288
283 #ifdef CONFIG_DEBUG_MEMORY_INIT 289 #ifdef CONFIG_DEBUG_MEMORY_INIT
284 290
285 extern int mminit_loglevel; 291 extern int mminit_loglevel;
286 292
287 #define mminit_dprintk(level, prefix, fmt, arg...) \ 293 #define mminit_dprintk(level, prefix, fmt, arg...) \
288 do { \ 294 do { \
289 if (level < mminit_loglevel) { \ 295 if (level < mminit_loglevel) { \
290 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ 296 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
291 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ 297 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
292 } \ 298 } \
293 } while (0) 299 } while (0)
294 300
295 extern void mminit_verify_pageflags_layout(void); 301 extern void mminit_verify_pageflags_layout(void);
296 extern void mminit_verify_page_links(struct page *page, 302 extern void mminit_verify_page_links(struct page *page,
297 enum zone_type zone, unsigned long nid, unsigned long pfn); 303 enum zone_type zone, unsigned long nid, unsigned long pfn);
298 extern void mminit_verify_zonelist(void); 304 extern void mminit_verify_zonelist(void);
299 305
300 #else 306 #else
301 307
302 static inline void mminit_dprintk(enum mminit_level level, 308 static inline void mminit_dprintk(enum mminit_level level,
303 const char *prefix, const char *fmt, ...) 309 const char *prefix, const char *fmt, ...)
304 { 310 {
305 } 311 }
306 312
307 static inline void mminit_verify_pageflags_layout(void) 313 static inline void mminit_verify_pageflags_layout(void)
308 { 314 {
309 } 315 }
310 316
311 static inline void mminit_verify_page_links(struct page *page, 317 static inline void mminit_verify_page_links(struct page *page,
312 enum zone_type zone, unsigned long nid, unsigned long pfn) 318 enum zone_type zone, unsigned long nid, unsigned long pfn)
313 { 319 {
314 } 320 }
315 321
316 static inline void mminit_verify_zonelist(void) 322 static inline void mminit_verify_zonelist(void)
317 { 323 {
318 } 324 }
319 #endif /* CONFIG_DEBUG_MEMORY_INIT */ 325 #endif /* CONFIG_DEBUG_MEMORY_INIT */
320 326
321 /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ 327 /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
322 #if defined(CONFIG_SPARSEMEM) 328 #if defined(CONFIG_SPARSEMEM)
323 extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, 329 extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
324 unsigned long *end_pfn); 330 unsigned long *end_pfn);
325 #else 331 #else
326 static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, 332 static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
327 unsigned long *end_pfn) 333 unsigned long *end_pfn)
328 { 334 {
329 } 335 }
330 #endif /* CONFIG_SPARSEMEM */ 336 #endif /* CONFIG_SPARSEMEM */
331 337
332 #define ZONE_RECLAIM_NOSCAN -2 338 #define ZONE_RECLAIM_NOSCAN -2
333 #define ZONE_RECLAIM_FULL -1 339 #define ZONE_RECLAIM_FULL -1
334 #define ZONE_RECLAIM_SOME 0 340 #define ZONE_RECLAIM_SOME 0
335 #define ZONE_RECLAIM_SUCCESS 1 341 #define ZONE_RECLAIM_SUCCESS 1
336 #endif 342 #endif
337 343
338 extern int hwpoison_filter(struct page *p); 344 extern int hwpoison_filter(struct page *p);
339 345
340 extern u32 hwpoison_filter_dev_major; 346 extern u32 hwpoison_filter_dev_major;
341 extern u32 hwpoison_filter_dev_minor; 347 extern u32 hwpoison_filter_dev_minor;
342 extern u64 hwpoison_filter_flags_mask; 348 extern u64 hwpoison_filter_flags_mask;
343 extern u64 hwpoison_filter_flags_value; 349 extern u64 hwpoison_filter_flags_value;
344 extern u64 hwpoison_filter_memcg; 350 extern u64 hwpoison_filter_memcg;
345 extern u32 hwpoison_filter_enable; 351 extern u32 hwpoison_filter_enable;
346 352
347 extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, 353 extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
348 unsigned long, unsigned long, 354 unsigned long, unsigned long,
349 unsigned long, unsigned long); 355 unsigned long, unsigned long);
350 356
1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/memory.h> 54 #include <linux/memory.h>
55 #include <linux/compaction.h> 55 #include <linux/compaction.h>
56 #include <trace/events/kmem.h> 56 #include <trace/events/kmem.h>
57 #include <linux/ftrace_event.h> 57 #include <linux/ftrace_event.h>
58 #include <linux/memcontrol.h> 58 #include <linux/memcontrol.h>
59 #include <linux/prefetch.h> 59 #include <linux/prefetch.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 62
63 #include <asm/tlbflush.h> 63 #include <asm/tlbflush.h>
64 #include <asm/div64.h> 64 #include <asm/div64.h>
65 #include "internal.h" 65 #include "internal.h"
66 66
67 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 67 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
68 DEFINE_PER_CPU(int, numa_node); 68 DEFINE_PER_CPU(int, numa_node);
69 EXPORT_PER_CPU_SYMBOL(numa_node); 69 EXPORT_PER_CPU_SYMBOL(numa_node);
70 #endif 70 #endif
71 71
72 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 72 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
73 /* 73 /*
74 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 74 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
75 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 75 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
76 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 76 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
77 * defined in <linux/topology.h>. 77 * defined in <linux/topology.h>.
78 */ 78 */
79 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 79 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
80 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 80 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
81 #endif 81 #endif
82 82
83 /* 83 /*
84 * Array of node states. 84 * Array of node states.
85 */ 85 */
86 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 86 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
87 [N_POSSIBLE] = NODE_MASK_ALL, 87 [N_POSSIBLE] = NODE_MASK_ALL,
88 [N_ONLINE] = { { [0] = 1UL } }, 88 [N_ONLINE] = { { [0] = 1UL } },
89 #ifndef CONFIG_NUMA 89 #ifndef CONFIG_NUMA
90 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 90 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
91 #ifdef CONFIG_HIGHMEM 91 #ifdef CONFIG_HIGHMEM
92 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 92 [N_HIGH_MEMORY] = { { [0] = 1UL } },
93 #endif 93 #endif
94 [N_CPU] = { { [0] = 1UL } }, 94 [N_CPU] = { { [0] = 1UL } },
95 #endif /* NUMA */ 95 #endif /* NUMA */
96 }; 96 };
97 EXPORT_SYMBOL(node_states); 97 EXPORT_SYMBOL(node_states);
98 98
99 unsigned long totalram_pages __read_mostly; 99 unsigned long totalram_pages __read_mostly;
100 unsigned long totalreserve_pages __read_mostly; 100 unsigned long totalreserve_pages __read_mostly;
101 /* 101 /*
102 * When calculating the number of globally allowed dirty pages, there 102 * When calculating the number of globally allowed dirty pages, there
103 * is a certain number of per-zone reserves that should not be 103 * is a certain number of per-zone reserves that should not be
104 * considered dirtyable memory. This is the sum of those reserves 104 * considered dirtyable memory. This is the sum of those reserves
105 * over all existing zones that contribute dirtyable memory. 105 * over all existing zones that contribute dirtyable memory.
106 */ 106 */
107 unsigned long dirty_balance_reserve __read_mostly; 107 unsigned long dirty_balance_reserve __read_mostly;
108 108
109 int percpu_pagelist_fraction; 109 int percpu_pagelist_fraction;
110 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 110 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
111 111
112 #ifdef CONFIG_PM_SLEEP 112 #ifdef CONFIG_PM_SLEEP
113 /* 113 /*
114 * The following functions are used by the suspend/hibernate code to temporarily 114 * The following functions are used by the suspend/hibernate code to temporarily
115 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 115 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
116 * while devices are suspended. To avoid races with the suspend/hibernate code, 116 * while devices are suspended. To avoid races with the suspend/hibernate code,
117 * they should always be called with pm_mutex held (gfp_allowed_mask also should 117 * they should always be called with pm_mutex held (gfp_allowed_mask also should
118 * only be modified with pm_mutex held, unless the suspend/hibernate code is 118 * only be modified with pm_mutex held, unless the suspend/hibernate code is
119 * guaranteed not to run in parallel with that modification). 119 * guaranteed not to run in parallel with that modification).
120 */ 120 */
121 121
122 static gfp_t saved_gfp_mask; 122 static gfp_t saved_gfp_mask;
123 123
124 void pm_restore_gfp_mask(void) 124 void pm_restore_gfp_mask(void)
125 { 125 {
126 WARN_ON(!mutex_is_locked(&pm_mutex)); 126 WARN_ON(!mutex_is_locked(&pm_mutex));
127 if (saved_gfp_mask) { 127 if (saved_gfp_mask) {
128 gfp_allowed_mask = saved_gfp_mask; 128 gfp_allowed_mask = saved_gfp_mask;
129 saved_gfp_mask = 0; 129 saved_gfp_mask = 0;
130 } 130 }
131 } 131 }
132 132
133 void pm_restrict_gfp_mask(void) 133 void pm_restrict_gfp_mask(void)
134 { 134 {
135 WARN_ON(!mutex_is_locked(&pm_mutex)); 135 WARN_ON(!mutex_is_locked(&pm_mutex));
136 WARN_ON(saved_gfp_mask); 136 WARN_ON(saved_gfp_mask);
137 saved_gfp_mask = gfp_allowed_mask; 137 saved_gfp_mask = gfp_allowed_mask;
138 gfp_allowed_mask &= ~GFP_IOFS; 138 gfp_allowed_mask &= ~GFP_IOFS;
139 } 139 }
140 140
141 bool pm_suspended_storage(void) 141 bool pm_suspended_storage(void)
142 { 142 {
143 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 143 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
144 return false; 144 return false;
145 return true; 145 return true;
146 } 146 }
147 #endif /* CONFIG_PM_SLEEP */ 147 #endif /* CONFIG_PM_SLEEP */
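The three helpers above work by masking the I/O and filesystem bits out of gfp_allowed_mask while devices are suspended, and pm_suspended_storage() reports storage as unusable once either bit has been cleared. A minimal standalone sketch of that mask-and-test pattern, using invented FAKE_GFP_* values rather than the real GFP_IOFS bits from the gfp headers:

/* Standalone sketch, not kernel code: the mask-and-test pattern behind
 * pm_restrict_gfp_mask()/pm_suspended_storage(). FAKE_GFP_* values are
 * invented for illustration; the real bits live in <linux/gfp.h>. */
#include <stdbool.h>
#include <stdio.h>

#define FAKE_GFP_IO   0x1u
#define FAKE_GFP_FS   0x2u
#define FAKE_GFP_IOFS (FAKE_GFP_IO | FAKE_GFP_FS)

static unsigned int allowed = 0xffu;    /* stands in for gfp_allowed_mask */

static bool suspended_storage(void)
{
        /* storage is only usable while both IO and FS are still allowed */
        return (allowed & FAKE_GFP_IOFS) != FAKE_GFP_IOFS;
}

int main(void)
{
        printf("before restrict: storage suspended = %d\n", suspended_storage());
        allowed &= ~FAKE_GFP_IOFS;      /* what pm_restrict_gfp_mask() does */
        printf("after restrict:  storage suspended = %d\n", suspended_storage());
        return 0;
}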
148 148
149 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 149 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
150 int pageblock_order __read_mostly; 150 int pageblock_order __read_mostly;
151 #endif 151 #endif
152 152
153 static void __free_pages_ok(struct page *page, unsigned int order); 153 static void __free_pages_ok(struct page *page, unsigned int order);
154 154
155 /* 155 /*
156 * results with 256, 32 in the lowmem_reserve sysctl: 156 * results with 256, 32 in the lowmem_reserve sysctl:
157 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 157 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
158 * 1G machine -> (16M dma, 784M normal, 224M high) 158 * 1G machine -> (16M dma, 784M normal, 224M high)
159 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 159 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
160 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 160 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
161 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 161 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
162 * 162 *
163 * TBD: should special case ZONE_DMA32 machines here - in those we normally 163 * TBD: should special case ZONE_DMA32 machines here - in those we normally
164 * don't need any ZONE_NORMAL reservation 164 * don't need any ZONE_NORMAL reservation
165 */ 165 */
166 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 166 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
167 #ifdef CONFIG_ZONE_DMA 167 #ifdef CONFIG_ZONE_DMA
168 256, 168 256,
169 #endif 169 #endif
170 #ifdef CONFIG_ZONE_DMA32 170 #ifdef CONFIG_ZONE_DMA32
171 256, 171 256,
172 #endif 172 #endif
173 #ifdef CONFIG_HIGHMEM 173 #ifdef CONFIG_HIGHMEM
174 32, 174 32,
175 #endif 175 #endif
176 32, 176 32,
177 }; 177 };
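The ratios above are easiest to read together with the worked example in the comment: each lower zone holds back roughly the size of the higher zones an allocation could have used, divided by the lower zone's ratio, so a larger ratio means a smaller reserve. A standalone sketch of that arithmetic, using the illustrative 1G-machine split from the comment (784M normal, 224M highmem):

/* Standalone sketch, not kernel code: the lowmem_reserve arithmetic from the
 * comment above. Zone sizes are the illustrative 1G-machine numbers. */
#include <stdio.h>

int main(void)
{
        unsigned long normal_kb  = 784UL * 1024;        /* 784M ZONE_NORMAL */
        unsigned long highmem_kb = 224UL * 1024;        /* 224M ZONE_HIGHMEM */

        printf("DMA held back from NORMAL allocs:     ~%luK (784M/256)\n",
               normal_kb / 256);
        printf("NORMAL held back from HIGHMEM allocs: ~%luK (224M/32)\n",
               highmem_kb / 32);
        printf("DMA held back from HIGHMEM allocs:    ~%luK ((224M+784M)/256)\n",
               (highmem_kb + normal_kb) / 256);
        return 0;
}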
178 178
179 EXPORT_SYMBOL(totalram_pages); 179 EXPORT_SYMBOL(totalram_pages);
180 180
181 static char * const zone_names[MAX_NR_ZONES] = { 181 static char * const zone_names[MAX_NR_ZONES] = {
182 #ifdef CONFIG_ZONE_DMA 182 #ifdef CONFIG_ZONE_DMA
183 "DMA", 183 "DMA",
184 #endif 184 #endif
185 #ifdef CONFIG_ZONE_DMA32 185 #ifdef CONFIG_ZONE_DMA32
186 "DMA32", 186 "DMA32",
187 #endif 187 #endif
188 "Normal", 188 "Normal",
189 #ifdef CONFIG_HIGHMEM 189 #ifdef CONFIG_HIGHMEM
190 "HighMem", 190 "HighMem",
191 #endif 191 #endif
192 "Movable", 192 "Movable",
193 }; 193 };
194 194
195 int min_free_kbytes = 1024; 195 int min_free_kbytes = 1024;
196 196
197 static unsigned long __meminitdata nr_kernel_pages; 197 static unsigned long __meminitdata nr_kernel_pages;
198 static unsigned long __meminitdata nr_all_pages; 198 static unsigned long __meminitdata nr_all_pages;
199 static unsigned long __meminitdata dma_reserve; 199 static unsigned long __meminitdata dma_reserve;
200 200
201 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 201 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
202 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 202 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
203 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 203 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
204 static unsigned long __initdata required_kernelcore; 204 static unsigned long __initdata required_kernelcore;
205 static unsigned long __initdata required_movablecore; 205 static unsigned long __initdata required_movablecore;
206 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 206 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
207 207
208 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 208 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
209 int movable_zone; 209 int movable_zone;
210 EXPORT_SYMBOL(movable_zone); 210 EXPORT_SYMBOL(movable_zone);
211 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 211 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
212 212
213 #if MAX_NUMNODES > 1 213 #if MAX_NUMNODES > 1
214 int nr_node_ids __read_mostly = MAX_NUMNODES; 214 int nr_node_ids __read_mostly = MAX_NUMNODES;
215 int nr_online_nodes __read_mostly = 1; 215 int nr_online_nodes __read_mostly = 1;
216 EXPORT_SYMBOL(nr_node_ids); 216 EXPORT_SYMBOL(nr_node_ids);
217 EXPORT_SYMBOL(nr_online_nodes); 217 EXPORT_SYMBOL(nr_online_nodes);
218 #endif 218 #endif
219 219
220 int page_group_by_mobility_disabled __read_mostly; 220 int page_group_by_mobility_disabled __read_mostly;
221 221
222 static void set_pageblock_migratetype(struct page *page, int migratetype) 222 static void set_pageblock_migratetype(struct page *page, int migratetype)
223 { 223 {
224 224
225 if (unlikely(page_group_by_mobility_disabled)) 225 if (unlikely(page_group_by_mobility_disabled))
226 migratetype = MIGRATE_UNMOVABLE; 226 migratetype = MIGRATE_UNMOVABLE;
227 227
228 set_pageblock_flags_group(page, (unsigned long)migratetype, 228 set_pageblock_flags_group(page, (unsigned long)migratetype,
229 PB_migrate, PB_migrate_end); 229 PB_migrate, PB_migrate_end);
230 } 230 }
231 231
232 bool oom_killer_disabled __read_mostly; 232 bool oom_killer_disabled __read_mostly;
233 233
234 #ifdef CONFIG_DEBUG_VM 234 #ifdef CONFIG_DEBUG_VM
235 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 235 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
236 { 236 {
237 int ret = 0; 237 int ret = 0;
238 unsigned seq; 238 unsigned seq;
239 unsigned long pfn = page_to_pfn(page); 239 unsigned long pfn = page_to_pfn(page);
240 240
241 do { 241 do {
242 seq = zone_span_seqbegin(zone); 242 seq = zone_span_seqbegin(zone);
243 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 243 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
244 ret = 1; 244 ret = 1;
245 else if (pfn < zone->zone_start_pfn) 245 else if (pfn < zone->zone_start_pfn)
246 ret = 1; 246 ret = 1;
247 } while (zone_span_seqretry(zone, seq)); 247 } while (zone_span_seqretry(zone, seq));
248 248
249 return ret; 249 return ret;
250 } 250 }
251 251
252 static int page_is_consistent(struct zone *zone, struct page *page) 252 static int page_is_consistent(struct zone *zone, struct page *page)
253 { 253 {
254 if (!pfn_valid_within(page_to_pfn(page))) 254 if (!pfn_valid_within(page_to_pfn(page)))
255 return 0; 255 return 0;
256 if (zone != page_zone(page)) 256 if (zone != page_zone(page))
257 return 0; 257 return 0;
258 258
259 return 1; 259 return 1;
260 } 260 }
261 /* 261 /*
262 * Temporary debugging check for pages not lying within a given zone. 262 * Temporary debugging check for pages not lying within a given zone.
263 */ 263 */
264 static int bad_range(struct zone *zone, struct page *page) 264 static int bad_range(struct zone *zone, struct page *page)
265 { 265 {
266 if (page_outside_zone_boundaries(zone, page)) 266 if (page_outside_zone_boundaries(zone, page))
267 return 1; 267 return 1;
268 if (!page_is_consistent(zone, page)) 268 if (!page_is_consistent(zone, page))
269 return 1; 269 return 1;
270 270
271 return 0; 271 return 0;
272 } 272 }
273 #else 273 #else
274 static inline int bad_range(struct zone *zone, struct page *page) 274 static inline int bad_range(struct zone *zone, struct page *page)
275 { 275 {
276 return 0; 276 return 0;
277 } 277 }
278 #endif 278 #endif
279 279
280 static void bad_page(struct page *page) 280 static void bad_page(struct page *page)
281 { 281 {
282 static unsigned long resume; 282 static unsigned long resume;
283 static unsigned long nr_shown; 283 static unsigned long nr_shown;
284 static unsigned long nr_unshown; 284 static unsigned long nr_unshown;
285 285
286 /* Don't complain about poisoned pages */ 286 /* Don't complain about poisoned pages */
287 if (PageHWPoison(page)) { 287 if (PageHWPoison(page)) {
288 reset_page_mapcount(page); /* remove PageBuddy */ 288 reset_page_mapcount(page); /* remove PageBuddy */
289 return; 289 return;
290 } 290 }
291 291
292 /* 292 /*
293 * Allow a burst of 60 reports, then keep quiet for that minute; 293 * Allow a burst of 60 reports, then keep quiet for that minute;
294 * or allow a steady drip of one report per second. 294 * or allow a steady drip of one report per second.
295 */ 295 */
296 if (nr_shown == 60) { 296 if (nr_shown == 60) {
297 if (time_before(jiffies, resume)) { 297 if (time_before(jiffies, resume)) {
298 nr_unshown++; 298 nr_unshown++;
299 goto out; 299 goto out;
300 } 300 }
301 if (nr_unshown) { 301 if (nr_unshown) {
302 printk(KERN_ALERT 302 printk(KERN_ALERT
303 "BUG: Bad page state: %lu messages suppressed\n", 303 "BUG: Bad page state: %lu messages suppressed\n",
304 nr_unshown); 304 nr_unshown);
305 nr_unshown = 0; 305 nr_unshown = 0;
306 } 306 }
307 nr_shown = 0; 307 nr_shown = 0;
308 } 308 }
309 if (nr_shown++ == 0) 309 if (nr_shown++ == 0)
310 resume = jiffies + 60 * HZ; 310 resume = jiffies + 60 * HZ;
311 311
312 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 312 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
313 current->comm, page_to_pfn(page)); 313 current->comm, page_to_pfn(page));
314 dump_page(page); 314 dump_page(page);
315 315
316 print_modules(); 316 print_modules();
317 dump_stack(); 317 dump_stack();
318 out: 318 out:
319 /* Leave bad fields for debug, except PageBuddy could make trouble */ 319 /* Leave bad fields for debug, except PageBuddy could make trouble */
320 reset_page_mapcount(page); /* remove PageBuddy */ 320 reset_page_mapcount(page); /* remove PageBuddy */
321 add_taint(TAINT_BAD_PAGE); 321 add_taint(TAINT_BAD_PAGE);
322 } 322 }
323 323
324 /* 324 /*
325 * Higher-order pages are called "compound pages". They are structured thusly: 325 * Higher-order pages are called "compound pages". They are structured thusly:
326 * 326 *
327 * The first PAGE_SIZE page is called the "head page". 327 * The first PAGE_SIZE page is called the "head page".
328 * 328 *
329 * The remaining PAGE_SIZE pages are called "tail pages". 329 * The remaining PAGE_SIZE pages are called "tail pages".
330 * 330 *
331 * All pages have PG_compound set. All tail pages have their ->first_page 331 * All pages have PG_compound set. All tail pages have their ->first_page
332 * pointing at the head page. 332 * pointing at the head page.
333 * 333 *
334 * The first tail page's ->lru.next holds the address of the compound page's 334 * The first tail page's ->lru.next holds the address of the compound page's
335 * put_page() function. Its ->lru.prev holds the order of allocation. 335 * put_page() function. Its ->lru.prev holds the order of allocation.
336 * This usage means that zero-order pages may not be compound. 336 * This usage means that zero-order pages may not be compound.
337 */ 337 */
338 338
339 static void free_compound_page(struct page *page) 339 static void free_compound_page(struct page *page)
340 { 340 {
341 __free_pages_ok(page, compound_order(page)); 341 __free_pages_ok(page, compound_order(page));
342 } 342 }
343 343
344 void prep_compound_page(struct page *page, unsigned long order) 344 void prep_compound_page(struct page *page, unsigned long order)
345 { 345 {
346 int i; 346 int i;
347 int nr_pages = 1 << order; 347 int nr_pages = 1 << order;
348 348
349 set_compound_page_dtor(page, free_compound_page); 349 set_compound_page_dtor(page, free_compound_page);
350 set_compound_order(page, order); 350 set_compound_order(page, order);
351 __SetPageHead(page); 351 __SetPageHead(page);
352 for (i = 1; i < nr_pages; i++) { 352 for (i = 1; i < nr_pages; i++) {
353 struct page *p = page + i; 353 struct page *p = page + i;
354 __SetPageTail(p); 354 __SetPageTail(p);
355 set_page_count(p, 0); 355 set_page_count(p, 0);
356 p->first_page = page; 356 p->first_page = page;
357 } 357 }
358 } 358 }
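prep_compound_page() implements the layout described in the comment above it: the first page becomes the head, and every following page is marked as a tail with ->first_page pointed back at the head. A toy standalone sketch of that wiring (struct toy_page is invented here and only mimics the two details that matter for the example):

/* Standalone sketch, not kernel code: the head/tail wiring done by
 * prep_compound_page(), modelled with a toy struct instead of struct page. */
#include <stdio.h>

struct toy_page {
        int head;                    /* stands in for the head/tail page flags */
        struct toy_page *first_page; /* tail pages point back at the head */
};

int main(void)
{
        enum { ORDER = 2, NR = 1 << ORDER };
        struct toy_page pages[NR] = { 0 };
        int i;

        pages[0].head = 1;
        for (i = 1; i < NR; i++)
                pages[i].first_page = &pages[0];

        for (i = 1; i < NR; i++)
                printf("tail %d -> head at offset %ld\n", i,
                       (long)(pages[i].first_page - pages));
        return 0;
}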
359 359
360 /* update __split_huge_page_refcount if you change this function */ 360 /* update __split_huge_page_refcount if you change this function */
361 static int destroy_compound_page(struct page *page, unsigned long order) 361 static int destroy_compound_page(struct page *page, unsigned long order)
362 { 362 {
363 int i; 363 int i;
364 int nr_pages = 1 << order; 364 int nr_pages = 1 << order;
365 int bad = 0; 365 int bad = 0;
366 366
367 if (unlikely(compound_order(page) != order) || 367 if (unlikely(compound_order(page) != order) ||
368 unlikely(!PageHead(page))) { 368 unlikely(!PageHead(page))) {
369 bad_page(page); 369 bad_page(page);
370 bad++; 370 bad++;
371 } 371 }
372 372
373 __ClearPageHead(page); 373 __ClearPageHead(page);
374 374
375 for (i = 1; i < nr_pages; i++) { 375 for (i = 1; i < nr_pages; i++) {
376 struct page *p = page + i; 376 struct page *p = page + i;
377 377
378 if (unlikely(!PageTail(p) || (p->first_page != page))) { 378 if (unlikely(!PageTail(p) || (p->first_page != page))) {
379 bad_page(page); 379 bad_page(page);
380 bad++; 380 bad++;
381 } 381 }
382 __ClearPageTail(p); 382 __ClearPageTail(p);
383 } 383 }
384 384
385 return bad; 385 return bad;
386 } 386 }
387 387
388 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 388 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
389 { 389 {
390 int i; 390 int i;
391 391
392 /* 392 /*
393 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 393 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
394 * and __GFP_HIGHMEM from hard or soft interrupt context. 394 * and __GFP_HIGHMEM from hard or soft interrupt context.
395 */ 395 */
396 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 396 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
397 for (i = 0; i < (1 << order); i++) 397 for (i = 0; i < (1 << order); i++)
398 clear_highpage(page + i); 398 clear_highpage(page + i);
399 } 399 }
400 400
401 #ifdef CONFIG_DEBUG_PAGEALLOC 401 #ifdef CONFIG_DEBUG_PAGEALLOC
402 unsigned int _debug_guardpage_minorder; 402 unsigned int _debug_guardpage_minorder;
403 403
404 static int __init debug_guardpage_minorder_setup(char *buf) 404 static int __init debug_guardpage_minorder_setup(char *buf)
405 { 405 {
406 unsigned long res; 406 unsigned long res;
407 407
408 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 408 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
409 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 409 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
410 return 0; 410 return 0;
411 } 411 }
412 _debug_guardpage_minorder = res; 412 _debug_guardpage_minorder = res;
413 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 413 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
414 return 0; 414 return 0;
415 } 415 }
416 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 416 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
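debug_guardpage_minorder_setup() is registered as the handler for the debug_guardpage_minorder= boot parameter; it accepts a decimal value and rejects anything above MAX_ORDER / 2. A standalone sketch of the same parse-and-bound check, with strtoul standing in for kstrtoul and a hard-coded MAX_ORDER purely for illustration:

/* Standalone sketch, not kernel code: the parse-and-bound check performed by
 * the boot-parameter handler above. TOY_MAX_ORDER is hard-coded here only so
 * the example is self-contained. */
#include <stdio.h>
#include <stdlib.h>

#define TOY_MAX_ORDER 11

int main(void)
{
        const char *buf = "2";          /* e.g. booting with debug_guardpage_minorder=2 */
        char *end;
        unsigned long res = strtoul(buf, &end, 10);

        if (end == buf || res > TOY_MAX_ORDER / 2) {
                fprintf(stderr, "Bad debug_guardpage_minorder value\n");
                return 1;
        }
        printf("Setting debug_guardpage_minorder to %lu\n", res);
        return 0;
}

Guard pages only come into play under CONFIG_DEBUG_PAGEALLOC, as the surrounding #ifdef and the expand() path further down show.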
417 417
418 static inline void set_page_guard_flag(struct page *page) 418 static inline void set_page_guard_flag(struct page *page)
419 { 419 {
420 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 420 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
421 } 421 }
422 422
423 static inline void clear_page_guard_flag(struct page *page) 423 static inline void clear_page_guard_flag(struct page *page)
424 { 424 {
425 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 425 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
426 } 426 }
427 #else 427 #else
428 static inline void set_page_guard_flag(struct page *page) { } 428 static inline void set_page_guard_flag(struct page *page) { }
429 static inline void clear_page_guard_flag(struct page *page) { } 429 static inline void clear_page_guard_flag(struct page *page) { }
430 #endif 430 #endif
431 431
432 static inline void set_page_order(struct page *page, int order) 432 static inline void set_page_order(struct page *page, int order)
433 { 433 {
434 set_page_private(page, order); 434 set_page_private(page, order);
435 __SetPageBuddy(page); 435 __SetPageBuddy(page);
436 } 436 }
437 437
438 static inline void rmv_page_order(struct page *page) 438 static inline void rmv_page_order(struct page *page)
439 { 439 {
440 __ClearPageBuddy(page); 440 __ClearPageBuddy(page);
441 set_page_private(page, 0); 441 set_page_private(page, 0);
442 } 442 }
443 443
444 /* 444 /*
445 * Locate the struct page for both the matching buddy in our 445 * Locate the struct page for both the matching buddy in our
446 * pair (buddy1) and the combined O(n+1) page they form (page). 446 * pair (buddy1) and the combined O(n+1) page they form (page).
447 * 447 *
448 * 1) Any buddy B1 will have an order O twin B2 which satisfies 448 * 1) Any buddy B1 will have an order O twin B2 which satisfies
449 * the following equation: 449 * the following equation:
450 * B2 = B1 ^ (1 << O) 450 * B2 = B1 ^ (1 << O)
451 * For example, if the starting buddy (buddy2) is #8 its order 451 * For example, if the starting buddy (buddy2) is #8 its order
452 * 1 buddy is #10: 452 * 1 buddy is #10:
453 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 453 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
454 * 454 *
455 * 2) Any buddy B will have an order O+1 parent P which 455 * 2) Any buddy B will have an order O+1 parent P which
456 * satisfies the following equation: 456 * satisfies the following equation:
457 * P = B & ~(1 << O) 457 * P = B & ~(1 << O)
458 * 458 *
459 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 459 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
460 */ 460 */
461 static inline unsigned long 461 static inline unsigned long
462 __find_buddy_index(unsigned long page_idx, unsigned int order) 462 __find_buddy_index(unsigned long page_idx, unsigned int order)
463 { 463 {
464 return page_idx ^ (1 << order); 464 return page_idx ^ (1 << order);
465 } 465 }
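The comment above states the two identities everything else leans on: XOR with (1 << order) yields the buddy index, and clearing that bit yields the start of the combined block. A quick standalone check of the example values the comment gives (page #8 at order 1):

/* Standalone sketch, not kernel code: the two buddy identities from the
 * comment above, checked against its own example (page #8, order 1). */
#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned long b1 = 8, order = 1;
        unsigned long b2 = b1 ^ (1UL << order);         /* buddy:  8 ^ 2 = 10 */
        unsigned long parent = b1 & ~(1UL << order);    /* combined block starts at 8 */

        assert(b2 == 10);
        printf("buddy of #%lu at order %lu is #%lu, combined block starts at #%lu\n",
               b1, order, b2, parent);
        return 0;
}

This is why __find_buddy_index() is a single XOR, and why the merge loop below can compute combined_idx with a plain AND of the two indices.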
466 466
467 /* 467 /*
468 * This function checks whether a page is free && is the buddy 468 * This function checks whether a page is free && is the buddy
469 * we can coalesce a page and its buddy if 469 * we can coalesce a page and its buddy if
470 * (a) the buddy is not in a hole && 470 * (a) the buddy is not in a hole &&
471 * (b) the buddy is in the buddy system && 471 * (b) the buddy is in the buddy system &&
472 * (c) a page and its buddy have the same order && 472 * (c) a page and its buddy have the same order &&
473 * (d) a page and its buddy are in the same zone. 473 * (d) a page and its buddy are in the same zone.
474 * 474 *
475 * For recording whether a page is in the buddy system, we set ->_mapcount -2. 475 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
476 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. 476 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
477 * 477 *
478 * For recording page's order, we use page_private(page). 478 * For recording page's order, we use page_private(page).
479 */ 479 */
480 static inline int page_is_buddy(struct page *page, struct page *buddy, 480 static inline int page_is_buddy(struct page *page, struct page *buddy,
481 int order) 481 int order)
482 { 482 {
483 if (!pfn_valid_within(page_to_pfn(buddy))) 483 if (!pfn_valid_within(page_to_pfn(buddy)))
484 return 0; 484 return 0;
485 485
486 if (page_zone_id(page) != page_zone_id(buddy)) 486 if (page_zone_id(page) != page_zone_id(buddy))
487 return 0; 487 return 0;
488 488
489 if (page_is_guard(buddy) && page_order(buddy) == order) { 489 if (page_is_guard(buddy) && page_order(buddy) == order) {
490 VM_BUG_ON(page_count(buddy) != 0); 490 VM_BUG_ON(page_count(buddy) != 0);
491 return 1; 491 return 1;
492 } 492 }
493 493
494 if (PageBuddy(buddy) && page_order(buddy) == order) { 494 if (PageBuddy(buddy) && page_order(buddy) == order) {
495 VM_BUG_ON(page_count(buddy) != 0); 495 VM_BUG_ON(page_count(buddy) != 0);
496 return 1; 496 return 1;
497 } 497 }
498 return 0; 498 return 0;
499 } 499 }
500 500
501 /* 501 /*
502 * Freeing function for a buddy system allocator. 502 * Freeing function for a buddy system allocator.
503 * 503 *
504 * The concept of a buddy system is to maintain direct-mapped table 504 * The concept of a buddy system is to maintain direct-mapped table
505 * (containing bit values) for memory blocks of various "orders". 505 * (containing bit values) for memory blocks of various "orders".
506 * The bottom level table contains the map for the smallest allocatable 506 * The bottom level table contains the map for the smallest allocatable
507 * units of memory (here, pages), and each level above it describes 507 * units of memory (here, pages), and each level above it describes
508 * pairs of units from the levels below, hence, "buddies". 508 * pairs of units from the levels below, hence, "buddies".
509 * At a high level, all that happens here is marking the table entry 509 * At a high level, all that happens here is marking the table entry
510 * at the bottom level available, and propagating the changes upward 510 * at the bottom level available, and propagating the changes upward
511 * as necessary, plus some accounting needed to play nicely with other 511 * as necessary, plus some accounting needed to play nicely with other
512 * parts of the VM system. 512 * parts of the VM system.
513 * At each level, we keep a list of pages, which are heads of continuous 513 * At each level, we keep a list of pages, which are heads of continuous
514 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 514 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
515 * order is recorded in page_private(page) field. 515 * order is recorded in page_private(page) field.
516 * So when we are allocating or freeing one, we can derive the state of the 516 * So when we are allocating or freeing one, we can derive the state of the
517 * other. That is, if we allocate a small block, and both were 517 * other. That is, if we allocate a small block, and both were
518 * free, the remainder of the region must be split into blocks. 518 * free, the remainder of the region must be split into blocks.
519 * If a block is freed, and its buddy is also free, then this 519 * If a block is freed, and its buddy is also free, then this
520 * triggers coalescing into a block of larger size. 520 * triggers coalescing into a block of larger size.
521 * 521 *
522 * -- wli 522 * -- wli
523 */ 523 */
524 524
525 static inline void __free_one_page(struct page *page, 525 static inline void __free_one_page(struct page *page,
526 struct zone *zone, unsigned int order, 526 struct zone *zone, unsigned int order,
527 int migratetype) 527 int migratetype)
528 { 528 {
529 unsigned long page_idx; 529 unsigned long page_idx;
530 unsigned long combined_idx; 530 unsigned long combined_idx;
531 unsigned long uninitialized_var(buddy_idx); 531 unsigned long uninitialized_var(buddy_idx);
532 struct page *buddy; 532 struct page *buddy;
533 533
534 if (unlikely(PageCompound(page))) 534 if (unlikely(PageCompound(page)))
535 if (unlikely(destroy_compound_page(page, order))) 535 if (unlikely(destroy_compound_page(page, order)))
536 return; 536 return;
537 537
538 VM_BUG_ON(migratetype == -1); 538 VM_BUG_ON(migratetype == -1);
539 539
540 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 540 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
541 541
542 VM_BUG_ON(page_idx & ((1 << order) - 1)); 542 VM_BUG_ON(page_idx & ((1 << order) - 1));
543 VM_BUG_ON(bad_range(zone, page)); 543 VM_BUG_ON(bad_range(zone, page));
544 544
545 while (order < MAX_ORDER-1) { 545 while (order < MAX_ORDER-1) {
546 buddy_idx = __find_buddy_index(page_idx, order); 546 buddy_idx = __find_buddy_index(page_idx, order);
547 buddy = page + (buddy_idx - page_idx); 547 buddy = page + (buddy_idx - page_idx);
548 if (!page_is_buddy(page, buddy, order)) 548 if (!page_is_buddy(page, buddy, order))
549 break; 549 break;
550 /* 550 /*
551 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 551 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
552 * merge with it and move up one order. 552 * merge with it and move up one order.
553 */ 553 */
554 if (page_is_guard(buddy)) { 554 if (page_is_guard(buddy)) {
555 clear_page_guard_flag(buddy); 555 clear_page_guard_flag(buddy);
556 set_page_private(page, 0); 556 set_page_private(page, 0);
557 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 557 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
558 } else { 558 } else {
559 list_del(&buddy->lru); 559 list_del(&buddy->lru);
560 zone->free_area[order].nr_free--; 560 zone->free_area[order].nr_free--;
561 rmv_page_order(buddy); 561 rmv_page_order(buddy);
562 } 562 }
563 combined_idx = buddy_idx & page_idx; 563 combined_idx = buddy_idx & page_idx;
564 page = page + (combined_idx - page_idx); 564 page = page + (combined_idx - page_idx);
565 page_idx = combined_idx; 565 page_idx = combined_idx;
566 order++; 566 order++;
567 } 567 }
568 set_page_order(page, order); 568 set_page_order(page, order);
569 569
570 /* 570 /*
571 * If this is not the largest possible page, check if the buddy 571 * If this is not the largest possible page, check if the buddy
572 * of the next-highest order is free. If it is, it's possible 572 * of the next-highest order is free. If it is, it's possible
573 * that pages are being freed that will coalesce soon. In case, 573 * that pages are being freed that will coalesce soon. In case,
574 * that is happening, add the free page to the tail of the list 574 * that is happening, add the free page to the tail of the list
575 * so it's less likely to be used soon and more likely to be merged 575 * so it's less likely to be used soon and more likely to be merged
576 * as a higher order page 576 * as a higher order page
577 */ 577 */
578 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 578 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
579 struct page *higher_page, *higher_buddy; 579 struct page *higher_page, *higher_buddy;
580 combined_idx = buddy_idx & page_idx; 580 combined_idx = buddy_idx & page_idx;
581 higher_page = page + (combined_idx - page_idx); 581 higher_page = page + (combined_idx - page_idx);
582 buddy_idx = __find_buddy_index(combined_idx, order + 1); 582 buddy_idx = __find_buddy_index(combined_idx, order + 1);
583 higher_buddy = page + (buddy_idx - combined_idx); 583 higher_buddy = page + (buddy_idx - combined_idx);
584 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 584 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
585 list_add_tail(&page->lru, 585 list_add_tail(&page->lru,
586 &zone->free_area[order].free_list[migratetype]); 586 &zone->free_area[order].free_list[migratetype]);
587 goto out; 587 goto out;
588 } 588 }
589 } 589 }
590 590
591 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 591 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
592 out: 592 out:
593 zone->free_area[order].nr_free++; 593 zone->free_area[order].nr_free++;
594 } 594 }
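The merge loop in __free_one_page() keeps folding the freed block into its buddy and climbing one order at a time until page_is_buddy() says no or MAX_ORDER-1 is reached. A standalone sketch of just the index arithmetic, under the simplifying assumption that every buddy it meets happens to be free; the list bookkeeping, the guard-page case and the tail-placement heuristic are left out:

/* Standalone sketch, not kernel code: the index arithmetic of the merge loop
 * above, assuming every buddy encountered is free. List bookkeeping, the
 * guard-page case and the tail-placement heuristic are omitted. */
#include <stdio.h>

int main(void)
{
        unsigned long page_idx = 10, order = 0, max_order = 4;

        while (order < max_order - 1) {
                unsigned long buddy_idx = page_idx ^ (1UL << order);
                unsigned long combined  = buddy_idx & page_idx;

                printf("order %lu: merge #%lu with buddy #%lu -> head #%lu\n",
                       order, page_idx, buddy_idx, combined);
                page_idx = combined;
                order++;
        }
        return 0;
}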
595 595
596 /* 596 /*
597 * free_page_mlock() -- clean up attempts to free and mlocked() page. 597 * free_page_mlock() -- clean up attempts to free and mlocked() page.
598 * Page should not be on lru, so no need to fix that up. 598 * Page should not be on lru, so no need to fix that up.
599 * free_pages_check() will verify... 599 * free_pages_check() will verify...
600 */ 600 */
601 static inline void free_page_mlock(struct page *page) 601 static inline void free_page_mlock(struct page *page)
602 { 602 {
603 __dec_zone_page_state(page, NR_MLOCK); 603 __dec_zone_page_state(page, NR_MLOCK);
604 __count_vm_event(UNEVICTABLE_MLOCKFREED); 604 __count_vm_event(UNEVICTABLE_MLOCKFREED);
605 } 605 }
606 606
607 static inline int free_pages_check(struct page *page) 607 static inline int free_pages_check(struct page *page)
608 { 608 {
609 if (unlikely(page_mapcount(page) | 609 if (unlikely(page_mapcount(page) |
610 (page->mapping != NULL) | 610 (page->mapping != NULL) |
611 (atomic_read(&page->_count) != 0) | 611 (atomic_read(&page->_count) != 0) |
612 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 612 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
613 (mem_cgroup_bad_page_check(page)))) { 613 (mem_cgroup_bad_page_check(page)))) {
614 bad_page(page); 614 bad_page(page);
615 return 1; 615 return 1;
616 } 616 }
617 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 617 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
618 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 618 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
619 return 0; 619 return 0;
620 } 620 }
621 621
622 /* 622 /*
623 * Frees a number of pages from the PCP lists 623 * Frees a number of pages from the PCP lists
624 * Assumes all pages on list are in same zone, and of same order. 624 * Assumes all pages on list are in same zone, and of same order.
625 * count is the number of pages to free. 625 * count is the number of pages to free.
626 * 626 *
627 * If the zone was previously in an "all pages pinned" state then look to 627 * If the zone was previously in an "all pages pinned" state then look to
628 * see if this freeing clears that state. 628 * see if this freeing clears that state.
629 * 629 *
630 * And clear the zone's pages_scanned counter, to hold off the "all pages are 630 * And clear the zone's pages_scanned counter, to hold off the "all pages are
631 * pinned" detection logic. 631 * pinned" detection logic.
632 */ 632 */
633 static void free_pcppages_bulk(struct zone *zone, int count, 633 static void free_pcppages_bulk(struct zone *zone, int count,
634 struct per_cpu_pages *pcp) 634 struct per_cpu_pages *pcp)
635 { 635 {
636 int migratetype = 0; 636 int migratetype = 0;
637 int batch_free = 0; 637 int batch_free = 0;
638 int to_free = count; 638 int to_free = count;
639 639
640 spin_lock(&zone->lock); 640 spin_lock(&zone->lock);
641 zone->all_unreclaimable = 0; 641 zone->all_unreclaimable = 0;
642 zone->pages_scanned = 0; 642 zone->pages_scanned = 0;
643 643
644 while (to_free) { 644 while (to_free) {
645 struct page *page; 645 struct page *page;
646 struct list_head *list; 646 struct list_head *list;
647 647
648 /* 648 /*
649 * Remove pages from lists in a round-robin fashion. A 649 * Remove pages from lists in a round-robin fashion. A
650 * batch_free count is maintained that is incremented when an 650 * batch_free count is maintained that is incremented when an
651 * empty list is encountered. This is so more pages are freed 651 * empty list is encountered. This is so more pages are freed
652 * off fuller lists instead of spinning excessively around empty 652 * off fuller lists instead of spinning excessively around empty
653 * lists 653 * lists
654 */ 654 */
655 do { 655 do {
656 batch_free++; 656 batch_free++;
657 if (++migratetype == MIGRATE_PCPTYPES) 657 if (++migratetype == MIGRATE_PCPTYPES)
658 migratetype = 0; 658 migratetype = 0;
659 list = &pcp->lists[migratetype]; 659 list = &pcp->lists[migratetype];
660 } while (list_empty(list)); 660 } while (list_empty(list));
661 661
662 /* This is the only non-empty list. Free them all. */ 662 /* This is the only non-empty list. Free them all. */
663 if (batch_free == MIGRATE_PCPTYPES) 663 if (batch_free == MIGRATE_PCPTYPES)
664 batch_free = to_free; 664 batch_free = to_free;
665 665
666 do { 666 do {
667 page = list_entry(list->prev, struct page, lru); 667 page = list_entry(list->prev, struct page, lru);
668 /* must delete as __free_one_page list manipulates */ 668 /* must delete as __free_one_page list manipulates */
669 list_del(&page->lru); 669 list_del(&page->lru);
670 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 670 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
671 __free_one_page(page, zone, 0, page_private(page)); 671 __free_one_page(page, zone, 0, page_private(page));
672 trace_mm_page_pcpu_drain(page, 0, page_private(page)); 672 trace_mm_page_pcpu_drain(page, 0, page_private(page));
673 } while (--to_free && --batch_free && !list_empty(list)); 673 } while (--to_free && --batch_free && !list_empty(list));
674 } 674 }
675 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 675 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
676 spin_unlock(&zone->lock); 676 spin_unlock(&zone->lock);
677 } 677 }
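The round-robin in free_pcppages_bulk() is easiest to see with small numbers: every empty per-migratetype list the scan has to skip over bumps batch_free, so the next non-empty list gives up more pages in one go, and a list that turns out to be the only non-empty one absorbs the whole remaining count. A standalone sketch with the lists reduced to plain counters (the counts are made up):

/* Standalone sketch, not kernel code: the round-robin/batch_free scheme of
 * free_pcppages_bulk(), with the three pcp lists reduced to counters. */
#include <stdio.h>

#define MIGRATE_PCPTYPES 3

int main(void)
{
        int lists[MIGRATE_PCPTYPES] = { 5, 0, 2 };      /* made-up page counts */
        int to_free = 6, migratetype = 0;

        while (to_free) {
                int batch_free = 0;

                /* advance round-robin, counting the lists we had to touch */
                do {
                        batch_free++;
                        migratetype = (migratetype + 1) % MIGRATE_PCPTYPES;
                } while (lists[migratetype] == 0);

                /* only non-empty list left: let it absorb the rest */
                if (batch_free == MIGRATE_PCPTYPES)
                        batch_free = to_free;

                do {
                        lists[migratetype]--;
                        printf("freed one page from pcp list %d\n", migratetype);
                } while (--to_free && --batch_free && lists[migratetype]);
        }
        return 0;
}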
678 678
679 static void free_one_page(struct zone *zone, struct page *page, int order, 679 static void free_one_page(struct zone *zone, struct page *page, int order,
680 int migratetype) 680 int migratetype)
681 { 681 {
682 spin_lock(&zone->lock); 682 spin_lock(&zone->lock);
683 zone->all_unreclaimable = 0; 683 zone->all_unreclaimable = 0;
684 zone->pages_scanned = 0; 684 zone->pages_scanned = 0;
685 685
686 __free_one_page(page, zone, order, migratetype); 686 __free_one_page(page, zone, order, migratetype);
687 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 687 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
688 spin_unlock(&zone->lock); 688 spin_unlock(&zone->lock);
689 } 689 }
690 690
691 static bool free_pages_prepare(struct page *page, unsigned int order) 691 static bool free_pages_prepare(struct page *page, unsigned int order)
692 { 692 {
693 int i; 693 int i;
694 int bad = 0; 694 int bad = 0;
695 695
696 trace_mm_page_free(page, order); 696 trace_mm_page_free(page, order);
697 kmemcheck_free_shadow(page, order); 697 kmemcheck_free_shadow(page, order);
698 698
699 if (PageAnon(page)) 699 if (PageAnon(page))
700 page->mapping = NULL; 700 page->mapping = NULL;
701 for (i = 0; i < (1 << order); i++) 701 for (i = 0; i < (1 << order); i++)
702 bad += free_pages_check(page + i); 702 bad += free_pages_check(page + i);
703 if (bad) 703 if (bad)
704 return false; 704 return false;
705 705
706 if (!PageHighMem(page)) { 706 if (!PageHighMem(page)) {
707 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 707 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
708 debug_check_no_obj_freed(page_address(page), 708 debug_check_no_obj_freed(page_address(page),
709 PAGE_SIZE << order); 709 PAGE_SIZE << order);
710 } 710 }
711 arch_free_page(page, order); 711 arch_free_page(page, order);
712 kernel_map_pages(page, 1 << order, 0); 712 kernel_map_pages(page, 1 << order, 0);
713 713
714 return true; 714 return true;
715 } 715 }
716 716
717 static void __free_pages_ok(struct page *page, unsigned int order) 717 static void __free_pages_ok(struct page *page, unsigned int order)
718 { 718 {
719 unsigned long flags; 719 unsigned long flags;
720 int wasMlocked = __TestClearPageMlocked(page); 720 int wasMlocked = __TestClearPageMlocked(page);
721 721
722 if (!free_pages_prepare(page, order)) 722 if (!free_pages_prepare(page, order))
723 return; 723 return;
724 724
725 local_irq_save(flags); 725 local_irq_save(flags);
726 if (unlikely(wasMlocked)) 726 if (unlikely(wasMlocked))
727 free_page_mlock(page); 727 free_page_mlock(page);
728 __count_vm_events(PGFREE, 1 << order); 728 __count_vm_events(PGFREE, 1 << order);
729 free_one_page(page_zone(page), page, order, 729 free_one_page(page_zone(page), page, order,
730 get_pageblock_migratetype(page)); 730 get_pageblock_migratetype(page));
731 local_irq_restore(flags); 731 local_irq_restore(flags);
732 } 732 }
733 733
734 void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 734 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
735 { 735 {
736 unsigned int nr_pages = 1 << order; 736 unsigned int nr_pages = 1 << order;
737 unsigned int loop; 737 unsigned int loop;
738 738
739 prefetchw(page); 739 prefetchw(page);
740 for (loop = 0; loop < nr_pages; loop++) { 740 for (loop = 0; loop < nr_pages; loop++) {
741 struct page *p = &page[loop]; 741 struct page *p = &page[loop];
742 742
743 if (loop + 1 < nr_pages) 743 if (loop + 1 < nr_pages)
744 prefetchw(p + 1); 744 prefetchw(p + 1);
745 __ClearPageReserved(p); 745 __ClearPageReserved(p);
746 set_page_count(p, 0); 746 set_page_count(p, 0);
747 } 747 }
748 748
749 set_page_refcounted(page); 749 set_page_refcounted(page);
750 __free_pages(page, order); 750 __free_pages(page, order);
751 } 751 }
752 752
753 #ifdef CONFIG_CMA 753 #ifdef CONFIG_CMA
754 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 754 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
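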
755 void __init init_cma_reserved_pageblock(struct page *page) 755 void __init init_cma_reserved_pageblock(struct page *page)
756 { 756 {
757 unsigned i = pageblock_nr_pages; 757 unsigned i = pageblock_nr_pages;
758 struct page *p = page; 758 struct page *p = page;
759 759
760 do { 760 do {
761 __ClearPageReserved(p); 761 __ClearPageReserved(p);
762 set_page_count(p, 0); 762 set_page_count(p, 0);
763 } while (++p, --i); 763 } while (++p, --i);
764 764
765 set_page_refcounted(page); 765 set_page_refcounted(page);
766 set_pageblock_migratetype(page, MIGRATE_CMA); 766 set_pageblock_migratetype(page, MIGRATE_CMA);
767 __free_pages(page, pageblock_order); 767 __free_pages(page, pageblock_order);
768 totalram_pages += pageblock_nr_pages; 768 totalram_pages += pageblock_nr_pages;
769 } 769 }
770 #endif 770 #endif
771 771
772 /* 772 /*
773 * The order of subdivision here is critical for the IO subsystem. 773 * The order of subdivision here is critical for the IO subsystem.
774 * Please do not alter this order without good reasons and regression 774 * Please do not alter this order without good reasons and regression
775 * testing. Specifically, as large blocks of memory are subdivided, 775 * testing. Specifically, as large blocks of memory are subdivided,
776 * the order in which smaller blocks are delivered depends on the order 776 * the order in which smaller blocks are delivered depends on the order
777 * they're subdivided in this function. This is the primary factor 777 * they're subdivided in this function. This is the primary factor
778 * influencing the order in which pages are delivered to the IO 778 * influencing the order in which pages are delivered to the IO
779 * subsystem according to empirical testing, and this is also justified 779 * subsystem according to empirical testing, and this is also justified
780 * by considering the behavior of a buddy system containing a single 780 * by considering the behavior of a buddy system containing a single
781 * large block of memory acted on by a series of small allocations. 781 * large block of memory acted on by a series of small allocations.
782 * This behavior is a critical factor in sglist merging's success. 782 * This behavior is a critical factor in sglist merging's success.
783 * 783 *
784 * -- wli 784 * -- wli
785 */ 785 */
786 static inline void expand(struct zone *zone, struct page *page, 786 static inline void expand(struct zone *zone, struct page *page,
787 int low, int high, struct free_area *area, 787 int low, int high, struct free_area *area,
788 int migratetype) 788 int migratetype)
789 { 789 {
790 unsigned long size = 1 << high; 790 unsigned long size = 1 << high;
791 791
792 while (high > low) { 792 while (high > low) {
793 area--; 793 area--;
794 high--; 794 high--;
795 size >>= 1; 795 size >>= 1;
796 VM_BUG_ON(bad_range(zone, &page[size])); 796 VM_BUG_ON(bad_range(zone, &page[size]));
797 797
798 #ifdef CONFIG_DEBUG_PAGEALLOC 798 #ifdef CONFIG_DEBUG_PAGEALLOC
799 if (high < debug_guardpage_minorder()) { 799 if (high < debug_guardpage_minorder()) {
800 /* 800 /*
801 * Mark as guard pages (or page), that will allow to 801 * Mark as guard pages (or page), that will allow to
802 * merge back to allocator when buddy will be freed. 802 * merge back to allocator when buddy will be freed.
803 * Corresponding page table entries will not be touched, 803 * Corresponding page table entries will not be touched,
804 * pages will stay not present in virtual address space 804 * pages will stay not present in virtual address space
805 */ 805 */
806 INIT_LIST_HEAD(&page[size].lru); 806 INIT_LIST_HEAD(&page[size].lru);
807 set_page_guard_flag(&page[size]); 807 set_page_guard_flag(&page[size]);
808 set_page_private(&page[size], high); 808 set_page_private(&page[size], high);
809 /* Guard pages are not available for any usage */ 809 /* Guard pages are not available for any usage */
810 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); 810 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
811 continue; 811 continue;
812 } 812 }
813 #endif 813 #endif
814 list_add(&page[size].lru, &area->free_list[migratetype]); 814 list_add(&page[size].lru, &area->free_list[migratetype]);
815 area->nr_free++; 815 area->nr_free++;
816 set_page_order(&page[size], high); 816 set_page_order(&page[size], high);
817 } 817 }
818 } 818 }
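expand() hands the caller the low half of the block and returns each successively smaller upper half to the free list one order down, which is exactly the subdivision order the comment above cares about. A standalone sketch for an order-3 block serving an order-0 request (offsets are relative to the start of the block):

/* Standalone sketch, not kernel code: the halving done by expand() for an
 * order-3 block satisfying an order-0 request. Each pass returns the upper
 * half of the remaining block to the free list one order lower. */
#include <stdio.h>

int main(void)
{
        int low = 0, high = 3;
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                printf("put back pages [%lu..%lu) as an order-%d block\n",
                       size, 2 * size, high);
        }
        printf("pages [0..%lu) are handed to the caller\n", 1UL << low);
        return 0;
}

Under CONFIG_DEBUG_PAGEALLOC the same halves can instead be turned into guard pages, as the #ifdef branch inside expand() shows.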
819 819
820 /* 820 /*
821 * This page is about to be returned from the page allocator 821 * This page is about to be returned from the page allocator
822 */ 822 */
823 static inline int check_new_page(struct page *page) 823 static inline int check_new_page(struct page *page)
824 { 824 {
825 if (unlikely(page_mapcount(page) | 825 if (unlikely(page_mapcount(page) |
826 (page->mapping != NULL) | 826 (page->mapping != NULL) |
827 (atomic_read(&page->_count) != 0) | 827 (atomic_read(&page->_count) != 0) |
828 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 828 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
829 (mem_cgroup_bad_page_check(page)))) { 829 (mem_cgroup_bad_page_check(page)))) {
830 bad_page(page); 830 bad_page(page);
831 return 1; 831 return 1;
832 } 832 }
833 return 0; 833 return 0;
834 } 834 }
835 835
836 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 836 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
837 { 837 {
838 int i; 838 int i;
839 839
840 for (i = 0; i < (1 << order); i++) { 840 for (i = 0; i < (1 << order); i++) {
841 struct page *p = page + i; 841 struct page *p = page + i;
842 if (unlikely(check_new_page(p))) 842 if (unlikely(check_new_page(p)))
843 return 1; 843 return 1;
844 } 844 }
845 845
846 set_page_private(page, 0); 846 set_page_private(page, 0);
847 set_page_refcounted(page); 847 set_page_refcounted(page);
848 848
849 arch_alloc_page(page, order); 849 arch_alloc_page(page, order);
850 kernel_map_pages(page, 1 << order, 1); 850 kernel_map_pages(page, 1 << order, 1);
851 851
852 if (gfp_flags & __GFP_ZERO) 852 if (gfp_flags & __GFP_ZERO)
853 prep_zero_page(page, order, gfp_flags); 853 prep_zero_page(page, order, gfp_flags);
854 854
855 if (order && (gfp_flags & __GFP_COMP)) 855 if (order && (gfp_flags & __GFP_COMP))
856 prep_compound_page(page, order); 856 prep_compound_page(page, order);
857 857
858 return 0; 858 return 0;
859 } 859 }
860 860
861 /* 861 /*
862 * Go through the free lists for the given migratetype and remove 862 * Go through the free lists for the given migratetype and remove
863 * the smallest available page from the freelists 863 * the smallest available page from the freelists
864 */ 864 */
865 static inline 865 static inline
866 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 866 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
867 int migratetype) 867 int migratetype)
868 { 868 {
869 unsigned int current_order; 869 unsigned int current_order;
870 struct free_area * area; 870 struct free_area * area;
871 struct page *page; 871 struct page *page;
872 872
873 /* Find a page of the appropriate size in the preferred list */ 873 /* Find a page of the appropriate size in the preferred list */
874 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 874 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
875 area = &(zone->free_area[current_order]); 875 area = &(zone->free_area[current_order]);
876 if (list_empty(&area->free_list[migratetype])) 876 if (list_empty(&area->free_list[migratetype]))
877 continue; 877 continue;
878 878
879 page = list_entry(area->free_list[migratetype].next, 879 page = list_entry(area->free_list[migratetype].next,
880 struct page, lru); 880 struct page, lru);
881 list_del(&page->lru); 881 list_del(&page->lru);
882 rmv_page_order(page); 882 rmv_page_order(page);
883 area->nr_free--; 883 area->nr_free--;
884 expand(zone, page, order, current_order, area, migratetype); 884 expand(zone, page, order, current_order, area, migratetype);
885 return page; 885 return page;
886 } 886 }
887 887
888 return NULL; 888 return NULL;
889 } 889 }
890 890
891 891
892 /* 892 /*
893 * This array describes the order lists are fallen back to when 893 * This array describes the order lists are fallen back to when
894 * the free lists for the desirable migrate type are depleted 894 * the free lists for the desirable migrate type are depleted
895 */ 895 */
896 static int fallbacks[MIGRATE_TYPES][4] = { 896 static int fallbacks[MIGRATE_TYPES][4] = {
897 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 897 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
898 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 898 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
899 #ifdef CONFIG_CMA 899 #ifdef CONFIG_CMA
900 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 900 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
901 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 901 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
902 #else 902 #else
903 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 903 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
904 #endif 904 #endif
905 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 905 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
906 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 906 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
907 }; 907 };
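Each row above lists, in order, the migrate types __rmqueue_fallback() below is allowed to steal from when the requested type is exhausted, with MIGRATE_RESERVE acting as the stop marker. A standalone sketch of how one row is walked (migrate types reduced to strings, non-CMA configuration assumed):

/* Standalone sketch, not kernel code: walking one fallback row the way
 * __rmqueue_fallback() does, stopping at the RESERVE sentinel. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *unmovable_fallbacks[] = {
                "RECLAIMABLE", "MOVABLE", "RESERVE"
        };
        int i;

        for (i = 0; ; i++) {
                const char *mt = unmovable_fallbacks[i];

                if (!strcmp(mt, "RESERVE"))
                        break;          /* handled later if necessary */
                printf("UNMOVABLE falls back to %s\n", mt);
        }
        return 0;
}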
908 908
909 /* 909 /*
910 * Move the free pages in a range to the free lists of the requested type. 910 * Move the free pages in a range to the free lists of the requested type.
911 * Note that start_page and end_pages are not aligned on a pageblock 911 * Note that start_page and end_pages are not aligned on a pageblock
912 * boundary. If alignment is required, use move_freepages_block() 912 * boundary. If alignment is required, use move_freepages_block()
913 */ 913 */
914 static int move_freepages(struct zone *zone, 914 static int move_freepages(struct zone *zone,
915 struct page *start_page, struct page *end_page, 915 struct page *start_page, struct page *end_page,
916 int migratetype) 916 int migratetype)
917 { 917 {
918 struct page *page; 918 struct page *page;
919 unsigned long order; 919 unsigned long order;
920 int pages_moved = 0; 920 int pages_moved = 0;
921 921
922 #ifndef CONFIG_HOLES_IN_ZONE 922 #ifndef CONFIG_HOLES_IN_ZONE
923 /* 923 /*
924 * page_zone is not safe to call in this context when 924 * page_zone is not safe to call in this context when
925 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 925 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
926 * anyway as we check zone boundaries in move_freepages_block(). 926 * anyway as we check zone boundaries in move_freepages_block().
927 * Remove at a later date when no bug reports exist related to 927 * Remove at a later date when no bug reports exist related to
928 * grouping pages by mobility 928 * grouping pages by mobility
929 */ 929 */
930 BUG_ON(page_zone(start_page) != page_zone(end_page)); 930 BUG_ON(page_zone(start_page) != page_zone(end_page));
931 #endif 931 #endif
932 932
933 for (page = start_page; page <= end_page;) { 933 for (page = start_page; page <= end_page;) {
934 /* Make sure we are not inadvertently changing nodes */ 934 /* Make sure we are not inadvertently changing nodes */
935 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 935 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
936 936
937 if (!pfn_valid_within(page_to_pfn(page))) { 937 if (!pfn_valid_within(page_to_pfn(page))) {
938 page++; 938 page++;
939 continue; 939 continue;
940 } 940 }
941 941
942 if (!PageBuddy(page)) { 942 if (!PageBuddy(page)) {
943 page++; 943 page++;
944 continue; 944 continue;
945 } 945 }
946 946
947 order = page_order(page); 947 order = page_order(page);
948 list_move(&page->lru, 948 list_move(&page->lru,
949 &zone->free_area[order].free_list[migratetype]); 949 &zone->free_area[order].free_list[migratetype]);
950 page += 1 << order; 950 page += 1 << order;
951 pages_moved += 1 << order; 951 pages_moved += 1 << order;
952 } 952 }
953 953
954 return pages_moved; 954 return pages_moved;
955 } 955 }
956 956
957 static int move_freepages_block(struct zone *zone, struct page *page, 957 static int move_freepages_block(struct zone *zone, struct page *page,
958 int migratetype) 958 int migratetype)
959 { 959 {
960 unsigned long start_pfn, end_pfn; 960 unsigned long start_pfn, end_pfn;
961 struct page *start_page, *end_page; 961 struct page *start_page, *end_page;
962 962
963 start_pfn = page_to_pfn(page); 963 start_pfn = page_to_pfn(page);
964 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 964 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
965 start_page = pfn_to_page(start_pfn); 965 start_page = pfn_to_page(start_pfn);
966 end_page = start_page + pageblock_nr_pages - 1; 966 end_page = start_page + pageblock_nr_pages - 1;
967 end_pfn = start_pfn + pageblock_nr_pages - 1; 967 end_pfn = start_pfn + pageblock_nr_pages - 1;
968 968
969 /* Do not cross zone boundaries */ 969 /* Do not cross zone boundaries */
970 if (start_pfn < zone->zone_start_pfn) 970 if (start_pfn < zone->zone_start_pfn)
971 start_page = page; 971 start_page = page;
972 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 972 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
973 return 0; 973 return 0;
974 974
975 return move_freepages(zone, start_page, end_page, migratetype); 975 return move_freepages(zone, start_page, end_page, migratetype);
976 } 976 }
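move_freepages_block() rounds the page's pfn down to a pageblock boundary with a mask, derives the block's last pfn from that, and clamps the start or bails out entirely when the block straddles the zone edge. A standalone sketch of the rounding, with pageblock_nr_pages assumed to be 512 purely for illustration:

/* Standalone sketch, not kernel code: the pfn rounding move_freepages_block()
 * performs. pageblock_nr is assumed to be 512 here; in the kernel it is
 * 1 << pageblock_order. */
#include <stdio.h>

int main(void)
{
        unsigned long pageblock_nr = 512;       /* illustrative value */
        unsigned long pfn = 1300;

        unsigned long start_pfn = pfn & ~(pageblock_nr - 1);
        unsigned long end_pfn = start_pfn + pageblock_nr - 1;

        printf("pfn %lu lives in pageblock [%lu..%lu]\n", pfn, start_pfn, end_pfn);
        return 0;
}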
977 977
978 static void change_pageblock_range(struct page *pageblock_page, 978 static void change_pageblock_range(struct page *pageblock_page,
979 int start_order, int migratetype) 979 int start_order, int migratetype)
980 { 980 {
981 int nr_pageblocks = 1 << (start_order - pageblock_order); 981 int nr_pageblocks = 1 << (start_order - pageblock_order);
982 982
983 while (nr_pageblocks--) { 983 while (nr_pageblocks--) {
984 set_pageblock_migratetype(pageblock_page, migratetype); 984 set_pageblock_migratetype(pageblock_page, migratetype);
985 pageblock_page += pageblock_nr_pages; 985 pageblock_page += pageblock_nr_pages;
986 } 986 }
987 } 987 }
988 988
989 /* Remove an element from the buddy allocator from the fallback list */ 989 /* Remove an element from the buddy allocator from the fallback list */
990 static inline struct page * 990 static inline struct page *
991 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 991 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
992 { 992 {
993 struct free_area * area; 993 struct free_area * area;
994 int current_order; 994 int current_order;
995 struct page *page; 995 struct page *page;
996 int migratetype, i; 996 int migratetype, i;
997 997
998 /* Find the largest possible block of pages in the other list */ 998 /* Find the largest possible block of pages in the other list */
999 for (current_order = MAX_ORDER-1; current_order >= order; 999 for (current_order = MAX_ORDER-1; current_order >= order;
1000 --current_order) { 1000 --current_order) {
1001 for (i = 0;; i++) { 1001 for (i = 0;; i++) {
1002 migratetype = fallbacks[start_migratetype][i]; 1002 migratetype = fallbacks[start_migratetype][i];
1003 1003
1004 /* MIGRATE_RESERVE handled later if necessary */ 1004 /* MIGRATE_RESERVE handled later if necessary */
1005 if (migratetype == MIGRATE_RESERVE) 1005 if (migratetype == MIGRATE_RESERVE)
1006 break; 1006 break;
1007 1007
1008 area = &(zone->free_area[current_order]); 1008 area = &(zone->free_area[current_order]);
1009 if (list_empty(&area->free_list[migratetype])) 1009 if (list_empty(&area->free_list[migratetype]))
1010 continue; 1010 continue;
1011 1011
1012 page = list_entry(area->free_list[migratetype].next, 1012 page = list_entry(area->free_list[migratetype].next,
1013 struct page, lru); 1013 struct page, lru);
1014 area->nr_free--; 1014 area->nr_free--;
1015 1015
1016 /* 1016 /*
1017 * If breaking a large block of pages, move all free 1017 * If breaking a large block of pages, move all free
1018 * pages to the preferred allocation list. If falling 1018 * pages to the preferred allocation list. If falling
1019 * back for a reclaimable kernel allocation, be more 1019 * back for a reclaimable kernel allocation, be more
1020 * aggressive about taking ownership of free pages 1020 * aggressive about taking ownership of free pages
1021 * 1021 *
1022 * On the other hand, never change migration 1022 * On the other hand, never change migration
1023 * type of MIGRATE_CMA pageblocks nor move CMA 1023 * type of MIGRATE_CMA pageblocks nor move CMA
1024 * pages on different free lists. We don't 1024 * pages on different free lists. We don't
1025 * want unmovable pages to be allocated from 1025 * want unmovable pages to be allocated from
1026 * MIGRATE_CMA areas. 1026 * MIGRATE_CMA areas.
1027 */ 1027 */
1028 if (!is_migrate_cma(migratetype) && 1028 if (!is_migrate_cma(migratetype) &&
1029 (unlikely(current_order >= pageblock_order / 2) || 1029 (unlikely(current_order >= pageblock_order / 2) ||
1030 start_migratetype == MIGRATE_RECLAIMABLE || 1030 start_migratetype == MIGRATE_RECLAIMABLE ||
1031 page_group_by_mobility_disabled)) { 1031 page_group_by_mobility_disabled)) {
1032 int pages; 1032 int pages;
1033 pages = move_freepages_block(zone, page, 1033 pages = move_freepages_block(zone, page,
1034 start_migratetype); 1034 start_migratetype);
1035 1035
1036 /* Claim the whole block if over half of it is free */ 1036 /* Claim the whole block if over half of it is free */
1037 if (pages >= (1 << (pageblock_order-1)) || 1037 if (pages >= (1 << (pageblock_order-1)) ||
1038 page_group_by_mobility_disabled) 1038 page_group_by_mobility_disabled)
1039 set_pageblock_migratetype(page, 1039 set_pageblock_migratetype(page,
1040 start_migratetype); 1040 start_migratetype);
1041 1041
1042 migratetype = start_migratetype; 1042 migratetype = start_migratetype;
1043 } 1043 }
1044 1044
1045 /* Remove the page from the freelists */ 1045 /* Remove the page from the freelists */
1046 list_del(&page->lru); 1046 list_del(&page->lru);
1047 rmv_page_order(page); 1047 rmv_page_order(page);
1048 1048
1049 /* Take ownership for orders >= pageblock_order */ 1049 /* Take ownership for orders >= pageblock_order */
1050 if (current_order >= pageblock_order && 1050 if (current_order >= pageblock_order &&
1051 !is_migrate_cma(migratetype)) 1051 !is_migrate_cma(migratetype))
1052 change_pageblock_range(page, current_order, 1052 change_pageblock_range(page, current_order,
1053 start_migratetype); 1053 start_migratetype);
1054 1054
1055 expand(zone, page, order, current_order, area, 1055 expand(zone, page, order, current_order, area,
1056 is_migrate_cma(migratetype) 1056 is_migrate_cma(migratetype)
1057 ? migratetype : start_migratetype); 1057 ? migratetype : start_migratetype);
1058 1058
1059 trace_mm_page_alloc_extfrag(page, order, current_order, 1059 trace_mm_page_alloc_extfrag(page, order, current_order,
1060 start_migratetype, migratetype); 1060 start_migratetype, migratetype);
1061 1061
1062 return page; 1062 return page;
1063 } 1063 }
1064 } 1064 }
1065 1065
1066 return NULL; 1066 return NULL;
1067 } 1067 }
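
__rmqueue_fallback() above walks orders from largest to smallest and, within each order, tries the other migratetypes in a fixed preference order, so when it must steal from a foreign free list it steals the biggest block available and can often convert a whole pageblock at once. Below is a minimal userspace sketch of just that scan; the fallbacks table and free counts are illustrative stand-ins, not the kernel's actual fallbacks[] contents (which also knows about MIGRATE_CMA).

#include <stdio.h>

#define MAX_ORDER 11
enum { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE, NR_MT };

/* Illustrative preference order only; the real table lives in mm/page_alloc.c. */
static const int fallbacks[NR_MT][3] = {
        [MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE,   MT_RESERVE },
        [MT_RECLAIMABLE] = { MT_UNMOVABLE,   MT_MOVABLE,   MT_RESERVE },
        [MT_MOVABLE]     = { MT_RECLAIMABLE, MT_UNMOVABLE, MT_RESERVE },
};

/* nr_free[order][migratetype]: how many free blocks of that size exist. */
static int nr_free[MAX_ORDER][NR_MT];

/* Scan like __rmqueue_fallback(): biggest order first, then the fallback
 * migratetypes in preference order, stopping at MIGRATE_RESERVE. */
static int find_fallback(int order, int start_mt, int *out_order)
{
        for (int current_order = MAX_ORDER - 1; current_order >= order; current_order--) {
                for (int i = 0; i < 3; i++) {
                        int mt = fallbacks[start_mt][i];
                        if (mt == MT_RESERVE)
                                break;          /* handled later by __rmqueue() */
                        if (nr_free[current_order][mt]) {
                                *out_order = current_order;
                                return mt;
                        }
                }
        }
        return -1;
}

int main(void)
{
        int got_order;

        nr_free[3][MT_MOVABLE] = 1;             /* one free order-3 movable block */
        int mt = find_fallback(1, MT_UNMOVABLE, &got_order);
        printf("steal order-%d block from migratetype %d\n", got_order, mt);
        return 0;
}
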
1068 1068
1069 /* 1069 /*
1070 * Do the hard work of removing an element from the buddy allocator. 1070 * Do the hard work of removing an element from the buddy allocator.
1071 * Call me with the zone->lock already held. 1071 * Call me with the zone->lock already held.
1072 */ 1072 */
1073 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1073 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1074 int migratetype) 1074 int migratetype)
1075 { 1075 {
1076 struct page *page; 1076 struct page *page;
1077 1077
1078 retry_reserve: 1078 retry_reserve:
1079 page = __rmqueue_smallest(zone, order, migratetype); 1079 page = __rmqueue_smallest(zone, order, migratetype);
1080 1080
1081 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1081 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1082 page = __rmqueue_fallback(zone, order, migratetype); 1082 page = __rmqueue_fallback(zone, order, migratetype);
1083 1083
1084 /* 1084 /*
1085 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1085 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1086 * is used because __rmqueue_smallest is an inline function 1086 * is used because __rmqueue_smallest is an inline function
1087 * and we want just one call site 1087 * and we want just one call site
1088 */ 1088 */
1089 if (!page) { 1089 if (!page) {
1090 migratetype = MIGRATE_RESERVE; 1090 migratetype = MIGRATE_RESERVE;
1091 goto retry_reserve; 1091 goto retry_reserve;
1092 } 1092 }
1093 } 1093 }
1094 1094
1095 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1095 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1096 return page; 1096 return page;
1097 } 1097 }
1098 1098
1099 /* 1099 /*
1100 * Obtain a specified number of elements from the buddy allocator, all under 1100 * Obtain a specified number of elements from the buddy allocator, all under
1101 * a single hold of the lock, for efficiency. Add them to the supplied list. 1101 * a single hold of the lock, for efficiency. Add them to the supplied list.
1102 * Returns the number of new pages which were placed at *list. 1102 * Returns the number of new pages which were placed at *list.
1103 */ 1103 */
1104 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1104 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1105 unsigned long count, struct list_head *list, 1105 unsigned long count, struct list_head *list,
1106 int migratetype, int cold) 1106 int migratetype, int cold)
1107 { 1107 {
1108 int mt = migratetype, i; 1108 int mt = migratetype, i;
1109 1109
1110 spin_lock(&zone->lock); 1110 spin_lock(&zone->lock);
1111 for (i = 0; i < count; ++i) { 1111 for (i = 0; i < count; ++i) {
1112 struct page *page = __rmqueue(zone, order, migratetype); 1112 struct page *page = __rmqueue(zone, order, migratetype);
1113 if (unlikely(page == NULL)) 1113 if (unlikely(page == NULL))
1114 break; 1114 break;
1115 1115
1116 /* 1116 /*
1117 * Split buddy pages returned by expand() are received here 1117 * Split buddy pages returned by expand() are received here
1118 * in physical page order. The page is added to the caller's 1118 * in physical page order. The page is added to the caller's
1119 * list and the list head then moves forward. From the caller's 1119 * list and the list head then moves forward. From the caller's
1120 * perspective, the linked list is ordered by page number in 1120 * perspective, the linked list is ordered by page number in
1121 * some conditions. This is useful for IO devices that can 1121 * some conditions. This is useful for IO devices that can
1122 * merge IO requests if the physical pages are ordered 1122 * merge IO requests if the physical pages are ordered
1123 * properly. 1123 * properly.
1124 */ 1124 */
1125 if (likely(cold == 0)) 1125 if (likely(cold == 0))
1126 list_add(&page->lru, list); 1126 list_add(&page->lru, list);
1127 else 1127 else
1128 list_add_tail(&page->lru, list); 1128 list_add_tail(&page->lru, list);
1129 if (IS_ENABLED(CONFIG_CMA)) { 1129 if (IS_ENABLED(CONFIG_CMA)) {
1130 mt = get_pageblock_migratetype(page); 1130 mt = get_pageblock_migratetype(page);
1131 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1131 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1132 mt = migratetype; 1132 mt = migratetype;
1133 } 1133 }
1134 set_page_private(page, mt); 1134 set_page_private(page, mt);
1135 list = &page->lru; 1135 list = &page->lru;
1136 } 1136 }
1137 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1137 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1138 spin_unlock(&zone->lock); 1138 spin_unlock(&zone->lock);
1139 return i; 1139 return i;
1140 } 1140 }
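
The `list = &page->lru;` at the end of the loop above is what the comment about physical ordering relies on: each new page is linked in right after the previous one rather than at the original head, so a batch refill comes out in ascending page order. A small userspace sketch of just that list trick, using a toy version of the kernel's circular list and made-up pfn values:

#include <stdio.h>

/* Toy version of the kernel's circular doubly linked list. */
struct node { int pfn; struct node *prev, *next; };

static void list_add(struct node *new, struct node *head)  /* insert right after head */
{
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
}

int main(void)
{
        struct node head = { .pfn = -1 };
        struct node pages[4] = { { 100 }, { 101 }, { 102 }, { 103 } };
        struct node *cursor = &head;

        head.next = head.prev = &head;          /* empty list */

        /* Like rmqueue_bulk(): insert after a cursor that follows the last
         * insertion instead of always inserting at the fixed head. */
        for (int i = 0; i < 4; i++) {
                list_add(&pages[i], cursor);
                cursor = &pages[i];
        }

        for (struct node *n = head.next; n != &head; n = n->next)
                printf("%d ", n->pfn);          /* 100 101 102 103: ascending order */
        printf("\n");
        return 0;
}

Had the loop always inserted at the fixed head, the same pages would come out in descending order, which defeats the IO request merging the comment mentions.
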
1141 1141
1142 #ifdef CONFIG_NUMA 1142 #ifdef CONFIG_NUMA
1143 /* 1143 /*
1144 * Called from the vmstat counter updater to drain pagesets of this 1144 * Called from the vmstat counter updater to drain pagesets of this
1145 * currently executing processor on remote nodes after they have 1145 * currently executing processor on remote nodes after they have
1146 * expired. 1146 * expired.
1147 * 1147 *
1148 * Note that this function must be called with the thread pinned to 1148 * Note that this function must be called with the thread pinned to
1149 * a single processor. 1149 * a single processor.
1150 */ 1150 */
1151 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1151 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1152 { 1152 {
1153 unsigned long flags; 1153 unsigned long flags;
1154 int to_drain; 1154 int to_drain;
1155 1155
1156 local_irq_save(flags); 1156 local_irq_save(flags);
1157 if (pcp->count >= pcp->batch) 1157 if (pcp->count >= pcp->batch)
1158 to_drain = pcp->batch; 1158 to_drain = pcp->batch;
1159 else 1159 else
1160 to_drain = pcp->count; 1160 to_drain = pcp->count;
1161 if (to_drain > 0) { 1161 if (to_drain > 0) {
1162 free_pcppages_bulk(zone, to_drain, pcp); 1162 free_pcppages_bulk(zone, to_drain, pcp);
1163 pcp->count -= to_drain; 1163 pcp->count -= to_drain;
1164 } 1164 }
1165 local_irq_restore(flags); 1165 local_irq_restore(flags);
1166 } 1166 }
1167 #endif 1167 #endif
1168 1168
1169 /* 1169 /*
1170 * Drain pages of the indicated processor. 1170 * Drain pages of the indicated processor.
1171 * 1171 *
1172 * The processor must either be the current processor and the 1172 * The processor must either be the current processor and the
1173 * thread pinned to the current processor or a processor that 1173 * thread pinned to the current processor or a processor that
1174 * is not online. 1174 * is not online.
1175 */ 1175 */
1176 static void drain_pages(unsigned int cpu) 1176 static void drain_pages(unsigned int cpu)
1177 { 1177 {
1178 unsigned long flags; 1178 unsigned long flags;
1179 struct zone *zone; 1179 struct zone *zone;
1180 1180
1181 for_each_populated_zone(zone) { 1181 for_each_populated_zone(zone) {
1182 struct per_cpu_pageset *pset; 1182 struct per_cpu_pageset *pset;
1183 struct per_cpu_pages *pcp; 1183 struct per_cpu_pages *pcp;
1184 1184
1185 local_irq_save(flags); 1185 local_irq_save(flags);
1186 pset = per_cpu_ptr(zone->pageset, cpu); 1186 pset = per_cpu_ptr(zone->pageset, cpu);
1187 1187
1188 pcp = &pset->pcp; 1188 pcp = &pset->pcp;
1189 if (pcp->count) { 1189 if (pcp->count) {
1190 free_pcppages_bulk(zone, pcp->count, pcp); 1190 free_pcppages_bulk(zone, pcp->count, pcp);
1191 pcp->count = 0; 1191 pcp->count = 0;
1192 } 1192 }
1193 local_irq_restore(flags); 1193 local_irq_restore(flags);
1194 } 1194 }
1195 } 1195 }
1196 1196
1197 /* 1197 /*
1198 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1198 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1199 */ 1199 */
1200 void drain_local_pages(void *arg) 1200 void drain_local_pages(void *arg)
1201 { 1201 {
1202 drain_pages(smp_processor_id()); 1202 drain_pages(smp_processor_id());
1203 } 1203 }
1204 1204
1205 /* 1205 /*
1206 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1206 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1207 * 1207 *
1208 * Note that this code is protected against sending an IPI to an offline 1208 * Note that this code is protected against sending an IPI to an offline
1209 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1209 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1210 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1210 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1211 * nothing keeps CPUs from showing up after we populated the cpumask and 1211 * nothing keeps CPUs from showing up after we populated the cpumask and
1212 * before the call to on_each_cpu_mask(). 1212 * before the call to on_each_cpu_mask().
1213 */ 1213 */
1214 void drain_all_pages(void) 1214 void drain_all_pages(void)
1215 { 1215 {
1216 int cpu; 1216 int cpu;
1217 struct per_cpu_pageset *pcp; 1217 struct per_cpu_pageset *pcp;
1218 struct zone *zone; 1218 struct zone *zone;
1219 1219
1220 /* 1220 /*
1221 * Allocate in the BSS so we won't require allocation in 1221 * Allocate in the BSS so we won't require allocation in
1222 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1222 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1223 */ 1223 */
1224 static cpumask_t cpus_with_pcps; 1224 static cpumask_t cpus_with_pcps;
1225 1225
1226 /* 1226 /*
1227 * We don't care about racing with CPU hotplug event 1227 * We don't care about racing with CPU hotplug event
1228 * as offline notification will cause the notified 1228 * as offline notification will cause the notified
1229 * cpu to drain that CPU pcps and on_each_cpu_mask 1229 * cpu to drain that CPU pcps and on_each_cpu_mask
1230 * disables preemption as part of its processing 1230 * disables preemption as part of its processing
1231 */ 1231 */
1232 for_each_online_cpu(cpu) { 1232 for_each_online_cpu(cpu) {
1233 bool has_pcps = false; 1233 bool has_pcps = false;
1234 for_each_populated_zone(zone) { 1234 for_each_populated_zone(zone) {
1235 pcp = per_cpu_ptr(zone->pageset, cpu); 1235 pcp = per_cpu_ptr(zone->pageset, cpu);
1236 if (pcp->pcp.count) { 1236 if (pcp->pcp.count) {
1237 has_pcps = true; 1237 has_pcps = true;
1238 break; 1238 break;
1239 } 1239 }
1240 } 1240 }
1241 if (has_pcps) 1241 if (has_pcps)
1242 cpumask_set_cpu(cpu, &cpus_with_pcps); 1242 cpumask_set_cpu(cpu, &cpus_with_pcps);
1243 else 1243 else
1244 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1244 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1245 } 1245 }
1246 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1246 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1247 } 1247 }
1248 1248
1249 #ifdef CONFIG_HIBERNATION 1249 #ifdef CONFIG_HIBERNATION
1250 1250
1251 void mark_free_pages(struct zone *zone) 1251 void mark_free_pages(struct zone *zone)
1252 { 1252 {
1253 unsigned long pfn, max_zone_pfn; 1253 unsigned long pfn, max_zone_pfn;
1254 unsigned long flags; 1254 unsigned long flags;
1255 int order, t; 1255 int order, t;
1256 struct list_head *curr; 1256 struct list_head *curr;
1257 1257
1258 if (!zone->spanned_pages) 1258 if (!zone->spanned_pages)
1259 return; 1259 return;
1260 1260
1261 spin_lock_irqsave(&zone->lock, flags); 1261 spin_lock_irqsave(&zone->lock, flags);
1262 1262
1263 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1263 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1264 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1264 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1265 if (pfn_valid(pfn)) { 1265 if (pfn_valid(pfn)) {
1266 struct page *page = pfn_to_page(pfn); 1266 struct page *page = pfn_to_page(pfn);
1267 1267
1268 if (!swsusp_page_is_forbidden(page)) 1268 if (!swsusp_page_is_forbidden(page))
1269 swsusp_unset_page_free(page); 1269 swsusp_unset_page_free(page);
1270 } 1270 }
1271 1271
1272 for_each_migratetype_order(order, t) { 1272 for_each_migratetype_order(order, t) {
1273 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1273 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1274 unsigned long i; 1274 unsigned long i;
1275 1275
1276 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1276 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1277 for (i = 0; i < (1UL << order); i++) 1277 for (i = 0; i < (1UL << order); i++)
1278 swsusp_set_page_free(pfn_to_page(pfn + i)); 1278 swsusp_set_page_free(pfn_to_page(pfn + i));
1279 } 1279 }
1280 } 1280 }
1281 spin_unlock_irqrestore(&zone->lock, flags); 1281 spin_unlock_irqrestore(&zone->lock, flags);
1282 } 1282 }
1283 #endif /* CONFIG_HIBERNATION */ 1283 #endif /* CONFIG_HIBERNATION */
1284 1284
1285 /* 1285 /*
1286 * Free a 0-order page 1286 * Free a 0-order page
1287 * cold == 1 ? free a cold page : free a hot page 1287 * cold == 1 ? free a cold page : free a hot page
1288 */ 1288 */
1289 void free_hot_cold_page(struct page *page, int cold) 1289 void free_hot_cold_page(struct page *page, int cold)
1290 { 1290 {
1291 struct zone *zone = page_zone(page); 1291 struct zone *zone = page_zone(page);
1292 struct per_cpu_pages *pcp; 1292 struct per_cpu_pages *pcp;
1293 unsigned long flags; 1293 unsigned long flags;
1294 int migratetype; 1294 int migratetype;
1295 int wasMlocked = __TestClearPageMlocked(page); 1295 int wasMlocked = __TestClearPageMlocked(page);
1296 1296
1297 if (!free_pages_prepare(page, 0)) 1297 if (!free_pages_prepare(page, 0))
1298 return; 1298 return;
1299 1299
1300 migratetype = get_pageblock_migratetype(page); 1300 migratetype = get_pageblock_migratetype(page);
1301 set_page_private(page, migratetype); 1301 set_page_private(page, migratetype);
1302 local_irq_save(flags); 1302 local_irq_save(flags);
1303 if (unlikely(wasMlocked)) 1303 if (unlikely(wasMlocked))
1304 free_page_mlock(page); 1304 free_page_mlock(page);
1305 __count_vm_event(PGFREE); 1305 __count_vm_event(PGFREE);
1306 1306
1307 /* 1307 /*
1308 * We only track unmovable, reclaimable and movable on pcp lists. 1308 * We only track unmovable, reclaimable and movable on pcp lists.
1309 * Free ISOLATE pages back to the allocator because they are being 1309 * Free ISOLATE pages back to the allocator because they are being
1310 * offlined but treat RESERVE as movable pages so we can get those 1310 * offlined but treat RESERVE as movable pages so we can get those
1311 * areas back if necessary. Otherwise, we may have to free 1311 * areas back if necessary. Otherwise, we may have to free
1312 * excessively into the page allocator 1312 * excessively into the page allocator
1313 */ 1313 */
1314 if (migratetype >= MIGRATE_PCPTYPES) { 1314 if (migratetype >= MIGRATE_PCPTYPES) {
1315 if (unlikely(migratetype == MIGRATE_ISOLATE)) { 1315 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1316 free_one_page(zone, page, 0, migratetype); 1316 free_one_page(zone, page, 0, migratetype);
1317 goto out; 1317 goto out;
1318 } 1318 }
1319 migratetype = MIGRATE_MOVABLE; 1319 migratetype = MIGRATE_MOVABLE;
1320 } 1320 }
1321 1321
1322 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1322 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1323 if (cold) 1323 if (cold)
1324 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1324 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1325 else 1325 else
1326 list_add(&page->lru, &pcp->lists[migratetype]); 1326 list_add(&page->lru, &pcp->lists[migratetype]);
1327 pcp->count++; 1327 pcp->count++;
1328 if (pcp->count >= pcp->high) { 1328 if (pcp->count >= pcp->high) {
1329 free_pcppages_bulk(zone, pcp->batch, pcp); 1329 free_pcppages_bulk(zone, pcp->batch, pcp);
1330 pcp->count -= pcp->batch; 1330 pcp->count -= pcp->batch;
1331 } 1331 }
1332 1332
1333 out: 1333 out:
1334 local_irq_restore(flags); 1334 local_irq_restore(flags);
1335 } 1335 }
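
The effect of the pcp->high / pcp->batch pair in free_hot_cold_page() above: the per-cpu list absorbs frees until it reaches `high`, then a whole `batch` of pages is handed back to the buddy allocator at once, so the count saw-tooths instead of growing without bound. A toy trace (the high/batch numbers are made up; the real values are derived from the zone size):

#include <stdio.h>

int main(void)
{
        int high = 90, batch = 30;              /* illustrative values only */
        int count = 0;

        for (int freed = 1; freed <= 200; freed++) {
                count++;                        /* page lands on the pcp list */
                if (count >= high)
                        count -= batch;         /* free_pcppages_bulk(zone, batch, pcp) */
                if (freed % 40 == 0)
                        printf("after %3d frees: pcp count = %d\n", freed, count);
        }
        return 0;
}
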
1336 1336
1337 /* 1337 /*
1338 * Free a list of 0-order pages 1338 * Free a list of 0-order pages
1339 */ 1339 */
1340 void free_hot_cold_page_list(struct list_head *list, int cold) 1340 void free_hot_cold_page_list(struct list_head *list, int cold)
1341 { 1341 {
1342 struct page *page, *next; 1342 struct page *page, *next;
1343 1343
1344 list_for_each_entry_safe(page, next, list, lru) { 1344 list_for_each_entry_safe(page, next, list, lru) {
1345 trace_mm_page_free_batched(page, cold); 1345 trace_mm_page_free_batched(page, cold);
1346 free_hot_cold_page(page, cold); 1346 free_hot_cold_page(page, cold);
1347 } 1347 }
1348 } 1348 }
1349 1349
1350 /* 1350 /*
1351 * split_page takes a non-compound higher-order page, and splits it into 1351 * split_page takes a non-compound higher-order page, and splits it into
1352 * n (1<<order) sub-pages: page[0..n] 1352 * n (1<<order) sub-pages: page[0..n]
1353 * Each sub-page must be freed individually. 1353 * Each sub-page must be freed individually.
1354 * 1354 *
1355 * Note: this is probably too low level an operation for use in drivers. 1355 * Note: this is probably too low level an operation for use in drivers.
1356 * Please consult with lkml before using this in your driver. 1356 * Please consult with lkml before using this in your driver.
1357 */ 1357 */
1358 void split_page(struct page *page, unsigned int order) 1358 void split_page(struct page *page, unsigned int order)
1359 { 1359 {
1360 int i; 1360 int i;
1361 1361
1362 VM_BUG_ON(PageCompound(page)); 1362 VM_BUG_ON(PageCompound(page));
1363 VM_BUG_ON(!page_count(page)); 1363 VM_BUG_ON(!page_count(page));
1364 1364
1365 #ifdef CONFIG_KMEMCHECK 1365 #ifdef CONFIG_KMEMCHECK
1366 /* 1366 /*
1367 * Split shadow pages too, because free(page[0]) would 1367 * Split shadow pages too, because free(page[0]) would
1368 * otherwise free the whole shadow. 1368 * otherwise free the whole shadow.
1369 */ 1369 */
1370 if (kmemcheck_page_is_tracked(page)) 1370 if (kmemcheck_page_is_tracked(page))
1371 split_page(virt_to_page(page[0].shadow), order); 1371 split_page(virt_to_page(page[0].shadow), order);
1372 #endif 1372 #endif
1373 1373
1374 for (i = 1; i < (1 << order); i++) 1374 for (i = 1; i < (1 << order); i++)
1375 set_page_refcounted(page + i); 1375 set_page_refcounted(page + i);
1376 } 1376 }
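
split_page() exists precisely so that a higher-order allocation can later be freed page by page. A minimal kernel-side sketch of that pattern, not part of this patch, and per the comment above rarely the right tool for a driver:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/errno.h>

static int split_example(void)
{
        unsigned int order = 2;                 /* 4 contiguous pages */
        struct page *page = alloc_pages(GFP_KERNEL, order);
        int i;

        if (!page)
                return -ENOMEM;

        split_page(page, order);                /* give each sub-page its own refcount */

        for (i = 0; i < (1 << order); i++)
                __free_page(page + i);          /* now legal to free them one at a time */

        return 0;
}
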
1377 1377
1378 /* 1378 /*
1379 * Similar to split_page except the page is already free. As this is only 1379 * Similar to split_page except the page is already free. As this is only
1380 * being used for migration, the migratetype of the block also changes. 1380 * being used for migration, the migratetype of the block also changes.
1381 * As this is called with interrupts disabled, the caller is responsible 1381 * As this is called with interrupts disabled, the caller is responsible
1382 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1382 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1383 * are enabled. 1383 * are enabled.
1384 * 1384 *
1385 * Note: this is probably too low level an operation for use in drivers. 1385 * Note: this is probably too low level an operation for use in drivers.
1386 * Please consult with lkml before using this in your driver. 1386 * Please consult with lkml before using this in your driver.
1387 */ 1387 */
1388 int split_free_page(struct page *page) 1388 int split_free_page(struct page *page)
1389 { 1389 {
1390 unsigned int order; 1390 unsigned int order;
1391 unsigned long watermark; 1391 unsigned long watermark;
1392 struct zone *zone; 1392 struct zone *zone;
1393 1393
1394 BUG_ON(!PageBuddy(page)); 1394 BUG_ON(!PageBuddy(page));
1395 1395
1396 zone = page_zone(page); 1396 zone = page_zone(page);
1397 order = page_order(page); 1397 order = page_order(page);
1398 1398
1399 /* Obey watermarks as if the page was being allocated */ 1399 /* Obey watermarks as if the page was being allocated */
1400 watermark = low_wmark_pages(zone) + (1 << order); 1400 watermark = low_wmark_pages(zone) + (1 << order);
1401 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1401 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1402 return 0; 1402 return 0;
1403 1403
1404 /* Remove page from free list */ 1404 /* Remove page from free list */
1405 list_del(&page->lru); 1405 list_del(&page->lru);
1406 zone->free_area[order].nr_free--; 1406 zone->free_area[order].nr_free--;
1407 rmv_page_order(page); 1407 rmv_page_order(page);
1408 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); 1408 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1409 1409
1410 /* Split into individual pages */ 1410 /* Split into individual pages */
1411 set_page_refcounted(page); 1411 set_page_refcounted(page);
1412 split_page(page, order); 1412 split_page(page, order);
1413 1413
1414 if (order >= pageblock_order - 1) { 1414 if (order >= pageblock_order - 1) {
1415 struct page *endpage = page + (1 << order) - 1; 1415 struct page *endpage = page + (1 << order) - 1;
1416 for (; page < endpage; page += pageblock_nr_pages) { 1416 for (; page < endpage; page += pageblock_nr_pages) {
1417 int mt = get_pageblock_migratetype(page); 1417 int mt = get_pageblock_migratetype(page);
1418 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) 1418 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
1419 set_pageblock_migratetype(page, 1419 set_pageblock_migratetype(page,
1420 MIGRATE_MOVABLE); 1420 MIGRATE_MOVABLE);
1421 } 1421 }
1422 } 1422 }
1423 1423
1424 return 1 << order; 1424 return 1 << order;
1425 } 1425 }
1426 1426
1427 /* 1427 /*
1428 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1428 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1429 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1429 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1430 * or two. 1430 * or two.
1431 */ 1431 */
1432 static inline 1432 static inline
1433 struct page *buffered_rmqueue(struct zone *preferred_zone, 1433 struct page *buffered_rmqueue(struct zone *preferred_zone,
1434 struct zone *zone, int order, gfp_t gfp_flags, 1434 struct zone *zone, int order, gfp_t gfp_flags,
1435 int migratetype) 1435 int migratetype)
1436 { 1436 {
1437 unsigned long flags; 1437 unsigned long flags;
1438 struct page *page; 1438 struct page *page;
1439 int cold = !!(gfp_flags & __GFP_COLD); 1439 int cold = !!(gfp_flags & __GFP_COLD);
1440 1440
1441 again: 1441 again:
1442 if (likely(order == 0)) { 1442 if (likely(order == 0)) {
1443 struct per_cpu_pages *pcp; 1443 struct per_cpu_pages *pcp;
1444 struct list_head *list; 1444 struct list_head *list;
1445 1445
1446 local_irq_save(flags); 1446 local_irq_save(flags);
1447 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1447 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1448 list = &pcp->lists[migratetype]; 1448 list = &pcp->lists[migratetype];
1449 if (list_empty(list)) { 1449 if (list_empty(list)) {
1450 pcp->count += rmqueue_bulk(zone, 0, 1450 pcp->count += rmqueue_bulk(zone, 0,
1451 pcp->batch, list, 1451 pcp->batch, list,
1452 migratetype, cold); 1452 migratetype, cold);
1453 if (unlikely(list_empty(list))) 1453 if (unlikely(list_empty(list)))
1454 goto failed; 1454 goto failed;
1455 } 1455 }
1456 1456
1457 if (cold) 1457 if (cold)
1458 page = list_entry(list->prev, struct page, lru); 1458 page = list_entry(list->prev, struct page, lru);
1459 else 1459 else
1460 page = list_entry(list->next, struct page, lru); 1460 page = list_entry(list->next, struct page, lru);
1461 1461
1462 list_del(&page->lru); 1462 list_del(&page->lru);
1463 pcp->count--; 1463 pcp->count--;
1464 } else { 1464 } else {
1465 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1465 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1466 /* 1466 /*
1467 * __GFP_NOFAIL is not to be used in new code. 1467 * __GFP_NOFAIL is not to be used in new code.
1468 * 1468 *
1469 * All __GFP_NOFAIL callers should be fixed so that they 1469 * All __GFP_NOFAIL callers should be fixed so that they
1470 * properly detect and handle allocation failures. 1470 * properly detect and handle allocation failures.
1471 * 1471 *
1472 * We most definitely don't want callers attempting to 1472 * We most definitely don't want callers attempting to
1473 * allocate greater than order-1 page units with 1473 * allocate greater than order-1 page units with
1474 * __GFP_NOFAIL. 1474 * __GFP_NOFAIL.
1475 */ 1475 */
1476 WARN_ON_ONCE(order > 1); 1476 WARN_ON_ONCE(order > 1);
1477 } 1477 }
1478 spin_lock_irqsave(&zone->lock, flags); 1478 spin_lock_irqsave(&zone->lock, flags);
1479 page = __rmqueue(zone, order, migratetype); 1479 page = __rmqueue(zone, order, migratetype);
1480 spin_unlock(&zone->lock); 1480 spin_unlock(&zone->lock);
1481 if (!page) 1481 if (!page)
1482 goto failed; 1482 goto failed;
1483 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); 1483 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1484 } 1484 }
1485 1485
1486 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1486 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1487 zone_statistics(preferred_zone, zone, gfp_flags); 1487 zone_statistics(preferred_zone, zone, gfp_flags);
1488 local_irq_restore(flags); 1488 local_irq_restore(flags);
1489 1489
1490 VM_BUG_ON(bad_range(zone, page)); 1490 VM_BUG_ON(bad_range(zone, page));
1491 if (prep_new_page(page, order, gfp_flags)) 1491 if (prep_new_page(page, order, gfp_flags))
1492 goto again; 1492 goto again;
1493 return page; 1493 return page;
1494 1494
1495 failed: 1495 failed:
1496 local_irq_restore(flags); 1496 local_irq_restore(flags);
1497 return NULL; 1497 return NULL;
1498 } 1498 }
1499 1499
1500 /* The ALLOC_WMARK bits are used as an index to zone->watermark */ 1500 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
1501 #define ALLOC_WMARK_MIN WMARK_MIN 1501 #define ALLOC_WMARK_MIN WMARK_MIN
1502 #define ALLOC_WMARK_LOW WMARK_LOW 1502 #define ALLOC_WMARK_LOW WMARK_LOW
1503 #define ALLOC_WMARK_HIGH WMARK_HIGH 1503 #define ALLOC_WMARK_HIGH WMARK_HIGH
1504 #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ 1504 #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1505 1505
1506 /* Mask to get the watermark bits */ 1506 /* Mask to get the watermark bits */
1507 #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) 1507 #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1508 1508
1509 #define ALLOC_HARDER 0x10 /* try to alloc harder */ 1509 #define ALLOC_HARDER 0x10 /* try to alloc harder */
1510 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1510 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1511 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1511 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1512 1512
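
The ALLOC_WMARK_* values double as an index: because ALLOC_NO_WATERMARKS is 0x04, masking with ALLOC_WMARK_MASK (0x03) recovers which of the three zone watermarks to test, while the higher bits carry the boolean ALLOC_* modifiers. A userspace illustration with stand-in values (the real indices come from enum zone_watermarks, where WMARK_MIN/LOW/HIGH are 0/1/2):

#include <stdio.h>

/* Stand-ins mirroring the defines above. */
#define ALLOC_WMARK_MIN      0          /* WMARK_MIN  */
#define ALLOC_WMARK_LOW      1          /* WMARK_LOW  */
#define ALLOC_WMARK_HIGH     2          /* WMARK_HIGH */
#define ALLOC_NO_WATERMARKS  0x04
#define ALLOC_WMARK_MASK     (ALLOC_NO_WATERMARKS - 1)
#define ALLOC_HARDER         0x10
#define ALLOC_HIGH           0x20

int main(void)
{
        unsigned long watermark[3] = { 128, 160, 192 };         /* made-up min/low/high */
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_HIGH | ALLOC_HARDER;

        if (alloc_flags & ALLOC_NO_WATERMARKS)
                printf("skip the watermark check entirely\n");
        else
                printf("check against watermark[%d] = %lu\n",
                       alloc_flags & ALLOC_WMARK_MASK,
                       watermark[alloc_flags & ALLOC_WMARK_MASK]); /* index 1 -> 160 */
        return 0;
}
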
1513 #ifdef CONFIG_FAIL_PAGE_ALLOC 1513 #ifdef CONFIG_FAIL_PAGE_ALLOC
1514 1514
1515 static struct { 1515 static struct {
1516 struct fault_attr attr; 1516 struct fault_attr attr;
1517 1517
1518 u32 ignore_gfp_highmem; 1518 u32 ignore_gfp_highmem;
1519 u32 ignore_gfp_wait; 1519 u32 ignore_gfp_wait;
1520 u32 min_order; 1520 u32 min_order;
1521 } fail_page_alloc = { 1521 } fail_page_alloc = {
1522 .attr = FAULT_ATTR_INITIALIZER, 1522 .attr = FAULT_ATTR_INITIALIZER,
1523 .ignore_gfp_wait = 1, 1523 .ignore_gfp_wait = 1,
1524 .ignore_gfp_highmem = 1, 1524 .ignore_gfp_highmem = 1,
1525 .min_order = 1, 1525 .min_order = 1,
1526 }; 1526 };
1527 1527
1528 static int __init setup_fail_page_alloc(char *str) 1528 static int __init setup_fail_page_alloc(char *str)
1529 { 1529 {
1530 return setup_fault_attr(&fail_page_alloc.attr, str); 1530 return setup_fault_attr(&fail_page_alloc.attr, str);
1531 } 1531 }
1532 __setup("fail_page_alloc=", setup_fail_page_alloc); 1532 __setup("fail_page_alloc=", setup_fail_page_alloc);
1533 1533
1534 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1534 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1535 { 1535 {
1536 if (order < fail_page_alloc.min_order) 1536 if (order < fail_page_alloc.min_order)
1537 return false; 1537 return false;
1538 if (gfp_mask & __GFP_NOFAIL) 1538 if (gfp_mask & __GFP_NOFAIL)
1539 return false; 1539 return false;
1540 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1540 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1541 return false; 1541 return false;
1542 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1542 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1543 return false; 1543 return false;
1544 1544
1545 return should_fail(&fail_page_alloc.attr, 1 << order); 1545 return should_fail(&fail_page_alloc.attr, 1 << order);
1546 } 1546 }
1547 1547
1548 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1548 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1549 1549
1550 static int __init fail_page_alloc_debugfs(void) 1550 static int __init fail_page_alloc_debugfs(void)
1551 { 1551 {
1552 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1552 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1553 struct dentry *dir; 1553 struct dentry *dir;
1554 1554
1555 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1555 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1556 &fail_page_alloc.attr); 1556 &fail_page_alloc.attr);
1557 if (IS_ERR(dir)) 1557 if (IS_ERR(dir))
1558 return PTR_ERR(dir); 1558 return PTR_ERR(dir);
1559 1559
1560 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1560 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1561 &fail_page_alloc.ignore_gfp_wait)) 1561 &fail_page_alloc.ignore_gfp_wait))
1562 goto fail; 1562 goto fail;
1563 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1563 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1564 &fail_page_alloc.ignore_gfp_highmem)) 1564 &fail_page_alloc.ignore_gfp_highmem))
1565 goto fail; 1565 goto fail;
1566 if (!debugfs_create_u32("min-order", mode, dir, 1566 if (!debugfs_create_u32("min-order", mode, dir,
1567 &fail_page_alloc.min_order)) 1567 &fail_page_alloc.min_order))
1568 goto fail; 1568 goto fail;
1569 1569
1570 return 0; 1570 return 0;
1571 fail: 1571 fail:
1572 debugfs_remove_recursive(dir); 1572 debugfs_remove_recursive(dir);
1573 1573
1574 return -ENOMEM; 1574 return -ENOMEM;
1575 } 1575 }
1576 1576
1577 late_initcall(fail_page_alloc_debugfs); 1577 late_initcall(fail_page_alloc_debugfs);
1578 1578
1579 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1579 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1580 1580
1581 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1581 #else /* CONFIG_FAIL_PAGE_ALLOC */
1582 1582
1583 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1583 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1584 { 1584 {
1585 return false; 1585 return false;
1586 } 1586 }
1587 1587
1588 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1588 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1589 1589
1590 /* 1590 /*
1591 * Return true if free pages are above 'mark'. This takes into account the order 1591 * Return true if free pages are above 'mark'. This takes into account the order
1592 * of the allocation. 1592 * of the allocation.
1593 */ 1593 */
1594 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1594 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1595 int classzone_idx, int alloc_flags, long free_pages) 1595 int classzone_idx, int alloc_flags, long free_pages)
1596 { 1596 {
1597 /* free_pages may go negative - that's OK */ 1597 /* free_pages may go negative - that's OK */
1598 long min = mark; 1598 long min = mark;
1599 int o; 1599 int o;
1600 1600
1601 free_pages -= (1 << order) - 1; 1601 free_pages -= (1 << order) - 1;
1602 if (alloc_flags & ALLOC_HIGH) 1602 if (alloc_flags & ALLOC_HIGH)
1603 min -= min / 2; 1603 min -= min / 2;
1604 if (alloc_flags & ALLOC_HARDER) 1604 if (alloc_flags & ALLOC_HARDER)
1605 min -= min / 4; 1605 min -= min / 4;
1606 1606
1607 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1607 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1608 return false; 1608 return false;
1609 for (o = 0; o < order; o++) { 1609 for (o = 0; o < order; o++) {
1610 /* At the next order, this order's pages become unavailable */ 1610 /* At the next order, this order's pages become unavailable */
1611 free_pages -= z->free_area[o].nr_free << o; 1611 free_pages -= z->free_area[o].nr_free << o;
1612 1612
1613 /* Require fewer higher order pages to be free */ 1613 /* Require fewer higher order pages to be free */
1614 min >>= 1; 1614 min >>= 1;
1615 1615
1616 if (free_pages <= min) 1616 if (free_pages <= min)
1617 return false; 1617 return false;
1618 } 1618 }
1619 return true; 1619 return true;
1620 } 1620 }
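
__zone_watermark_ok() does more than compare a single counter: for an order-N request it discounts pages sitting in blocks smaller than N while halving the required reserve at each step, so a fragmented zone can fail a high-order check even with plenty of free memory overall. A self-contained userspace model of the same arithmetic with toy numbers:

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER 11

/* Userspace model of the check above: same arithmetic, made-up inputs. */
static bool watermark_ok(int order, long mark, long lowmem_reserve,
                         long free_pages, const long nr_free[MAX_ORDER])
{
        long min = mark;
        int o;

        free_pages -= (1 << order) - 1;
        if (free_pages <= min + lowmem_reserve)
                return false;
        for (o = 0; o < order; o++) {
                free_pages -= nr_free[o] << o;  /* lower orders can't serve this request */
                min >>= 1;                      /* but demand fewer free pages up here */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        /* 500 order-0 pages, 20 order-1, 4 order-2, nothing larger: 556 pages free. */
        long nr_free[MAX_ORDER] = { 500, 20, 4 };
        long free_pages = 500 + 20 * 2 + 4 * 4;

        printf("order 0: %d\n", watermark_ok(0, 128, 0, free_pages, nr_free)); /* 1: passes */
        printf("order 2: %d\n", watermark_ok(2, 128, 0, free_pages, nr_free)); /* 0: fails  */
        return 0;
}

With 556 pages free the order-0 check passes easily, but the order-2 check fails: once the 500 order-0 pages are discounted, only about 53 pages remain in order >= 1 blocks, below the halved requirement of 64.
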
1621 1621
1622 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1622 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1623 int classzone_idx, int alloc_flags) 1623 int classzone_idx, int alloc_flags)
1624 { 1624 {
1625 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1625 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1626 zone_page_state(z, NR_FREE_PAGES)); 1626 zone_page_state(z, NR_FREE_PAGES));
1627 } 1627 }
1628 1628
1629 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1629 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1630 int classzone_idx, int alloc_flags) 1630 int classzone_idx, int alloc_flags)
1631 { 1631 {
1632 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1632 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1633 1633
1634 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1634 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1635 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1635 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1636 1636
1637 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1637 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1638 free_pages); 1638 free_pages);
1639 } 1639 }
1640 1640
1641 #ifdef CONFIG_NUMA 1641 #ifdef CONFIG_NUMA
1642 /* 1642 /*
1643 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1643 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1644 * skip over zones that are not allowed by the cpuset, or that have 1644 * skip over zones that are not allowed by the cpuset, or that have
1645 * been recently (in last second) found to be nearly full. See further 1645 * been recently (in last second) found to be nearly full. See further
1646 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1646 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1647 * that have to skip over a lot of full or unallowed zones. 1647 * that have to skip over a lot of full or unallowed zones.
1648 * 1648 *
1649 * If the zonelist cache is present in the passed in zonelist, then 1649 * If the zonelist cache is present in the passed in zonelist, then
1650 * returns a pointer to the allowed node mask (either the current 1650 * returns a pointer to the allowed node mask (either the current
1651 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) 1651 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
1652 * 1652 *
1653 * If the zonelist cache is not available for this zonelist, does 1653 * If the zonelist cache is not available for this zonelist, does
1654 * nothing and returns NULL. 1654 * nothing and returns NULL.
1655 * 1655 *
1656 * If the fullzones BITMAP in the zonelist cache is stale (more than 1656 * If the fullzones BITMAP in the zonelist cache is stale (more than
1657 * a second since last zap'd) then we zap it out (clear its bits.) 1657 * a second since last zap'd) then we zap it out (clear its bits.)
1658 * 1658 *
1659 * We hold off even calling zlc_setup, until after we've checked the 1659 * We hold off even calling zlc_setup, until after we've checked the
1660 * first zone in the zonelist, on the theory that most allocations will 1660 * first zone in the zonelist, on the theory that most allocations will
1661 * be satisfied from that first zone, so best to examine that zone as 1661 * be satisfied from that first zone, so best to examine that zone as
1662 * quickly as we can. 1662 * quickly as we can.
1663 */ 1663 */
1664 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1664 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1665 { 1665 {
1666 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1666 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1667 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1667 nodemask_t *allowednodes; /* zonelist_cache approximation */
1668 1668
1669 zlc = zonelist->zlcache_ptr; 1669 zlc = zonelist->zlcache_ptr;
1670 if (!zlc) 1670 if (!zlc)
1671 return NULL; 1671 return NULL;
1672 1672
1673 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1673 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1674 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1674 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1675 zlc->last_full_zap = jiffies; 1675 zlc->last_full_zap = jiffies;
1676 } 1676 }
1677 1677
1678 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1678 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1679 &cpuset_current_mems_allowed : 1679 &cpuset_current_mems_allowed :
1680 &node_states[N_HIGH_MEMORY]; 1680 &node_states[N_HIGH_MEMORY];
1681 return allowednodes; 1681 return allowednodes;
1682 } 1682 }
1683 1683
1684 /* 1684 /*
1685 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1685 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1686 * if it is worth looking at further for free memory: 1686 * if it is worth looking at further for free memory:
1687 * 1) Check that the zone isn't thought to be full (doesn't have its 1687 * 1) Check that the zone isn't thought to be full (doesn't have its
1688 * bit set in the zonelist_cache fullzones BITMAP). 1688 * bit set in the zonelist_cache fullzones BITMAP).
1689 * 2) Check that the zones node (obtained from the zonelist_cache 1689 * 2) Check that the zones node (obtained from the zonelist_cache
1690 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1690 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1691 * Return true (non-zero) if zone is worth looking at further, or 1691 * Return true (non-zero) if zone is worth looking at further, or
1692 * else return false (zero) if it is not. 1692 * else return false (zero) if it is not.
1693 * 1693 *
1694 * This check -ignores- the distinction between various watermarks, 1694 * This check -ignores- the distinction between various watermarks,
1695 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1695 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1696 * found to be full for any variation of these watermarks, it will 1696 * found to be full for any variation of these watermarks, it will
1697 * be considered full for up to one second by all requests, unless 1697 * be considered full for up to one second by all requests, unless
1698 * we are so low on memory on all allowed nodes that we are forced 1698 * we are so low on memory on all allowed nodes that we are forced
1699 * into the second scan of the zonelist. 1699 * into the second scan of the zonelist.
1700 * 1700 *
1701 * In the second scan we ignore this zonelist cache and exactly 1701 * In the second scan we ignore this zonelist cache and exactly
1702 * apply the watermarks to all zones, even if it is slower to do so. 1702 * apply the watermarks to all zones, even if it is slower to do so.
1703 * We are low on memory in the second scan, and should leave no stone 1703 * We are low on memory in the second scan, and should leave no stone
1704 * unturned looking for a free page. 1704 * unturned looking for a free page.
1705 */ 1705 */
1706 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1706 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1707 nodemask_t *allowednodes) 1707 nodemask_t *allowednodes)
1708 { 1708 {
1709 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1709 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1710 int i; /* index of *z in zonelist zones */ 1710 int i; /* index of *z in zonelist zones */
1711 int n; /* node that zone *z is on */ 1711 int n; /* node that zone *z is on */
1712 1712
1713 zlc = zonelist->zlcache_ptr; 1713 zlc = zonelist->zlcache_ptr;
1714 if (!zlc) 1714 if (!zlc)
1715 return 1; 1715 return 1;
1716 1716
1717 i = z - zonelist->_zonerefs; 1717 i = z - zonelist->_zonerefs;
1718 n = zlc->z_to_n[i]; 1718 n = zlc->z_to_n[i];
1719 1719
1720 /* This zone is worth trying if it is allowed but not full */ 1720 /* This zone is worth trying if it is allowed but not full */
1721 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1721 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1722 } 1722 }
1723 1723
1724 /* 1724 /*
1725 * Given 'z' scanning a zonelist, set the corresponding bit in 1725 * Given 'z' scanning a zonelist, set the corresponding bit in
1726 * zlc->fullzones, so that subsequent attempts to allocate a page 1726 * zlc->fullzones, so that subsequent attempts to allocate a page
1727 * from that zone don't waste time re-examining it. 1727 * from that zone don't waste time re-examining it.
1728 */ 1728 */
1729 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1729 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1730 { 1730 {
1731 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1731 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1732 int i; /* index of *z in zonelist zones */ 1732 int i; /* index of *z in zonelist zones */
1733 1733
1734 zlc = zonelist->zlcache_ptr; 1734 zlc = zonelist->zlcache_ptr;
1735 if (!zlc) 1735 if (!zlc)
1736 return; 1736 return;
1737 1737
1738 i = z - zonelist->_zonerefs; 1738 i = z - zonelist->_zonerefs;
1739 1739
1740 set_bit(i, zlc->fullzones); 1740 set_bit(i, zlc->fullzones);
1741 } 1741 }
1742 1742
1743 /* 1743 /*
1744 * clear all zones full, called after direct reclaim makes progress so that 1744 * clear all zones full, called after direct reclaim makes progress so that
1745 * a zone that was recently full is not skipped over for up to a second 1745 * a zone that was recently full is not skipped over for up to a second
1746 */ 1746 */
1747 static void zlc_clear_zones_full(struct zonelist *zonelist) 1747 static void zlc_clear_zones_full(struct zonelist *zonelist)
1748 { 1748 {
1749 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1749 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1750 1750
1751 zlc = zonelist->zlcache_ptr; 1751 zlc = zonelist->zlcache_ptr;
1752 if (!zlc) 1752 if (!zlc)
1753 return; 1753 return;
1754 1754
1755 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1755 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1756 } 1756 }
1757 1757
1758 #else /* CONFIG_NUMA */ 1758 #else /* CONFIG_NUMA */
1759 1759
1760 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1760 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1761 { 1761 {
1762 return NULL; 1762 return NULL;
1763 } 1763 }
1764 1764
1765 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1765 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1766 nodemask_t *allowednodes) 1766 nodemask_t *allowednodes)
1767 { 1767 {
1768 return 1; 1768 return 1;
1769 } 1769 }
1770 1770
1771 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1771 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1772 { 1772 {
1773 } 1773 }
1774 1774
1775 static void zlc_clear_zones_full(struct zonelist *zonelist) 1775 static void zlc_clear_zones_full(struct zonelist *zonelist)
1776 { 1776 {
1777 } 1777 }
1778 #endif /* CONFIG_NUMA */ 1778 #endif /* CONFIG_NUMA */
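
The zonelist cache above is essentially a per-zonelist bitmap of "recently looked full" zones plus a timestamp; zlc_setup() wipes the bitmap once it is more than a second old, so a zone is never written off for long. A toy userspace model of that idea (the real code keys the bits by zoneref index and uses jiffies; here the zap is folded into the lookup for brevity):

#include <stdio.h>
#include <string.h>
#include <time.h>

#define MAX_ZONES 8

struct zlc_model {
        unsigned char full[MAX_ZONES];  /* stand-in for zlc->fullzones bitmap */
        time_t last_zap;                /* stand-in for zlc->last_full_zap    */
};

static int zone_worth_trying(struct zlc_model *zlc, int zone_idx)
{
        if (time(NULL) > zlc->last_zap + 1) {           /* stale after ~1 second */
                memset(zlc->full, 0, sizeof(zlc->full));
                zlc->last_zap = time(NULL);
        }
        return !zlc->full[zone_idx];
}

int main(void)
{
        struct zlc_model zlc = { .last_zap = time(NULL) };

        zlc.full[2] = 1;                                /* zone 2 failed a watermark check */
        printf("zone 2 worth trying now: %d\n", zone_worth_trying(&zlc, 2));  /* 0 */
        printf("zone 0 worth trying now: %d\n", zone_worth_trying(&zlc, 0));  /* 1 */
        return 0;
}
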
1779 1779
1780 /* 1780 /*
1781 * get_page_from_freelist goes through the zonelist trying to allocate 1781 * get_page_from_freelist goes through the zonelist trying to allocate
1782 * a page. 1782 * a page.
1783 */ 1783 */
1784 static struct page * 1784 static struct page *
1785 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1785 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1786 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1786 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1787 struct zone *preferred_zone, int migratetype) 1787 struct zone *preferred_zone, int migratetype)
1788 { 1788 {
1789 struct zoneref *z; 1789 struct zoneref *z;
1790 struct page *page = NULL; 1790 struct page *page = NULL;
1791 int classzone_idx; 1791 int classzone_idx;
1792 struct zone *zone; 1792 struct zone *zone;
1793 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1793 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1794 int zlc_active = 0; /* set if using zonelist_cache */ 1794 int zlc_active = 0; /* set if using zonelist_cache */
1795 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1795 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1796 1796
1797 classzone_idx = zone_idx(preferred_zone); 1797 classzone_idx = zone_idx(preferred_zone);
1798 zonelist_scan: 1798 zonelist_scan:
1799 /* 1799 /*
1800 * Scan zonelist, looking for a zone with enough free. 1800 * Scan zonelist, looking for a zone with enough free.
1801 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1801 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1802 */ 1802 */
1803 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1803 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1804 high_zoneidx, nodemask) { 1804 high_zoneidx, nodemask) {
1805 if (NUMA_BUILD && zlc_active && 1805 if (NUMA_BUILD && zlc_active &&
1806 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1806 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1807 continue; 1807 continue;
1808 if ((alloc_flags & ALLOC_CPUSET) && 1808 if ((alloc_flags & ALLOC_CPUSET) &&
1809 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1809 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1810 continue; 1810 continue;
1811 /* 1811 /*
1812 * When allocating a page cache page for writing, we 1812 * When allocating a page cache page for writing, we
1813 * want to get it from a zone that is within its dirty 1813 * want to get it from a zone that is within its dirty
1814 * limit, such that no single zone holds more than its 1814 * limit, such that no single zone holds more than its
1815 * proportional share of globally allowed dirty pages. 1815 * proportional share of globally allowed dirty pages.
1816 * The dirty limits take into account the zone's 1816 * The dirty limits take into account the zone's
1817 * lowmem reserves and high watermark so that kswapd 1817 * lowmem reserves and high watermark so that kswapd
1818 * should be able to balance it without having to 1818 * should be able to balance it without having to
1819 * write pages from its LRU list. 1819 * write pages from its LRU list.
1820 * 1820 *
1821 * This may look like it could increase pressure on 1821 * This may look like it could increase pressure on
1822 * lower zones by failing allocations in higher zones 1822 * lower zones by failing allocations in higher zones
1823 * before they are full. But the pages that do spill 1823 * before they are full. But the pages that do spill
1824 * over are limited as the lower zones are protected 1824 * over are limited as the lower zones are protected
1825 * by this very same mechanism. It should not become 1825 * by this very same mechanism. It should not become
1826 * a practical burden to them. 1826 * a practical burden to them.
1827 * 1827 *
1828 * XXX: For now, allow allocations to potentially 1828 * XXX: For now, allow allocations to potentially
1829 * exceed the per-zone dirty limit in the slowpath 1829 * exceed the per-zone dirty limit in the slowpath
1830 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1830 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1831 * which is important when on a NUMA setup the allowed 1831 * which is important when on a NUMA setup the allowed
1832 * zones are together not big enough to reach the 1832 * zones are together not big enough to reach the
1833 * global limit. The proper fix for these situations 1833 * global limit. The proper fix for these situations
1834 * will require awareness of zones in the 1834 * will require awareness of zones in the
1835 * dirty-throttling and the flusher threads. 1835 * dirty-throttling and the flusher threads.
1836 */ 1836 */
1837 if ((alloc_flags & ALLOC_WMARK_LOW) && 1837 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1838 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1838 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1839 goto this_zone_full; 1839 goto this_zone_full;
1840 1840
1841 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1841 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1842 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1842 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1843 unsigned long mark; 1843 unsigned long mark;
1844 int ret; 1844 int ret;
1845 1845
1846 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1846 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1847 if (zone_watermark_ok(zone, order, mark, 1847 if (zone_watermark_ok(zone, order, mark,
1848 classzone_idx, alloc_flags)) 1848 classzone_idx, alloc_flags))
1849 goto try_this_zone; 1849 goto try_this_zone;
1850 1850
1851 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { 1851 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1852 /* 1852 /*
1853 * we do zlc_setup if there are multiple nodes 1853 * we do zlc_setup if there are multiple nodes
1854 * and before considering the first zone allowed 1854 * and before considering the first zone allowed
1855 * by the cpuset. 1855 * by the cpuset.
1856 */ 1856 */
1857 allowednodes = zlc_setup(zonelist, alloc_flags); 1857 allowednodes = zlc_setup(zonelist, alloc_flags);
1858 zlc_active = 1; 1858 zlc_active = 1;
1859 did_zlc_setup = 1; 1859 did_zlc_setup = 1;
1860 } 1860 }
1861 1861
1862 if (zone_reclaim_mode == 0) 1862 if (zone_reclaim_mode == 0)
1863 goto this_zone_full; 1863 goto this_zone_full;
1864 1864
1865 /* 1865 /*
1866 * As we may have just activated ZLC, check if the first 1866 * As we may have just activated ZLC, check if the first
1867 * eligible zone has failed zone_reclaim recently. 1867 * eligible zone has failed zone_reclaim recently.
1868 */ 1868 */
1869 if (NUMA_BUILD && zlc_active && 1869 if (NUMA_BUILD && zlc_active &&
1870 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1870 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1871 continue; 1871 continue;
1872 1872
1873 ret = zone_reclaim(zone, gfp_mask, order); 1873 ret = zone_reclaim(zone, gfp_mask, order);
1874 switch (ret) { 1874 switch (ret) {
1875 case ZONE_RECLAIM_NOSCAN: 1875 case ZONE_RECLAIM_NOSCAN:
1876 /* did not scan */ 1876 /* did not scan */
1877 continue; 1877 continue;
1878 case ZONE_RECLAIM_FULL: 1878 case ZONE_RECLAIM_FULL:
1879 /* scanned but unreclaimable */ 1879 /* scanned but unreclaimable */
1880 continue; 1880 continue;
1881 default: 1881 default:
1882 /* did we reclaim enough */ 1882 /* did we reclaim enough */
1883 if (!zone_watermark_ok(zone, order, mark, 1883 if (!zone_watermark_ok(zone, order, mark,
1884 classzone_idx, alloc_flags)) 1884 classzone_idx, alloc_flags))
1885 goto this_zone_full; 1885 goto this_zone_full;
1886 } 1886 }
1887 } 1887 }
1888 1888
1889 try_this_zone: 1889 try_this_zone:
1890 page = buffered_rmqueue(preferred_zone, zone, order, 1890 page = buffered_rmqueue(preferred_zone, zone, order,
1891 gfp_mask, migratetype); 1891 gfp_mask, migratetype);
1892 if (page) 1892 if (page)
1893 break; 1893 break;
1894 this_zone_full: 1894 this_zone_full:
1895 if (NUMA_BUILD) 1895 if (NUMA_BUILD)
1896 zlc_mark_zone_full(zonelist, z); 1896 zlc_mark_zone_full(zonelist, z);
1897 } 1897 }
1898 1898
1899 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1899 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1900 /* Disable zlc cache for second zonelist scan */ 1900 /* Disable zlc cache for second zonelist scan */
1901 zlc_active = 0; 1901 zlc_active = 0;
1902 goto zonelist_scan; 1902 goto zonelist_scan;
1903 } 1903 }
1904 return page; 1904 return page;
1905 } 1905 }
1906 1906
1907 /* 1907 /*
1908 * Large machines with many possible nodes should not always dump per-node 1908 * Large machines with many possible nodes should not always dump per-node
1909 * meminfo in irq context. 1909 * meminfo in irq context.
1910 */ 1910 */
1911 static inline bool should_suppress_show_mem(void) 1911 static inline bool should_suppress_show_mem(void)
1912 { 1912 {
1913 bool ret = false; 1913 bool ret = false;
1914 1914
1915 #if NODES_SHIFT > 8 1915 #if NODES_SHIFT > 8
1916 ret = in_interrupt(); 1916 ret = in_interrupt();
1917 #endif 1917 #endif
1918 return ret; 1918 return ret;
1919 } 1919 }
1920 1920
1921 static DEFINE_RATELIMIT_STATE(nopage_rs, 1921 static DEFINE_RATELIMIT_STATE(nopage_rs,
1922 DEFAULT_RATELIMIT_INTERVAL, 1922 DEFAULT_RATELIMIT_INTERVAL,
1923 DEFAULT_RATELIMIT_BURST); 1923 DEFAULT_RATELIMIT_BURST);
1924 1924
1925 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 1925 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1926 { 1926 {
1927 unsigned int filter = SHOW_MEM_FILTER_NODES; 1927 unsigned int filter = SHOW_MEM_FILTER_NODES;
1928 1928
1929 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 1929 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
1930 debug_guardpage_minorder() > 0) 1930 debug_guardpage_minorder() > 0)
1931 return; 1931 return;
1932 1932
1933 /* 1933 /*
1934 * This documents exceptions given to allocations in certain 1934 * This documents exceptions given to allocations in certain
1935 * contexts that are allowed to allocate outside current's set 1935 * contexts that are allowed to allocate outside current's set
1936 * of allowed nodes. 1936 * of allowed nodes.
1937 */ 1937 */
1938 if (!(gfp_mask & __GFP_NOMEMALLOC)) 1938 if (!(gfp_mask & __GFP_NOMEMALLOC))
1939 if (test_thread_flag(TIF_MEMDIE) || 1939 if (test_thread_flag(TIF_MEMDIE) ||
1940 (current->flags & (PF_MEMALLOC | PF_EXITING))) 1940 (current->flags & (PF_MEMALLOC | PF_EXITING)))
1941 filter &= ~SHOW_MEM_FILTER_NODES; 1941 filter &= ~SHOW_MEM_FILTER_NODES;
1942 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 1942 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
1943 filter &= ~SHOW_MEM_FILTER_NODES; 1943 filter &= ~SHOW_MEM_FILTER_NODES;
1944 1944
1945 if (fmt) { 1945 if (fmt) {
1946 struct va_format vaf; 1946 struct va_format vaf;
1947 va_list args; 1947 va_list args;
1948 1948
1949 va_start(args, fmt); 1949 va_start(args, fmt);
1950 1950
1951 vaf.fmt = fmt; 1951 vaf.fmt = fmt;
1952 vaf.va = &args; 1952 vaf.va = &args;
1953 1953
1954 pr_warn("%pV", &vaf); 1954 pr_warn("%pV", &vaf);
1955 1955
1956 va_end(args); 1956 va_end(args);
1957 } 1957 }
1958 1958
1959 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 1959 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
1960 current->comm, order, gfp_mask); 1960 current->comm, order, gfp_mask);
1961 1961
1962 dump_stack(); 1962 dump_stack();
1963 if (!should_suppress_show_mem()) 1963 if (!should_suppress_show_mem())
1964 show_mem(filter); 1964 show_mem(filter);
1965 } 1965 }
1966 1966
1967 static inline int 1967 static inline int
1968 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1968 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1969 unsigned long did_some_progress, 1969 unsigned long did_some_progress,
1970 unsigned long pages_reclaimed) 1970 unsigned long pages_reclaimed)
1971 { 1971 {
1972 /* Do not loop if specifically requested */ 1972 /* Do not loop if specifically requested */
1973 if (gfp_mask & __GFP_NORETRY) 1973 if (gfp_mask & __GFP_NORETRY)
1974 return 0; 1974 return 0;
1975 1975
1976 /* Always retry if specifically requested */ 1976 /* Always retry if specifically requested */
1977 if (gfp_mask & __GFP_NOFAIL) 1977 if (gfp_mask & __GFP_NOFAIL)
1978 return 1; 1978 return 1;
1979 1979
1980 /* 1980 /*
1981 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 1981 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
1982 * making forward progress without invoking OOM. Suspend also disables 1982 * making forward progress without invoking OOM. Suspend also disables
1983 * storage devices so kswapd will not help. Bail if we are suspending. 1983 * storage devices so kswapd will not help. Bail if we are suspending.
1984 */ 1984 */
1985 if (!did_some_progress && pm_suspended_storage()) 1985 if (!did_some_progress && pm_suspended_storage())
1986 return 0; 1986 return 0;
1987 1987
1988 /* 1988 /*
1989 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 1989 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1990 * means __GFP_NOFAIL, but that may not be true in other 1990 * means __GFP_NOFAIL, but that may not be true in other
1991 * implementations. 1991 * implementations.
1992 */ 1992 */
1993 if (order <= PAGE_ALLOC_COSTLY_ORDER) 1993 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1994 return 1; 1994 return 1;
1995 1995
1996 /* 1996 /*
1997 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 1997 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1998 * specified, then we retry until we no longer reclaim any pages 1998 * specified, then we retry until we no longer reclaim any pages
1999 * (above), or we've reclaimed an order of pages at least as 1999 * (above), or we've reclaimed an order of pages at least as
2000 * large as the allocation's order. In both cases, if the 2000 * large as the allocation's order. In both cases, if the
2001 * allocation still fails, we stop retrying. 2001 * allocation still fails, we stop retrying.
2002 */ 2002 */
2003 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2003 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2004 return 1; 2004 return 1;
2005 2005
2006 return 0; 2006 return 0;
2007 } 2007 }
2008 2008
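/*
 * Editor's worked example for the retry rule above (illustrative, not
 * part of this commit): with PAGE_ALLOC_COSTLY_ORDER == 3, an order-2
 * GFP_KERNEL request always asks to retry here -- the "implicit
 * __GFP_NOFAIL" case. An order-4 request retries only if __GFP_REPEAT
 * is set and fewer than 1 << 4 = 16 pages have been reclaimed so far;
 * once pages_reclaimed reaches 16 without a successful allocation, the
 * caller gives up.
 */
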
2009 static inline struct page * 2009 static inline struct page *
2010 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2010 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2011 struct zonelist *zonelist, enum zone_type high_zoneidx, 2011 struct zonelist *zonelist, enum zone_type high_zoneidx,
2012 nodemask_t *nodemask, struct zone *preferred_zone, 2012 nodemask_t *nodemask, struct zone *preferred_zone,
2013 int migratetype) 2013 int migratetype)
2014 { 2014 {
2015 struct page *page; 2015 struct page *page;
2016 2016
2017 /* Acquire the OOM killer lock for the zones in zonelist */ 2017 /* Acquire the OOM killer lock for the zones in zonelist */
2018 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2018 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2019 schedule_timeout_uninterruptible(1); 2019 schedule_timeout_uninterruptible(1);
2020 return NULL; 2020 return NULL;
2021 } 2021 }
2022 2022
2023 /* 2023 /*
2024 * Go through the zonelist yet one more time, keep very high watermark 2024 * Go through the zonelist yet one more time, keep very high watermark
2025 * here, this is only to catch a parallel oom killing, we must fail if 2025 * here, this is only to catch a parallel oom killing, we must fail if
2026 * we're still under heavy pressure. 2026 * we're still under heavy pressure.
2027 */ 2027 */
2028 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2028 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2029 order, zonelist, high_zoneidx, 2029 order, zonelist, high_zoneidx,
2030 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2030 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2031 preferred_zone, migratetype); 2031 preferred_zone, migratetype);
2032 if (page) 2032 if (page)
2033 goto out; 2033 goto out;
2034 2034
2035 if (!(gfp_mask & __GFP_NOFAIL)) { 2035 if (!(gfp_mask & __GFP_NOFAIL)) {
2036 /* The OOM killer will not help higher order allocs */ 2036 /* The OOM killer will not help higher order allocs */
2037 if (order > PAGE_ALLOC_COSTLY_ORDER) 2037 if (order > PAGE_ALLOC_COSTLY_ORDER)
2038 goto out; 2038 goto out;
2039 /* The OOM killer does not needlessly kill tasks for lowmem */ 2039 /* The OOM killer does not needlessly kill tasks for lowmem */
2040 if (high_zoneidx < ZONE_NORMAL) 2040 if (high_zoneidx < ZONE_NORMAL)
2041 goto out; 2041 goto out;
2042 /* 2042 /*
2043 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2043 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2044 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2044 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2045 * The caller should handle page allocation failure by itself if 2045 * The caller should handle page allocation failure by itself if
2046 * it specifies __GFP_THISNODE. 2046 * it specifies __GFP_THISNODE.
2047 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2047 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2048 */ 2048 */
2049 if (gfp_mask & __GFP_THISNODE) 2049 if (gfp_mask & __GFP_THISNODE)
2050 goto out; 2050 goto out;
2051 } 2051 }
2052 /* Exhausted what can be done so it's blamo time */ 2052 /* Exhausted what can be done so it's blamo time */
2053 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2053 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2054 2054
2055 out: 2055 out:
2056 clear_zonelist_oom(zonelist, gfp_mask); 2056 clear_zonelist_oom(zonelist, gfp_mask);
2057 return page; 2057 return page;
2058 } 2058 }
2059 2059
2060 #ifdef CONFIG_COMPACTION 2060 #ifdef CONFIG_COMPACTION
2061 /* Try memory compaction for high-order allocations before reclaim */ 2061 /* Try memory compaction for high-order allocations before reclaim */
2062 static struct page * 2062 static struct page *
2063 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2063 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2064 struct zonelist *zonelist, enum zone_type high_zoneidx, 2064 struct zonelist *zonelist, enum zone_type high_zoneidx,
2065 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2065 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2066 int migratetype, bool sync_migration, 2066 int migratetype, bool sync_migration,
2067 bool *deferred_compaction, 2067 bool *deferred_compaction,
2068 unsigned long *did_some_progress) 2068 unsigned long *did_some_progress)
2069 { 2069 {
2070 struct page *page; 2070 struct page *page;
2071 2071
2072 if (!order) 2072 if (!order)
2073 return NULL; 2073 return NULL;
2074 2074
2075 if (compaction_deferred(preferred_zone, order)) { 2075 if (compaction_deferred(preferred_zone, order)) {
2076 *deferred_compaction = true; 2076 *deferred_compaction = true;
2077 return NULL; 2077 return NULL;
2078 } 2078 }
2079 2079
2080 current->flags |= PF_MEMALLOC; 2080 current->flags |= PF_MEMALLOC;
2081 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2081 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2082 nodemask, sync_migration); 2082 nodemask, sync_migration);
2083 current->flags &= ~PF_MEMALLOC; 2083 current->flags &= ~PF_MEMALLOC;
2084 if (*did_some_progress != COMPACT_SKIPPED) { 2084 if (*did_some_progress != COMPACT_SKIPPED) {
2085 2085
2086 /* Page migration frees to the PCP lists but we want merging */ 2086 /* Page migration frees to the PCP lists but we want merging */
2087 drain_pages(get_cpu()); 2087 drain_pages(get_cpu());
2088 put_cpu(); 2088 put_cpu();
2089 2089
2090 page = get_page_from_freelist(gfp_mask, nodemask, 2090 page = get_page_from_freelist(gfp_mask, nodemask,
2091 order, zonelist, high_zoneidx, 2091 order, zonelist, high_zoneidx,
2092 alloc_flags, preferred_zone, 2092 alloc_flags, preferred_zone,
2093 migratetype); 2093 migratetype);
2094 if (page) { 2094 if (page) {
2095 preferred_zone->compact_considered = 0; 2095 preferred_zone->compact_considered = 0;
2096 preferred_zone->compact_defer_shift = 0; 2096 preferred_zone->compact_defer_shift = 0;
2097 if (order >= preferred_zone->compact_order_failed) 2097 if (order >= preferred_zone->compact_order_failed)
2098 preferred_zone->compact_order_failed = order + 1; 2098 preferred_zone->compact_order_failed = order + 1;
2099 count_vm_event(COMPACTSUCCESS); 2099 count_vm_event(COMPACTSUCCESS);
2100 return page; 2100 return page;
2101 } 2101 }
2102 2102
2103 /* 2103 /*
2104 * It's bad if a compaction run occurs and fails. 2104 * It's bad if a compaction run occurs and fails.
2105 * The most likely reason is that pages exist, 2105 * The most likely reason is that pages exist,
2106 * but not enough to satisfy watermarks. 2106 * but not enough to satisfy watermarks.
2107 */ 2107 */
2108 count_vm_event(COMPACTFAIL); 2108 count_vm_event(COMPACTFAIL);
2109 2109
2110 /* 2110 /*
2111 * As async compaction considers a subset of pageblocks, only 2111 * As async compaction considers a subset of pageblocks, only
2112 * defer if the failure was a sync compaction failure. 2112 * defer if the failure was a sync compaction failure.
2113 */ 2113 */
2114 if (sync_migration) 2114 if (sync_migration)
2115 defer_compaction(preferred_zone, order); 2115 defer_compaction(preferred_zone, order);
2116 2116
2117 cond_resched(); 2117 cond_resched();
2118 } 2118 }
2119 2119
2120 return NULL; 2120 return NULL;
2121 } 2121 }
2122 #else 2122 #else
2123 static inline struct page * 2123 static inline struct page *
2124 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2124 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2125 struct zonelist *zonelist, enum zone_type high_zoneidx, 2125 struct zonelist *zonelist, enum zone_type high_zoneidx,
2126 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2126 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2127 int migratetype, bool sync_migration, 2127 int migratetype, bool sync_migration,
2128 bool *deferred_compaction, 2128 bool *deferred_compaction,
2129 unsigned long *did_some_progress) 2129 unsigned long *did_some_progress)
2130 { 2130 {
2131 return NULL; 2131 return NULL;
2132 } 2132 }
2133 #endif /* CONFIG_COMPACTION */ 2133 #endif /* CONFIG_COMPACTION */
2134 2134
2135 /* Perform direct synchronous page reclaim */ 2135 /* Perform direct synchronous page reclaim */
2136 static int 2136 static int
2137 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2137 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2138 nodemask_t *nodemask) 2138 nodemask_t *nodemask)
2139 { 2139 {
2140 struct reclaim_state reclaim_state; 2140 struct reclaim_state reclaim_state;
2141 int progress; 2141 int progress;
2142 2142
2143 cond_resched(); 2143 cond_resched();
2144 2144
2145 /* We now go into synchronous reclaim */ 2145 /* We now go into synchronous reclaim */
2146 cpuset_memory_pressure_bump(); 2146 cpuset_memory_pressure_bump();
2147 current->flags |= PF_MEMALLOC; 2147 current->flags |= PF_MEMALLOC;
2148 lockdep_set_current_reclaim_state(gfp_mask); 2148 lockdep_set_current_reclaim_state(gfp_mask);
2149 reclaim_state.reclaimed_slab = 0; 2149 reclaim_state.reclaimed_slab = 0;
2150 current->reclaim_state = &reclaim_state; 2150 current->reclaim_state = &reclaim_state;
2151 2151
2152 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2152 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2153 2153
2154 current->reclaim_state = NULL; 2154 current->reclaim_state = NULL;
2155 lockdep_clear_current_reclaim_state(); 2155 lockdep_clear_current_reclaim_state();
2156 current->flags &= ~PF_MEMALLOC; 2156 current->flags &= ~PF_MEMALLOC;
2157 2157
2158 cond_resched(); 2158 cond_resched();
2159 2159
2160 return progress; 2160 return progress;
2161 } 2161 }
2162 2162
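/*
 * Editor's sketch (hypothetical helper, not part of this commit): the
 * PF_MEMALLOC bracket in __perform_reclaim() above is the usual guard
 * against re-entering direct reclaim (see the "Avoid recursion of
 * direct reclaim" check further down). A caller that must not trigger
 * nested reclaim could use the same pattern, saving the old flag value
 * instead of clearing it unconditionally:
 */
static void example_no_reclaim_section(void)
{
        unsigned int pflags = current->flags & PF_MEMALLOC;

        current->flags |= PF_MEMALLOC;
        /* ... allocations here dip into reserves rather than reclaim ... */
        if (!pflags)
                current->flags &= ~PF_MEMALLOC;
}
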
2163 /* The really slow allocator path where we enter direct reclaim */ 2163 /* The really slow allocator path where we enter direct reclaim */
2164 static inline struct page * 2164 static inline struct page *
2165 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2165 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2166 struct zonelist *zonelist, enum zone_type high_zoneidx, 2166 struct zonelist *zonelist, enum zone_type high_zoneidx,
2167 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2167 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2168 int migratetype, unsigned long *did_some_progress) 2168 int migratetype, unsigned long *did_some_progress)
2169 { 2169 {
2170 struct page *page = NULL; 2170 struct page *page = NULL;
2171 bool drained = false; 2171 bool drained = false;
2172 2172
2173 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2173 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2174 nodemask); 2174 nodemask);
2175 if (unlikely(!(*did_some_progress))) 2175 if (unlikely(!(*did_some_progress)))
2176 return NULL; 2176 return NULL;
2177 2177
2178 /* After successful reclaim, reconsider all zones for allocation */ 2178 /* After successful reclaim, reconsider all zones for allocation */
2179 if (NUMA_BUILD) 2179 if (NUMA_BUILD)
2180 zlc_clear_zones_full(zonelist); 2180 zlc_clear_zones_full(zonelist);
2181 2181
2182 retry: 2182 retry:
2183 page = get_page_from_freelist(gfp_mask, nodemask, order, 2183 page = get_page_from_freelist(gfp_mask, nodemask, order,
2184 zonelist, high_zoneidx, 2184 zonelist, high_zoneidx,
2185 alloc_flags, preferred_zone, 2185 alloc_flags, preferred_zone,
2186 migratetype); 2186 migratetype);
2187 2187
2188 /* 2188 /*
2189 * If an allocation failed after direct reclaim, it could be because 2189 * If an allocation failed after direct reclaim, it could be because
2190 * pages are pinned on the per-cpu lists. Drain them and try again 2190 * pages are pinned on the per-cpu lists. Drain them and try again
2191 */ 2191 */
2192 if (!page && !drained) { 2192 if (!page && !drained) {
2193 drain_all_pages(); 2193 drain_all_pages();
2194 drained = true; 2194 drained = true;
2195 goto retry; 2195 goto retry;
2196 } 2196 }
2197 2197
2198 return page; 2198 return page;
2199 } 2199 }
2200 2200
2201 /* 2201 /*
2202 * This is called in the allocator slow-path if the allocation request is of 2202 * This is called in the allocator slow-path if the allocation request is of
2203 * sufficient urgency to ignore watermarks and take other desperate measures 2203 * sufficient urgency to ignore watermarks and take other desperate measures
2204 */ 2204 */
2205 static inline struct page * 2205 static inline struct page *
2206 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2206 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2207 struct zonelist *zonelist, enum zone_type high_zoneidx, 2207 struct zonelist *zonelist, enum zone_type high_zoneidx,
2208 nodemask_t *nodemask, struct zone *preferred_zone, 2208 nodemask_t *nodemask, struct zone *preferred_zone,
2209 int migratetype) 2209 int migratetype)
2210 { 2210 {
2211 struct page *page; 2211 struct page *page;
2212 2212
2213 do { 2213 do {
2214 page = get_page_from_freelist(gfp_mask, nodemask, order, 2214 page = get_page_from_freelist(gfp_mask, nodemask, order,
2215 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2215 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2216 preferred_zone, migratetype); 2216 preferred_zone, migratetype);
2217 2217
2218 if (!page && gfp_mask & __GFP_NOFAIL) 2218 if (!page && gfp_mask & __GFP_NOFAIL)
2219 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2219 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2220 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2220 } while (!page && (gfp_mask & __GFP_NOFAIL));
2221 2221
2222 return page; 2222 return page;
2223 } 2223 }
2224 2224
2225 static inline 2225 static inline
2226 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2226 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
2227 enum zone_type high_zoneidx, 2227 enum zone_type high_zoneidx,
2228 enum zone_type classzone_idx) 2228 enum zone_type classzone_idx)
2229 { 2229 {
2230 struct zoneref *z; 2230 struct zoneref *z;
2231 struct zone *zone; 2231 struct zone *zone;
2232 2232
2233 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2233 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2234 wakeup_kswapd(zone, order, classzone_idx); 2234 wakeup_kswapd(zone, order, classzone_idx);
2235 } 2235 }
2236 2236
2237 static inline int 2237 static inline int
2238 gfp_to_alloc_flags(gfp_t gfp_mask) 2238 gfp_to_alloc_flags(gfp_t gfp_mask)
2239 { 2239 {
2240 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2240 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2241 const gfp_t wait = gfp_mask & __GFP_WAIT; 2241 const gfp_t wait = gfp_mask & __GFP_WAIT;
2242 2242
2243 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2243 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2244 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2244 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2245 2245
2246 /* 2246 /*
2247 * The caller may dip into page reserves a bit more if the caller 2247 * The caller may dip into page reserves a bit more if the caller
2248 * cannot run direct reclaim, or if the caller has realtime scheduling 2248 * cannot run direct reclaim, or if the caller has realtime scheduling
2249 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2249 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2250 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2250 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
2251 */ 2251 */
2252 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2252 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2253 2253
2254 if (!wait) { 2254 if (!wait) {
2255 /* 2255 /*
2256 * Not worth trying to allocate harder for 2256 * Not worth trying to allocate harder for
2257 * __GFP_NOMEMALLOC even if it can't schedule. 2257 * __GFP_NOMEMALLOC even if it can't schedule.
2258 */ 2258 */
2259 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2259 if (!(gfp_mask & __GFP_NOMEMALLOC))
2260 alloc_flags |= ALLOC_HARDER; 2260 alloc_flags |= ALLOC_HARDER;
2261 /* 2261 /*
2262 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2262 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
2263 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2263 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
2264 */ 2264 */
2265 alloc_flags &= ~ALLOC_CPUSET; 2265 alloc_flags &= ~ALLOC_CPUSET;
2266 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2266 } else if (unlikely(rt_task(current)) && !in_interrupt())
2267 alloc_flags |= ALLOC_HARDER; 2267 alloc_flags |= ALLOC_HARDER;
2268 2268
2269 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2269 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2270 if (!in_interrupt() && 2270 if (!in_interrupt() &&
2271 ((current->flags & PF_MEMALLOC) || 2271 ((current->flags & PF_MEMALLOC) ||
2272 unlikely(test_thread_flag(TIF_MEMDIE)))) 2272 unlikely(test_thread_flag(TIF_MEMDIE))))
2273 alloc_flags |= ALLOC_NO_WATERMARKS; 2273 alloc_flags |= ALLOC_NO_WATERMARKS;
2274 } 2274 }
2275 2275
2276 return alloc_flags; 2276 return alloc_flags;
2277 } 2277 }
2278 2278
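/*
 * Editor's worked example for gfp_to_alloc_flags() above (illustrative,
 * not part of this commit). At the time of this code GFP_ATOMIC is
 * __GFP_HIGH without __GFP_WAIT, so it yields ALLOC_WMARK_MIN |
 * ALLOC_HIGH | ALLOC_HARDER with ALLOC_CPUSET cleared: atomic callers
 * may dig deeper into the reserves but cannot sleep. GFP_KERNEL has
 * __GFP_WAIT set, so it keeps ALLOC_WMARK_MIN | ALLOC_CPUSET and only
 * gains ALLOC_HARDER when the caller is a realtime task outside
 * interrupt context.
 */
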
2279 static inline struct page * 2279 static inline struct page *
2280 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2280 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2281 struct zonelist *zonelist, enum zone_type high_zoneidx, 2281 struct zonelist *zonelist, enum zone_type high_zoneidx,
2282 nodemask_t *nodemask, struct zone *preferred_zone, 2282 nodemask_t *nodemask, struct zone *preferred_zone,
2283 int migratetype) 2283 int migratetype)
2284 { 2284 {
2285 const gfp_t wait = gfp_mask & __GFP_WAIT; 2285 const gfp_t wait = gfp_mask & __GFP_WAIT;
2286 struct page *page = NULL; 2286 struct page *page = NULL;
2287 int alloc_flags; 2287 int alloc_flags;
2288 unsigned long pages_reclaimed = 0; 2288 unsigned long pages_reclaimed = 0;
2289 unsigned long did_some_progress; 2289 unsigned long did_some_progress;
2290 bool sync_migration = false; 2290 bool sync_migration = false;
2291 bool deferred_compaction = false; 2291 bool deferred_compaction = false;
2292 2292
2293 /* 2293 /*
2294 * In the slowpath, we sanity check order to avoid ever trying to 2294 * In the slowpath, we sanity check order to avoid ever trying to
2295 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2295 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2296 * be using allocators in order of preference for an area that is 2296 * be using allocators in order of preference for an area that is
2297 * too large. 2297 * too large.
2298 */ 2298 */
2299 if (order >= MAX_ORDER) { 2299 if (order >= MAX_ORDER) {
2300 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2300 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2301 return NULL; 2301 return NULL;
2302 } 2302 }
2303 2303
2304 /* 2304 /*
2305 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2305 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2306 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2306 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2307 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim 2307 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim
2308 * using a larger set of nodes after it has established that the 2308 * using a larger set of nodes after it has established that the
2309 * allowed per node queues are empty and that nodes are 2309 * allowed per node queues are empty and that nodes are
2310 * over allocated. 2310 * over allocated.
2311 */ 2311 */
2312 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2312 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2313 goto nopage; 2313 goto nopage;
2314 2314
2315 restart: 2315 restart:
2316 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2316 if (!(gfp_mask & __GFP_NO_KSWAPD))
2317 wake_all_kswapd(order, zonelist, high_zoneidx, 2317 wake_all_kswapd(order, zonelist, high_zoneidx,
2318 zone_idx(preferred_zone)); 2318 zone_idx(preferred_zone));
2319 2319
2320 /* 2320 /*
2321 * OK, we're below the kswapd watermark and have kicked background 2321 * OK, we're below the kswapd watermark and have kicked background
2322 * reclaim. Now things get more complex, so set up alloc_flags according 2322 * reclaim. Now things get more complex, so set up alloc_flags according
2323 * to how we want to proceed. 2323 * to how we want to proceed.
2324 */ 2324 */
2325 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2325 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2326 2326
2327 /* 2327 /*
2328 * Find the true preferred zone if the allocation is unconstrained by 2328 * Find the true preferred zone if the allocation is unconstrained by
2329 * cpusets. 2329 * cpusets.
2330 */ 2330 */
2331 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2331 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2332 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2332 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2333 &preferred_zone); 2333 &preferred_zone);
2334 2334
2335 rebalance: 2335 rebalance:
2336 /* This is the last chance, in general, before the goto nopage. */ 2336 /* This is the last chance, in general, before the goto nopage. */
2337 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2337 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2338 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2338 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2339 preferred_zone, migratetype); 2339 preferred_zone, migratetype);
2340 if (page) 2340 if (page)
2341 goto got_pg; 2341 goto got_pg;
2342 2342
2343 /* Allocate without watermarks if the context allows */ 2343 /* Allocate without watermarks if the context allows */
2344 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2344 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2345 page = __alloc_pages_high_priority(gfp_mask, order, 2345 page = __alloc_pages_high_priority(gfp_mask, order,
2346 zonelist, high_zoneidx, nodemask, 2346 zonelist, high_zoneidx, nodemask,
2347 preferred_zone, migratetype); 2347 preferred_zone, migratetype);
2348 if (page) 2348 if (page)
2349 goto got_pg; 2349 goto got_pg;
2350 } 2350 }
2351 2351
2352 /* Atomic allocations - we can't balance anything */ 2352 /* Atomic allocations - we can't balance anything */
2353 if (!wait) 2353 if (!wait)
2354 goto nopage; 2354 goto nopage;
2355 2355
2356 /* Avoid recursion of direct reclaim */ 2356 /* Avoid recursion of direct reclaim */
2357 if (current->flags & PF_MEMALLOC) 2357 if (current->flags & PF_MEMALLOC)
2358 goto nopage; 2358 goto nopage;
2359 2359
2360 /* Avoid allocations with no watermarks from looping endlessly */ 2360 /* Avoid allocations with no watermarks from looping endlessly */
2361 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2361 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2362 goto nopage; 2362 goto nopage;
2363 2363
2364 /* 2364 /*
2365 * Try direct compaction. The first pass is asynchronous. Subsequent 2365 * Try direct compaction. The first pass is asynchronous. Subsequent
2366 * attempts after direct reclaim are synchronous 2366 * attempts after direct reclaim are synchronous
2367 */ 2367 */
2368 page = __alloc_pages_direct_compact(gfp_mask, order, 2368 page = __alloc_pages_direct_compact(gfp_mask, order,
2369 zonelist, high_zoneidx, 2369 zonelist, high_zoneidx,
2370 nodemask, 2370 nodemask,
2371 alloc_flags, preferred_zone, 2371 alloc_flags, preferred_zone,
2372 migratetype, sync_migration, 2372 migratetype, sync_migration,
2373 &deferred_compaction, 2373 &deferred_compaction,
2374 &did_some_progress); 2374 &did_some_progress);
2375 if (page) 2375 if (page)
2376 goto got_pg; 2376 goto got_pg;
2377 sync_migration = true; 2377 sync_migration = true;
2378 2378
2379 /* 2379 /*
2380 * If compaction is deferred for high-order allocations, it is because 2380 * If compaction is deferred for high-order allocations, it is because
2381 * sync compaction recently failed. If this is the case and the caller 2381 * sync compaction recently failed. If this is the case and the caller
2382 * has requested the system not be heavily disrupted, fail the 2382 * has requested the system not be heavily disrupted, fail the
2383 * allocation now instead of entering direct reclaim 2383 * allocation now instead of entering direct reclaim
2384 */ 2384 */
2385 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) 2385 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
2386 goto nopage; 2386 goto nopage;
2387 2387
2388 /* Try direct reclaim and then allocating */ 2388 /* Try direct reclaim and then allocating */
2389 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2389 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2390 zonelist, high_zoneidx, 2390 zonelist, high_zoneidx,
2391 nodemask, 2391 nodemask,
2392 alloc_flags, preferred_zone, 2392 alloc_flags, preferred_zone,
2393 migratetype, &did_some_progress); 2393 migratetype, &did_some_progress);
2394 if (page) 2394 if (page)
2395 goto got_pg; 2395 goto got_pg;
2396 2396
2397 /* 2397 /*
2398 * If we failed to make any progress reclaiming, then we are 2398 * If we failed to make any progress reclaiming, then we are
2399 * running out of options and have to consider going OOM 2399 * running out of options and have to consider going OOM
2400 */ 2400 */
2401 if (!did_some_progress) { 2401 if (!did_some_progress) {
2402 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2402 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2403 if (oom_killer_disabled) 2403 if (oom_killer_disabled)
2404 goto nopage; 2404 goto nopage;
2405 /* Coredumps can quickly deplete all memory reserves */ 2405 /* Coredumps can quickly deplete all memory reserves */
2406 if ((current->flags & PF_DUMPCORE) && 2406 if ((current->flags & PF_DUMPCORE) &&
2407 !(gfp_mask & __GFP_NOFAIL)) 2407 !(gfp_mask & __GFP_NOFAIL))
2408 goto nopage; 2408 goto nopage;
2409 page = __alloc_pages_may_oom(gfp_mask, order, 2409 page = __alloc_pages_may_oom(gfp_mask, order,
2410 zonelist, high_zoneidx, 2410 zonelist, high_zoneidx,
2411 nodemask, preferred_zone, 2411 nodemask, preferred_zone,
2412 migratetype); 2412 migratetype);
2413 if (page) 2413 if (page)
2414 goto got_pg; 2414 goto got_pg;
2415 2415
2416 if (!(gfp_mask & __GFP_NOFAIL)) { 2416 if (!(gfp_mask & __GFP_NOFAIL)) {
2417 /* 2417 /*
2418 * The oom killer is not called for high-order 2418 * The oom killer is not called for high-order
2419 * allocations that may fail, so if no progress 2419 * allocations that may fail, so if no progress
2420 * is being made, there are no other options and 2420 * is being made, there are no other options and
2421 * retrying is unlikely to help. 2421 * retrying is unlikely to help.
2422 */ 2422 */
2423 if (order > PAGE_ALLOC_COSTLY_ORDER) 2423 if (order > PAGE_ALLOC_COSTLY_ORDER)
2424 goto nopage; 2424 goto nopage;
2425 /* 2425 /*
2426 * The oom killer is not called for lowmem 2426 * The oom killer is not called for lowmem
2427 * allocations to prevent needlessly killing 2427 * allocations to prevent needlessly killing
2428 * innocent tasks. 2428 * innocent tasks.
2429 */ 2429 */
2430 if (high_zoneidx < ZONE_NORMAL) 2430 if (high_zoneidx < ZONE_NORMAL)
2431 goto nopage; 2431 goto nopage;
2432 } 2432 }
2433 2433
2434 goto restart; 2434 goto restart;
2435 } 2435 }
2436 } 2436 }
2437 2437
2438 /* Check if we should retry the allocation */ 2438 /* Check if we should retry the allocation */
2439 pages_reclaimed += did_some_progress; 2439 pages_reclaimed += did_some_progress;
2440 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2440 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2441 pages_reclaimed)) { 2441 pages_reclaimed)) {
2442 /* Wait for some write requests to complete then retry */ 2442 /* Wait for some write requests to complete then retry */
2443 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2443 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2444 goto rebalance; 2444 goto rebalance;
2445 } else { 2445 } else {
2446 /* 2446 /*
2447 * High-order allocations do not necessarily loop after 2447 * High-order allocations do not necessarily loop after
2448 * direct reclaim and reclaim/compaction depends on compaction 2448 * direct reclaim and reclaim/compaction depends on compaction
2449 * being called after reclaim so call directly if necessary 2449 * being called after reclaim so call directly if necessary
2450 */ 2450 */
2451 page = __alloc_pages_direct_compact(gfp_mask, order, 2451 page = __alloc_pages_direct_compact(gfp_mask, order,
2452 zonelist, high_zoneidx, 2452 zonelist, high_zoneidx,
2453 nodemask, 2453 nodemask,
2454 alloc_flags, preferred_zone, 2454 alloc_flags, preferred_zone,
2455 migratetype, sync_migration, 2455 migratetype, sync_migration,
2456 &deferred_compaction, 2456 &deferred_compaction,
2457 &did_some_progress); 2457 &did_some_progress);
2458 if (page) 2458 if (page)
2459 goto got_pg; 2459 goto got_pg;
2460 } 2460 }
2461 2461
2462 nopage: 2462 nopage:
2463 warn_alloc_failed(gfp_mask, order, NULL); 2463 warn_alloc_failed(gfp_mask, order, NULL);
2464 return page; 2464 return page;
2465 got_pg: 2465 got_pg:
2466 if (kmemcheck_enabled) 2466 if (kmemcheck_enabled)
2467 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2467 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2468 return page; 2468 return page;
2469 2469
2470 } 2470 }
2471 2471
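/*
 * Editor's summary of __alloc_pages_slowpath() above (descriptive
 * comment only, not part of this commit). The escalation order is:
 * wake kswapd, retry the freelists at the normal watermark, allocate
 * without watermarks if the context allows it, try async compaction,
 * try direct reclaim (draining per-cpu lists on failure), invoke the
 * OOM killer when reclaim made no progress (only for __GFP_FS
 * allocations without __GFP_NORETRY), and finally either loop back via
 * should_alloc_retry() or make one last synchronous compaction attempt.
 */
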
2472 /* 2472 /*
2473 * This is the 'heart' of the zoned buddy allocator. 2473 * This is the 'heart' of the zoned buddy allocator.
2474 */ 2474 */
2475 struct page * 2475 struct page *
2476 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2476 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2477 struct zonelist *zonelist, nodemask_t *nodemask) 2477 struct zonelist *zonelist, nodemask_t *nodemask)
2478 { 2478 {
2479 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2479 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2480 struct zone *preferred_zone; 2480 struct zone *preferred_zone;
2481 struct page *page = NULL; 2481 struct page *page = NULL;
2482 int migratetype = allocflags_to_migratetype(gfp_mask); 2482 int migratetype = allocflags_to_migratetype(gfp_mask);
2483 unsigned int cpuset_mems_cookie; 2483 unsigned int cpuset_mems_cookie;
2484 2484
2485 gfp_mask &= gfp_allowed_mask; 2485 gfp_mask &= gfp_allowed_mask;
2486 2486
2487 lockdep_trace_alloc(gfp_mask); 2487 lockdep_trace_alloc(gfp_mask);
2488 2488
2489 might_sleep_if(gfp_mask & __GFP_WAIT); 2489 might_sleep_if(gfp_mask & __GFP_WAIT);
2490 2490
2491 if (should_fail_alloc_page(gfp_mask, order)) 2491 if (should_fail_alloc_page(gfp_mask, order))
2492 return NULL; 2492 return NULL;
2493 2493
2494 /* 2494 /*
2495 * Check the zones suitable for the gfp_mask contain at least one 2495 * Check the zones suitable for the gfp_mask contain at least one
2496 * valid zone. It's possible to have an empty zonelist as a result 2496 * valid zone. It's possible to have an empty zonelist as a result
2497 * of GFP_THISNODE and a memoryless node 2497 * of GFP_THISNODE and a memoryless node
2498 */ 2498 */
2499 if (unlikely(!zonelist->_zonerefs->zone)) 2499 if (unlikely(!zonelist->_zonerefs->zone))
2500 return NULL; 2500 return NULL;
2501 2501
2502 retry_cpuset: 2502 retry_cpuset:
2503 cpuset_mems_cookie = get_mems_allowed(); 2503 cpuset_mems_cookie = get_mems_allowed();
2504 2504
2505 /* The preferred zone is used for statistics later */ 2505 /* The preferred zone is used for statistics later */
2506 first_zones_zonelist(zonelist, high_zoneidx, 2506 first_zones_zonelist(zonelist, high_zoneidx,
2507 nodemask ? : &cpuset_current_mems_allowed, 2507 nodemask ? : &cpuset_current_mems_allowed,
2508 &preferred_zone); 2508 &preferred_zone);
2509 if (!preferred_zone) 2509 if (!preferred_zone)
2510 goto out; 2510 goto out;
2511 2511
2512 /* First allocation attempt */ 2512 /* First allocation attempt */
2513 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2513 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2514 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, 2514 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
2515 preferred_zone, migratetype); 2515 preferred_zone, migratetype);
2516 if (unlikely(!page)) 2516 if (unlikely(!page))
2517 page = __alloc_pages_slowpath(gfp_mask, order, 2517 page = __alloc_pages_slowpath(gfp_mask, order,
2518 zonelist, high_zoneidx, nodemask, 2518 zonelist, high_zoneidx, nodemask,
2519 preferred_zone, migratetype); 2519 preferred_zone, migratetype);
2520 2520
2521 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2521 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2522 2522
2523 out: 2523 out:
2524 /* 2524 /*
2525 * When updating a task's mems_allowed, it is possible to race with 2525 * When updating a task's mems_allowed, it is possible to race with
2526 * parallel threads in such a way that an allocation can fail while 2526 * parallel threads in such a way that an allocation can fail while
2527 * the mask is being updated. If a page allocation is about to fail, 2527 * the mask is being updated. If a page allocation is about to fail,
2528 * check if the cpuset changed during allocation and if so, retry. 2528 * check if the cpuset changed during allocation and if so, retry.
2529 */ 2529 */
2530 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2530 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2531 goto retry_cpuset; 2531 goto retry_cpuset;
2532 2532
2533 return page; 2533 return page;
2534 } 2534 }
2535 EXPORT_SYMBOL(__alloc_pages_nodemask); 2535 EXPORT_SYMBOL(__alloc_pages_nodemask);
2536 2536
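/*
 * Editor's illustration (hypothetical caller, not part of this commit):
 * most callers reach __alloc_pages_nodemask() through wrappers such as
 * alloc_pages() from gfp.h, which supply the zonelist for the current
 * node. A minimal sketch of allocating and freeing an order-1 block:
 */
static void *example_alloc_two_pages(void)
{
        /* GFP_KERNEL may sleep; order 1 requests two contiguous pages */
        struct page *page = alloc_pages(GFP_KERNEL, 1);

        if (!page)
                return NULL;
        return page_address(page);      /* lowmem pages have a kernel mapping */
}

static void example_free_two_pages(void *addr)
{
        if (addr)
                __free_pages(virt_to_page(addr), 1);
}
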
2537 /* 2537 /*
2538 * Common helper functions. 2538 * Common helper functions.
2539 */ 2539 */
2540 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2540 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2541 { 2541 {
2542 struct page *page; 2542 struct page *page;
2543 2543
2544 /* 2544 /*
2545 * __get_free_pages() returns a 32-bit address, which cannot represent 2545 * __get_free_pages() returns a 32-bit address, which cannot represent
2546 * a highmem page 2546 * a highmem page
2547 */ 2547 */
2548 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2548 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2549 2549
2550 page = alloc_pages(gfp_mask, order); 2550 page = alloc_pages(gfp_mask, order);
2551 if (!page) 2551 if (!page)
2552 return 0; 2552 return 0;
2553 return (unsigned long) page_address(page); 2553 return (unsigned long) page_address(page);
2554 } 2554 }
2555 EXPORT_SYMBOL(__get_free_pages); 2555 EXPORT_SYMBOL(__get_free_pages);
2556 2556
2557 unsigned long get_zeroed_page(gfp_t gfp_mask) 2557 unsigned long get_zeroed_page(gfp_t gfp_mask)
2558 { 2558 {
2559 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2559 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2560 } 2560 }
2561 EXPORT_SYMBOL(get_zeroed_page); 2561 EXPORT_SYMBOL(get_zeroed_page);
2562 2562
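/*
 * Editor's illustration (hypothetical caller, not part of this commit):
 * typical use of the helpers above for a single zeroed page. free_page()
 * is the order-0 wrapper around free_pages(), which is defined further
 * down.
 */
static int example_use_zeroed_page(void)
{
        unsigned long addr = get_zeroed_page(GFP_KERNEL);

        if (!addr)
                return -ENOMEM;
        /* ... use the PAGE_SIZE bytes at (void *)addr ... */
        free_page(addr);
        return 0;
}
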
2563 void __free_pages(struct page *page, unsigned int order) 2563 void __free_pages(struct page *page, unsigned int order)
2564 { 2564 {
2565 if (put_page_testzero(page)) { 2565 if (put_page_testzero(page)) {
2566 if (order == 0) 2566 if (order == 0)
2567 free_hot_cold_page(page, 0); 2567 free_hot_cold_page(page, 0);
2568 else 2568 else
2569 __free_pages_ok(page, order); 2569 __free_pages_ok(page, order);
2570 } 2570 }
2571 } 2571 }
2572 2572
2573 EXPORT_SYMBOL(__free_pages); 2573 EXPORT_SYMBOL(__free_pages);
2574 2574
2575 void free_pages(unsigned long addr, unsigned int order) 2575 void free_pages(unsigned long addr, unsigned int order)
2576 { 2576 {
2577 if (addr != 0) { 2577 if (addr != 0) {
2578 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2578 VM_BUG_ON(!virt_addr_valid((void *)addr));
2579 __free_pages(virt_to_page((void *)addr), order); 2579 __free_pages(virt_to_page((void *)addr), order);
2580 } 2580 }
2581 } 2581 }
2582 2582
2583 EXPORT_SYMBOL(free_pages); 2583 EXPORT_SYMBOL(free_pages);
2584 2584
2585 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2585 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2586 { 2586 {
2587 if (addr) { 2587 if (addr) {
2588 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2588 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2589 unsigned long used = addr + PAGE_ALIGN(size); 2589 unsigned long used = addr + PAGE_ALIGN(size);
2590 2590
2591 split_page(virt_to_page((void *)addr), order); 2591 split_page(virt_to_page((void *)addr), order);
2592 while (used < alloc_end) { 2592 while (used < alloc_end) {
2593 free_page(used); 2593 free_page(used);
2594 used += PAGE_SIZE; 2594 used += PAGE_SIZE;
2595 } 2595 }
2596 } 2596 }
2597 return (void *)addr; 2597 return (void *)addr;
2598 } 2598 }
2599 2599
2600 /** 2600 /**
2601 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2601 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2602 * @size: the number of bytes to allocate 2602 * @size: the number of bytes to allocate
2603 * @gfp_mask: GFP flags for the allocation 2603 * @gfp_mask: GFP flags for the allocation
2604 * 2604 *
2605 * This function is similar to alloc_pages(), except that it allocates the 2605 * This function is similar to alloc_pages(), except that it allocates the
2606 * minimum number of pages to satisfy the request. alloc_pages() can only 2606 * minimum number of pages to satisfy the request. alloc_pages() can only
2607 * allocate memory in power-of-two pages. 2607 * allocate memory in power-of-two pages.
2608 * 2608 *
2609 * This function is also limited by MAX_ORDER. 2609 * This function is also limited by MAX_ORDER.
2610 * 2610 *
2611 * Memory allocated by this function must be released by free_pages_exact(). 2611 * Memory allocated by this function must be released by free_pages_exact().
2612 */ 2612 */
2613 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2613 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2614 { 2614 {
2615 unsigned int order = get_order(size); 2615 unsigned int order = get_order(size);
2616 unsigned long addr; 2616 unsigned long addr;
2617 2617
2618 addr = __get_free_pages(gfp_mask, order); 2618 addr = __get_free_pages(gfp_mask, order);
2619 return make_alloc_exact(addr, order, size); 2619 return make_alloc_exact(addr, order, size);
2620 } 2620 }
2621 EXPORT_SYMBOL(alloc_pages_exact); 2621 EXPORT_SYMBOL(alloc_pages_exact);
2622 2622
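/*
 * Editor's worked example (not part of this commit): a request for
 * 5 * PAGE_SIZE bytes has get_order() == 3, so eight pages are obtained
 * and make_alloc_exact() gives the trailing three back, leaving exactly
 * five contiguous pages. A hypothetical caller pairs the allocation with
 * free_pages_exact(), defined below, passing the same size:
 */
static int example_exact_roundtrip(void)
{
        size_t size = 5 * PAGE_SIZE;    /* order-3 alloc, 3 pages trimmed */
        void *buf = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);

        if (!buf)
                return -ENOMEM;
        /* ... use exactly 'size' bytes at buf ... */
        free_pages_exact(buf, size);    /* must pass the same size */
        return 0;
}
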
2623 /** 2623 /**
2624 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2624 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2625 * pages on a node. 2625 * pages on a node.
2626 * @nid: the preferred node ID where memory should be allocated 2626 * @nid: the preferred node ID where memory should be allocated
2627 * @size: the number of bytes to allocate 2627 * @size: the number of bytes to allocate
2628 * @gfp_mask: GFP flags for the allocation 2628 * @gfp_mask: GFP flags for the allocation
2629 * 2629 *
2630 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2630 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2631 * back. 2631 * back.
2632 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2632 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2633 * but is not exact. 2633 * but is not exact.
2634 */ 2634 */
2635 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2635 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2636 { 2636 {
2637 unsigned order = get_order(size); 2637 unsigned order = get_order(size);
2638 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2638 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2639 if (!p) 2639 if (!p)
2640 return NULL; 2640 return NULL;
2641 return make_alloc_exact((unsigned long)page_address(p), order, size); 2641 return make_alloc_exact((unsigned long)page_address(p), order, size);
2642 } 2642 }
2643 EXPORT_SYMBOL(alloc_pages_exact_nid); 2643 EXPORT_SYMBOL(alloc_pages_exact_nid);
2644 2644
2645 /** 2645 /**
2646 * free_pages_exact - release memory allocated via alloc_pages_exact() 2646 * free_pages_exact - release memory allocated via alloc_pages_exact()
2647 * @virt: the value returned by alloc_pages_exact. 2647 * @virt: the value returned by alloc_pages_exact.
2648 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2648 * @size: size of allocation, same value as passed to alloc_pages_exact().
2649 * 2649 *
2650 * Release the memory allocated by a previous call to alloc_pages_exact. 2650 * Release the memory allocated by a previous call to alloc_pages_exact.
2651 */ 2651 */
2652 void free_pages_exact(void *virt, size_t size) 2652 void free_pages_exact(void *virt, size_t size)
2653 { 2653 {
2654 unsigned long addr = (unsigned long)virt; 2654 unsigned long addr = (unsigned long)virt;
2655 unsigned long end = addr + PAGE_ALIGN(size); 2655 unsigned long end = addr + PAGE_ALIGN(size);
2656 2656
2657 while (addr < end) { 2657 while (addr < end) {
2658 free_page(addr); 2658 free_page(addr);
2659 addr += PAGE_SIZE; 2659 addr += PAGE_SIZE;
2660 } 2660 }
2661 } 2661 }
2662 EXPORT_SYMBOL(free_pages_exact); 2662 EXPORT_SYMBOL(free_pages_exact);
2663 2663
2664 static unsigned int nr_free_zone_pages(int offset) 2664 static unsigned int nr_free_zone_pages(int offset)
2665 { 2665 {
2666 struct zoneref *z; 2666 struct zoneref *z;
2667 struct zone *zone; 2667 struct zone *zone;
2668 2668
2669 /* Just pick one node, since fallback list is circular */ 2669 /* Just pick one node, since fallback list is circular */
2670 unsigned int sum = 0; 2670 unsigned int sum = 0;
2671 2671
2672 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2672 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2673 2673
2674 for_each_zone_zonelist(zone, z, zonelist, offset) { 2674 for_each_zone_zonelist(zone, z, zonelist, offset) {
2675 unsigned long size = zone->present_pages; 2675 unsigned long size = zone->present_pages;
2676 unsigned long high = high_wmark_pages(zone); 2676 unsigned long high = high_wmark_pages(zone);
2677 if (size > high) 2677 if (size > high)
2678 sum += size - high; 2678 sum += size - high;
2679 } 2679 }
2680 2680
2681 return sum; 2681 return sum;
2682 } 2682 }
2683 2683
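/*
 * Editor's worked example for nr_free_zone_pages() above (illustrative,
 * not part of this commit): a zone with 262144 present pages and a high
 * watermark of 4096 pages contributes 258048 pages to the sum; a zone
 * whose watermark exceeds its size contributes nothing. The result is
 * therefore an estimate of pages allocatable above the high watermarks,
 * not a count of currently free memory.
 */
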
2684 /* 2684 /*
2685 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 2685 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
2686 */ 2686 */
2687 unsigned int nr_free_buffer_pages(void) 2687 unsigned int nr_free_buffer_pages(void)
2688 { 2688 {
2689 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2689 return nr_free_zone_pages(gfp_zone(GFP_USER));
2690 } 2690 }
2691 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2691 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2692 2692
2693 /* 2693 /*
2694 * Amount of free RAM allocatable within all zones 2694 * Amount of free RAM allocatable within all zones
2695 */ 2695 */
2696 unsigned int nr_free_pagecache_pages(void) 2696 unsigned int nr_free_pagecache_pages(void)
2697 { 2697 {
2698 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2698 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2699 } 2699 }
2700 2700
2701 static inline void show_node(struct zone *zone) 2701 static inline void show_node(struct zone *zone)
2702 { 2702 {
2703 if (NUMA_BUILD) 2703 if (NUMA_BUILD)
2704 printk("Node %d ", zone_to_nid(zone)); 2704 printk("Node %d ", zone_to_nid(zone));
2705 } 2705 }
2706 2706
2707 void si_meminfo(struct sysinfo *val) 2707 void si_meminfo(struct sysinfo *val)
2708 { 2708 {
2709 val->totalram = totalram_pages; 2709 val->totalram = totalram_pages;
2710 val->sharedram = 0; 2710 val->sharedram = 0;
2711 val->freeram = global_page_state(NR_FREE_PAGES); 2711 val->freeram = global_page_state(NR_FREE_PAGES);
2712 val->bufferram = nr_blockdev_pages(); 2712 val->bufferram = nr_blockdev_pages();
2713 val->totalhigh = totalhigh_pages; 2713 val->totalhigh = totalhigh_pages;
2714 val->freehigh = nr_free_highpages(); 2714 val->freehigh = nr_free_highpages();
2715 val->mem_unit = PAGE_SIZE; 2715 val->mem_unit = PAGE_SIZE;
2716 } 2716 }
2717 2717
2718 EXPORT_SYMBOL(si_meminfo); 2718 EXPORT_SYMBOL(si_meminfo);
2719 2719
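/*
 * Editor's illustration (hypothetical caller, not part of this commit):
 * the counters filled in above are in units of val->mem_unit bytes
 * (PAGE_SIZE here), so converting to bytes is a single multiply.
 */
static u64 example_total_ram_bytes(void)
{
        struct sysinfo si;

        si_meminfo(&si);
        return (u64)si.totalram * si.mem_unit;
}
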
2720 #ifdef CONFIG_NUMA 2720 #ifdef CONFIG_NUMA
2721 void si_meminfo_node(struct sysinfo *val, int nid) 2721 void si_meminfo_node(struct sysinfo *val, int nid)
2722 { 2722 {
2723 pg_data_t *pgdat = NODE_DATA(nid); 2723 pg_data_t *pgdat = NODE_DATA(nid);
2724 2724
2725 val->totalram = pgdat->node_present_pages; 2725 val->totalram = pgdat->node_present_pages;
2726 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2726 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2727 #ifdef CONFIG_HIGHMEM 2727 #ifdef CONFIG_HIGHMEM
2728 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 2728 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
2729 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 2729 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2730 NR_FREE_PAGES); 2730 NR_FREE_PAGES);
2731 #else 2731 #else
2732 val->totalhigh = 0; 2732 val->totalhigh = 0;
2733 val->freehigh = 0; 2733 val->freehigh = 0;
2734 #endif 2734 #endif
2735 val->mem_unit = PAGE_SIZE; 2735 val->mem_unit = PAGE_SIZE;
2736 } 2736 }
2737 #endif 2737 #endif
2738 2738
2739 /* 2739 /*
2740 * Determine whether the node should be displayed or not, depending on whether 2740 * Determine whether the node should be displayed or not, depending on whether
2741 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 2741 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2742 */ 2742 */
2743 bool skip_free_areas_node(unsigned int flags, int nid) 2743 bool skip_free_areas_node(unsigned int flags, int nid)
2744 { 2744 {
2745 bool ret = false; 2745 bool ret = false;
2746 unsigned int cpuset_mems_cookie; 2746 unsigned int cpuset_mems_cookie;
2747 2747
2748 if (!(flags & SHOW_MEM_FILTER_NODES)) 2748 if (!(flags & SHOW_MEM_FILTER_NODES))
2749 goto out; 2749 goto out;
2750 2750
2751 do { 2751 do {
2752 cpuset_mems_cookie = get_mems_allowed(); 2752 cpuset_mems_cookie = get_mems_allowed();
2753 ret = !node_isset(nid, cpuset_current_mems_allowed); 2753 ret = !node_isset(nid, cpuset_current_mems_allowed);
2754 } while (!put_mems_allowed(cpuset_mems_cookie)); 2754 } while (!put_mems_allowed(cpuset_mems_cookie));
2755 out: 2755 out:
2756 return ret; 2756 return ret;
2757 } 2757 }
2758 2758
2759 #define K(x) ((x) << (PAGE_SHIFT-10)) 2759 #define K(x) ((x) << (PAGE_SHIFT-10))
2760 2760
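/*
 * Editor's note (illustrative, not part of this commit): K() converts a
 * page count to kibibytes. With 4 KiB pages, PAGE_SHIFT is 12, so
 * K(x) == x << 2 == x * 4, e.g. K(300) prints as 1200kB below.
 */
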
2761 /* 2761 /*
2762 * Show free area list (used inside shift_scroll-lock stuff) 2762 * Show free area list (used inside shift_scroll-lock stuff)
2763 * We also calculate the percentage fragmentation. We do this by counting the 2763 * We also calculate the percentage fragmentation. We do this by counting the
2764 * memory on each free list with the exception of the first item on the list. 2764 * memory on each free list with the exception of the first item on the list.
2765 * Suppresses nodes that are not allowed by current's cpuset if 2765 * Suppresses nodes that are not allowed by current's cpuset if
2766 * SHOW_MEM_FILTER_NODES is passed. 2766 * SHOW_MEM_FILTER_NODES is passed.
2767 */ 2767 */
2768 void show_free_areas(unsigned int filter) 2768 void show_free_areas(unsigned int filter)
2769 { 2769 {
2770 int cpu; 2770 int cpu;
2771 struct zone *zone; 2771 struct zone *zone;
2772 2772
2773 for_each_populated_zone(zone) { 2773 for_each_populated_zone(zone) {
2774 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2774 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2775 continue; 2775 continue;
2776 show_node(zone); 2776 show_node(zone);
2777 printk("%s per-cpu:\n", zone->name); 2777 printk("%s per-cpu:\n", zone->name);
2778 2778
2779 for_each_online_cpu(cpu) { 2779 for_each_online_cpu(cpu) {
2780 struct per_cpu_pageset *pageset; 2780 struct per_cpu_pageset *pageset;
2781 2781
2782 pageset = per_cpu_ptr(zone->pageset, cpu); 2782 pageset = per_cpu_ptr(zone->pageset, cpu);
2783 2783
2784 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2784 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2785 cpu, pageset->pcp.high, 2785 cpu, pageset->pcp.high,
2786 pageset->pcp.batch, pageset->pcp.count); 2786 pageset->pcp.batch, pageset->pcp.count);
2787 } 2787 }
2788 } 2788 }
2789 2789
2790 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 2790 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2791 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 2791 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2792 " unevictable:%lu" 2792 " unevictable:%lu"
2793 " dirty:%lu writeback:%lu unstable:%lu\n" 2793 " dirty:%lu writeback:%lu unstable:%lu\n"
2794 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2794 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2795 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", 2795 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2796 global_page_state(NR_ACTIVE_ANON), 2796 global_page_state(NR_ACTIVE_ANON),
2797 global_page_state(NR_INACTIVE_ANON), 2797 global_page_state(NR_INACTIVE_ANON),
2798 global_page_state(NR_ISOLATED_ANON), 2798 global_page_state(NR_ISOLATED_ANON),
2799 global_page_state(NR_ACTIVE_FILE), 2799 global_page_state(NR_ACTIVE_FILE),
2800 global_page_state(NR_INACTIVE_FILE), 2800 global_page_state(NR_INACTIVE_FILE),
2801 global_page_state(NR_ISOLATED_FILE), 2801 global_page_state(NR_ISOLATED_FILE),
2802 global_page_state(NR_UNEVICTABLE), 2802 global_page_state(NR_UNEVICTABLE),
2803 global_page_state(NR_FILE_DIRTY), 2803 global_page_state(NR_FILE_DIRTY),
2804 global_page_state(NR_WRITEBACK), 2804 global_page_state(NR_WRITEBACK),
2805 global_page_state(NR_UNSTABLE_NFS), 2805 global_page_state(NR_UNSTABLE_NFS),
2806 global_page_state(NR_FREE_PAGES), 2806 global_page_state(NR_FREE_PAGES),
2807 global_page_state(NR_SLAB_RECLAIMABLE), 2807 global_page_state(NR_SLAB_RECLAIMABLE),
2808 global_page_state(NR_SLAB_UNRECLAIMABLE), 2808 global_page_state(NR_SLAB_UNRECLAIMABLE),
2809 global_page_state(NR_FILE_MAPPED), 2809 global_page_state(NR_FILE_MAPPED),
2810 global_page_state(NR_SHMEM), 2810 global_page_state(NR_SHMEM),
2811 global_page_state(NR_PAGETABLE), 2811 global_page_state(NR_PAGETABLE),
2812 global_page_state(NR_BOUNCE)); 2812 global_page_state(NR_BOUNCE));
2813 2813
2814 for_each_populated_zone(zone) { 2814 for_each_populated_zone(zone) {
2815 int i; 2815 int i;
2816 2816
2817 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2817 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2818 continue; 2818 continue;
2819 show_node(zone); 2819 show_node(zone);
2820 printk("%s" 2820 printk("%s"
2821 " free:%lukB" 2821 " free:%lukB"
2822 " min:%lukB" 2822 " min:%lukB"
2823 " low:%lukB" 2823 " low:%lukB"
2824 " high:%lukB" 2824 " high:%lukB"
2825 " active_anon:%lukB" 2825 " active_anon:%lukB"
2826 " inactive_anon:%lukB" 2826 " inactive_anon:%lukB"
2827 " active_file:%lukB" 2827 " active_file:%lukB"
2828 " inactive_file:%lukB" 2828 " inactive_file:%lukB"
2829 " unevictable:%lukB" 2829 " unevictable:%lukB"
2830 " isolated(anon):%lukB" 2830 " isolated(anon):%lukB"
2831 " isolated(file):%lukB" 2831 " isolated(file):%lukB"
2832 " present:%lukB" 2832 " present:%lukB"
2833 " mlocked:%lukB" 2833 " mlocked:%lukB"
2834 " dirty:%lukB" 2834 " dirty:%lukB"
2835 " writeback:%lukB" 2835 " writeback:%lukB"
2836 " mapped:%lukB" 2836 " mapped:%lukB"
2837 " shmem:%lukB" 2837 " shmem:%lukB"
2838 " slab_reclaimable:%lukB" 2838 " slab_reclaimable:%lukB"
2839 " slab_unreclaimable:%lukB" 2839 " slab_unreclaimable:%lukB"
2840 " kernel_stack:%lukB" 2840 " kernel_stack:%lukB"
2841 " pagetables:%lukB" 2841 " pagetables:%lukB"
2842 " unstable:%lukB" 2842 " unstable:%lukB"
2843 " bounce:%lukB" 2843 " bounce:%lukB"
2844 " writeback_tmp:%lukB" 2844 " writeback_tmp:%lukB"
2845 " pages_scanned:%lu" 2845 " pages_scanned:%lu"
2846 " all_unreclaimable? %s" 2846 " all_unreclaimable? %s"
2847 "\n", 2847 "\n",
2848 zone->name, 2848 zone->name,
2849 K(zone_page_state(zone, NR_FREE_PAGES)), 2849 K(zone_page_state(zone, NR_FREE_PAGES)),
2850 K(min_wmark_pages(zone)), 2850 K(min_wmark_pages(zone)),
2851 K(low_wmark_pages(zone)), 2851 K(low_wmark_pages(zone)),
2852 K(high_wmark_pages(zone)), 2852 K(high_wmark_pages(zone)),
2853 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2853 K(zone_page_state(zone, NR_ACTIVE_ANON)),
2854 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2854 K(zone_page_state(zone, NR_INACTIVE_ANON)),
2855 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2855 K(zone_page_state(zone, NR_ACTIVE_FILE)),
2856 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2856 K(zone_page_state(zone, NR_INACTIVE_FILE)),
2857 K(zone_page_state(zone, NR_UNEVICTABLE)), 2857 K(zone_page_state(zone, NR_UNEVICTABLE)),
2858 K(zone_page_state(zone, NR_ISOLATED_ANON)), 2858 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2859 K(zone_page_state(zone, NR_ISOLATED_FILE)), 2859 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2860 K(zone->present_pages), 2860 K(zone->present_pages),
2861 K(zone_page_state(zone, NR_MLOCK)), 2861 K(zone_page_state(zone, NR_MLOCK)),
2862 K(zone_page_state(zone, NR_FILE_DIRTY)), 2862 K(zone_page_state(zone, NR_FILE_DIRTY)),
2863 K(zone_page_state(zone, NR_WRITEBACK)), 2863 K(zone_page_state(zone, NR_WRITEBACK)),
2864 K(zone_page_state(zone, NR_FILE_MAPPED)), 2864 K(zone_page_state(zone, NR_FILE_MAPPED)),
2865 K(zone_page_state(zone, NR_SHMEM)), 2865 K(zone_page_state(zone, NR_SHMEM)),
2866 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 2866 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2867 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 2867 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2868 zone_page_state(zone, NR_KERNEL_STACK) * 2868 zone_page_state(zone, NR_KERNEL_STACK) *
2869 THREAD_SIZE / 1024, 2869 THREAD_SIZE / 1024,
2870 K(zone_page_state(zone, NR_PAGETABLE)), 2870 K(zone_page_state(zone, NR_PAGETABLE)),
2871 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 2871 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2872 K(zone_page_state(zone, NR_BOUNCE)), 2872 K(zone_page_state(zone, NR_BOUNCE)),
2873 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2873 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2874 zone->pages_scanned, 2874 zone->pages_scanned,
2875 (zone->all_unreclaimable ? "yes" : "no") 2875 (zone->all_unreclaimable ? "yes" : "no")
2876 ); 2876 );
2877 printk("lowmem_reserve[]:"); 2877 printk("lowmem_reserve[]:");
2878 for (i = 0; i < MAX_NR_ZONES; i++) 2878 for (i = 0; i < MAX_NR_ZONES; i++)
2879 printk(" %lu", zone->lowmem_reserve[i]); 2879 printk(" %lu", zone->lowmem_reserve[i]);
2880 printk("\n"); 2880 printk("\n");
2881 } 2881 }
2882 2882
2883 for_each_populated_zone(zone) { 2883 for_each_populated_zone(zone) {
2884 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2884 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2885 2885
2886 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2886 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2887 continue; 2887 continue;
2888 show_node(zone); 2888 show_node(zone);
2889 printk("%s: ", zone->name); 2889 printk("%s: ", zone->name);
2890 2890
2891 spin_lock_irqsave(&zone->lock, flags); 2891 spin_lock_irqsave(&zone->lock, flags);
2892 for (order = 0; order < MAX_ORDER; order++) { 2892 for (order = 0; order < MAX_ORDER; order++) {
2893 nr[order] = zone->free_area[order].nr_free; 2893 nr[order] = zone->free_area[order].nr_free;
2894 total += nr[order] << order; 2894 total += nr[order] << order;
2895 } 2895 }
2896 spin_unlock_irqrestore(&zone->lock, flags); 2896 spin_unlock_irqrestore(&zone->lock, flags);
2897 for (order = 0; order < MAX_ORDER; order++) 2897 for (order = 0; order < MAX_ORDER; order++)
2898 printk("%lu*%lukB ", nr[order], K(1UL) << order); 2898 printk("%lu*%lukB ", nr[order], K(1UL) << order);
2899 printk("= %lukB\n", K(total)); 2899 printk("= %lukB\n", K(total));
2900 } 2900 }
2901 2901
2902 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 2902 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
2903 2903
2904 show_swap_cache_info(); 2904 show_swap_cache_info();
2905 } 2905 }
2906 2906
2907 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 2907 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
2908 { 2908 {
2909 zoneref->zone = zone; 2909 zoneref->zone = zone;
2910 zoneref->zone_idx = zone_idx(zone); 2910 zoneref->zone_idx = zone_idx(zone);
2911 } 2911 }
2912 2912
2913 /* 2913 /*
2914 * Builds allocation fallback zone lists. 2914 * Builds allocation fallback zone lists.
2915 * 2915 *
2916 * Add all populated zones of a node to the zonelist. 2916 * Add all populated zones of a node to the zonelist.
2917 */ 2917 */
2918 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 2918 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
2919 int nr_zones, enum zone_type zone_type) 2919 int nr_zones, enum zone_type zone_type)
2920 { 2920 {
2921 struct zone *zone; 2921 struct zone *zone;
2922 2922
2923 BUG_ON(zone_type >= MAX_NR_ZONES); 2923 BUG_ON(zone_type >= MAX_NR_ZONES);
2924 zone_type++; 2924 zone_type++;
2925 2925
2926 do { 2926 do {
2927 zone_type--; 2927 zone_type--;
2928 zone = pgdat->node_zones + zone_type; 2928 zone = pgdat->node_zones + zone_type;
2929 if (populated_zone(zone)) { 2929 if (populated_zone(zone)) {
2930 zoneref_set_zone(zone, 2930 zoneref_set_zone(zone,
2931 &zonelist->_zonerefs[nr_zones++]); 2931 &zonelist->_zonerefs[nr_zones++]);
2932 check_highest_zone(zone_type); 2932 check_highest_zone(zone_type);
2933 } 2933 }
2934 2934
2935 } while (zone_type); 2935 } while (zone_type);
2936 return nr_zones; 2936 return nr_zones;
2937 } 2937 }
2938 2938
2939 2939
2940 /* 2940 /*
2941 * zonelist_order: 2941 * zonelist_order:
2942 * 0 = automatic detection of better ordering. 2942 * 0 = automatic detection of better ordering.
2943 * 1 = order by ([node] distance, -zonetype) 2943 * 1 = order by ([node] distance, -zonetype)
2944 * 2 = order by (-zonetype, [node] distance) 2944 * 2 = order by (-zonetype, [node] distance)
2945 * 2945 *
2946 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 2946 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
2947 * the same zonelist. So only NUMA can configure this param. 2947 * the same zonelist. So only NUMA can configure this param.
2948 */ 2948 */
2949 #define ZONELIST_ORDER_DEFAULT 0 2949 #define ZONELIST_ORDER_DEFAULT 0
2950 #define ZONELIST_ORDER_NODE 1 2950 #define ZONELIST_ORDER_NODE 1
2951 #define ZONELIST_ORDER_ZONE 2 2951 #define ZONELIST_ORDER_ZONE 2
2952 2952
2953 /* zonelist order in the kernel. 2953 /* zonelist order in the kernel.
2954 * set_zonelist_order() will set this to NODE or ZONE. 2954 * set_zonelist_order() will set this to NODE or ZONE.
2955 */ 2955 */
2956 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 2956 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
2957 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 2957 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
2958 2958
2959 2959
2960 #ifdef CONFIG_NUMA 2960 #ifdef CONFIG_NUMA
2961 /* The value the user specified, possibly changed by config */ 2961 /* The value the user specified, possibly changed by config */
2962 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 2962 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
2963 /* string for sysctl */ 2963 /* string for sysctl */
2964 #define NUMA_ZONELIST_ORDER_LEN 16 2964 #define NUMA_ZONELIST_ORDER_LEN 16
2965 char numa_zonelist_order[16] = "default"; 2965 char numa_zonelist_order[16] = "default";
2966 2966
2967 /* 2967 /*
2968 * Interface for configuring zonelist ordering. 2968 * Interface for configuring zonelist ordering.
2969 * command line option "numa_zonelist_order" 2969 * command line option "numa_zonelist_order"
2970 * = "[dD]efault" - default, automatic configuration. 2970 * = "[dD]efault" - default, automatic configuration.
2971 * = "[nN]ode" - order by node locality, then by zone within node 2971 * = "[nN]ode" - order by node locality, then by zone within node
2972 * = "[zZ]one" - order by zone, then by locality within zone 2972 * = "[zZ]one" - order by zone, then by locality within zone
2973 */ 2973 */
2974 2974
2975 static int __parse_numa_zonelist_order(char *s) 2975 static int __parse_numa_zonelist_order(char *s)
2976 { 2976 {
2977 if (*s == 'd' || *s == 'D') { 2977 if (*s == 'd' || *s == 'D') {
2978 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 2978 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
2979 } else if (*s == 'n' || *s == 'N') { 2979 } else if (*s == 'n' || *s == 'N') {
2980 user_zonelist_order = ZONELIST_ORDER_NODE; 2980 user_zonelist_order = ZONELIST_ORDER_NODE;
2981 } else if (*s == 'z' || *s == 'Z') { 2981 } else if (*s == 'z' || *s == 'Z') {
2982 user_zonelist_order = ZONELIST_ORDER_ZONE; 2982 user_zonelist_order = ZONELIST_ORDER_ZONE;
2983 } else { 2983 } else {
2984 printk(KERN_WARNING 2984 printk(KERN_WARNING
2985 "Ignoring invalid numa_zonelist_order value: " 2985 "Ignoring invalid numa_zonelist_order value: "
2986 "%s\n", s); 2986 "%s\n", s);
2987 return -EINVAL; 2987 return -EINVAL;
2988 } 2988 }
2989 return 0; 2989 return 0;
2990 } 2990 }
2991 2991
2992 static __init int setup_numa_zonelist_order(char *s) 2992 static __init int setup_numa_zonelist_order(char *s)
2993 { 2993 {
2994 int ret; 2994 int ret;
2995 2995
2996 if (!s) 2996 if (!s)
2997 return 0; 2997 return 0;
2998 2998
2999 ret = __parse_numa_zonelist_order(s); 2999 ret = __parse_numa_zonelist_order(s);
3000 if (ret == 0) 3000 if (ret == 0)
3001 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3001 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3002 3002
3003 return ret; 3003 return ret;
3004 } 3004 }
3005 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3005 early_param("numa_zonelist_order", setup_numa_zonelist_order);
3006 3006
3007 /* 3007 /*
3008 * sysctl handler for numa_zonelist_order 3008 * sysctl handler for numa_zonelist_order
3009 */ 3009 */
3010 int numa_zonelist_order_handler(ctl_table *table, int write, 3010 int numa_zonelist_order_handler(ctl_table *table, int write,
3011 void __user *buffer, size_t *length, 3011 void __user *buffer, size_t *length,
3012 loff_t *ppos) 3012 loff_t *ppos)
3013 { 3013 {
3014 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3014 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3015 int ret; 3015 int ret;
3016 static DEFINE_MUTEX(zl_order_mutex); 3016 static DEFINE_MUTEX(zl_order_mutex);
3017 3017
3018 mutex_lock(&zl_order_mutex); 3018 mutex_lock(&zl_order_mutex);
3019 if (write) 3019 if (write)
3020 strcpy(saved_string, (char*)table->data); 3020 strcpy(saved_string, (char*)table->data);
3021 ret = proc_dostring(table, write, buffer, length, ppos); 3021 ret = proc_dostring(table, write, buffer, length, ppos);
3022 if (ret) 3022 if (ret)
3023 goto out; 3023 goto out;
3024 if (write) { 3024 if (write) {
3025 int oldval = user_zonelist_order; 3025 int oldval = user_zonelist_order;
3026 if (__parse_numa_zonelist_order((char*)table->data)) { 3026 if (__parse_numa_zonelist_order((char*)table->data)) {
3027 /* 3027 /*
3028 * bogus value. restore saved string 3028 * bogus value. restore saved string
3029 */ 3029 */
3030 strncpy((char*)table->data, saved_string, 3030 strncpy((char*)table->data, saved_string,
3031 NUMA_ZONELIST_ORDER_LEN); 3031 NUMA_ZONELIST_ORDER_LEN);
3032 user_zonelist_order = oldval; 3032 user_zonelist_order = oldval;
3033 } else if (oldval != user_zonelist_order) { 3033 } else if (oldval != user_zonelist_order) {
3034 mutex_lock(&zonelists_mutex); 3034 mutex_lock(&zonelists_mutex);
3035 build_all_zonelists(NULL); 3035 build_all_zonelists(NULL);
3036 mutex_unlock(&zonelists_mutex); 3036 mutex_unlock(&zonelists_mutex);
3037 } 3037 }
3038 } 3038 }
3039 out: 3039 out:
3040 mutex_unlock(&zl_order_mutex); 3040 mutex_unlock(&zl_order_mutex);
3041 return ret; 3041 return ret;
3042 } 3042 }
3043 3043
3044 3044
3045 #define MAX_NODE_LOAD (nr_online_nodes) 3045 #define MAX_NODE_LOAD (nr_online_nodes)
3046 static int node_load[MAX_NUMNODES]; 3046 static int node_load[MAX_NUMNODES];
3047 3047
3048 /** 3048 /**
3049 * find_next_best_node - find the next node that should appear in a given node's fallback list 3049 * find_next_best_node - find the next node that should appear in a given node's fallback list
3050 * @node: node whose fallback list we're appending 3050 * @node: node whose fallback list we're appending
3051 * @used_node_mask: nodemask_t of already used nodes 3051 * @used_node_mask: nodemask_t of already used nodes
3052 * 3052 *
3053 * We use a number of factors to determine which is the next node that should 3053 * We use a number of factors to determine which is the next node that should
3054 * appear on a given node's fallback list. The node should not have appeared 3054 * appear on a given node's fallback list. The node should not have appeared
3055 * already in @node's fallback list, and it should be the next closest node 3055 * already in @node's fallback list, and it should be the next closest node
3056 * according to the distance array (which contains arbitrary distance values 3056 * according to the distance array (which contains arbitrary distance values
3057 * from each node to each node in the system), and should also prefer nodes 3057 * from each node to each node in the system), and should also prefer nodes
3058 * with no CPUs, since presumably they'll have very little allocation pressure 3058 * with no CPUs, since presumably they'll have very little allocation pressure
3059 * on them otherwise. 3059 * on them otherwise.
3060 * It returns -1 if no node is found. 3060 * It returns -1 if no node is found.
3061 */ 3061 */
3062 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3062 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3063 { 3063 {
3064 int n, val; 3064 int n, val;
3065 int min_val = INT_MAX; 3065 int min_val = INT_MAX;
3066 int best_node = -1; 3066 int best_node = -1;
3067 const struct cpumask *tmp = cpumask_of_node(0); 3067 const struct cpumask *tmp = cpumask_of_node(0);
3068 3068
3069 /* Use the local node if we haven't already */ 3069 /* Use the local node if we haven't already */
3070 if (!node_isset(node, *used_node_mask)) { 3070 if (!node_isset(node, *used_node_mask)) {
3071 node_set(node, *used_node_mask); 3071 node_set(node, *used_node_mask);
3072 return node; 3072 return node;
3073 } 3073 }
3074 3074
3075 for_each_node_state(n, N_HIGH_MEMORY) { 3075 for_each_node_state(n, N_HIGH_MEMORY) {
3076 3076
3077 /* Don't want a node to appear more than once */ 3077 /* Don't want a node to appear more than once */
3078 if (node_isset(n, *used_node_mask)) 3078 if (node_isset(n, *used_node_mask))
3079 continue; 3079 continue;
3080 3080
3081 /* Use the distance array to find the distance */ 3081 /* Use the distance array to find the distance */
3082 val = node_distance(node, n); 3082 val = node_distance(node, n);
3083 3083
3084 /* Penalize nodes under us ("prefer the next node") */ 3084 /* Penalize nodes under us ("prefer the next node") */
3085 val += (n < node); 3085 val += (n < node);
3086 3086
3087 /* Give preference to headless and unused nodes */ 3087 /* Give preference to headless and unused nodes */
3088 tmp = cpumask_of_node(n); 3088 tmp = cpumask_of_node(n);
3089 if (!cpumask_empty(tmp)) 3089 if (!cpumask_empty(tmp))
3090 val += PENALTY_FOR_NODE_WITH_CPUS; 3090 val += PENALTY_FOR_NODE_WITH_CPUS;
3091 3091
3092 /* Slight preference for less loaded node */ 3092 /* Slight preference for less loaded node */
3093 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3093 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3094 val += node_load[n]; 3094 val += node_load[n];
3095 3095
3096 if (val < min_val) { 3096 if (val < min_val) {
3097 min_val = val; 3097 min_val = val;
3098 best_node = n; 3098 best_node = n;
3099 } 3099 }
3100 } 3100 }
3101 3101
3102 if (best_node >= 0) 3102 if (best_node >= 0)
3103 node_set(best_node, *used_node_mask); 3103 node_set(best_node, *used_node_mask);
3104 3104
3105 return best_node; 3105 return best_node;
3106 } 3106 }
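
The scoring above is distance-dominated: node_distance() is nudged by one to prefer higher-numbered nodes, penalised by PENALTY_FOR_NODE_WITH_CPUS for nodes that own CPUs, and only then scaled by MAX_NODE_LOAD * MAX_NUMNODES, so node_load[] can never do more than break ties. A minimal userspace sketch of that ranking, with a made-up distance table and assuming the usual PENALTY_FOR_NODE_WITH_CPUS default of 1 (neither is taken from this diff):

#include <limits.h>
#include <stdio.h>

#define MAX_NUMNODES	4
#define MAX_NODE_LOAD	MAX_NUMNODES
#define PENALTY_FOR_NODE_WITH_CPUS 1	/* assumed default */

/* Hypothetical SLIT-style distances and per-node CPU presence. */
static const int dist[MAX_NUMNODES][MAX_NUMNODES] = {
	{ 10, 20, 20, 40 },
	{ 20, 10, 40, 20 },
	{ 20, 40, 10, 20 },
	{ 40, 20, 20, 10 },
};
static const int has_cpus[MAX_NUMNODES] = { 1, 1, 0, 1 };
static int node_load[MAX_NUMNODES];	/* stays 0 here: tie-breaker only */

static int next_best(int node, int *used)
{
	int n, best = -1, min_val = INT_MAX;

	for (n = 0; n < MAX_NUMNODES; n++) {
		int val;

		if (used[n])
			continue;
		val = dist[node][n];
		val += (n < node);		/* prefer the next node */
		if (has_cpus[n])
			val += PENALTY_FOR_NODE_WITH_CPUS;
		val *= MAX_NODE_LOAD * MAX_NUMNODES;
		val += node_load[n];
		if (val < min_val) {
			min_val = val;
			best = n;
		}
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int used[MAX_NUMNODES] = { 1, 0, 0, 0 };	/* local node 0 first */
	int n;

	printf("fallback order for node 0: 0");
	while ((n = next_best(0, used)) >= 0)
		printf(" %d", n);
	printf("\n");
	return 0;
}

With this table the order comes out as 0 2 1 3: node 2 beats node 1 at equal distance because it is headless, exactly the preference the comment above describes.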
3107 3107
3108 3108
3109 /* 3109 /*
3110 * Build zonelists ordered by node and zones within node. 3110 * Build zonelists ordered by node and zones within node.
3111 * This results in maximum locality--normal zone overflows into local 3111 * This results in maximum locality--normal zone overflows into local
3112 * DMA zone, if any--but risks exhausting DMA zone. 3112 * DMA zone, if any--but risks exhausting DMA zone.
3113 */ 3113 */
3114 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3114 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3115 { 3115 {
3116 int j; 3116 int j;
3117 struct zonelist *zonelist; 3117 struct zonelist *zonelist;
3118 3118
3119 zonelist = &pgdat->node_zonelists[0]; 3119 zonelist = &pgdat->node_zonelists[0];
3120 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3120 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3121 ; 3121 ;
3122 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3122 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3123 MAX_NR_ZONES - 1); 3123 MAX_NR_ZONES - 1);
3124 zonelist->_zonerefs[j].zone = NULL; 3124 zonelist->_zonerefs[j].zone = NULL;
3125 zonelist->_zonerefs[j].zone_idx = 0; 3125 zonelist->_zonerefs[j].zone_idx = 0;
3126 } 3126 }
3127 3127
3128 /* 3128 /*
3129 * Build gfp_thisnode zonelists 3129 * Build gfp_thisnode zonelists
3130 */ 3130 */
3131 static void build_thisnode_zonelists(pg_data_t *pgdat) 3131 static void build_thisnode_zonelists(pg_data_t *pgdat)
3132 { 3132 {
3133 int j; 3133 int j;
3134 struct zonelist *zonelist; 3134 struct zonelist *zonelist;
3135 3135
3136 zonelist = &pgdat->node_zonelists[1]; 3136 zonelist = &pgdat->node_zonelists[1];
3137 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3137 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3138 zonelist->_zonerefs[j].zone = NULL; 3138 zonelist->_zonerefs[j].zone = NULL;
3139 zonelist->_zonerefs[j].zone_idx = 0; 3139 zonelist->_zonerefs[j].zone_idx = 0;
3140 } 3140 }
3141 3141
3142 /* 3142 /*
3143 * Build zonelists ordered by zone and nodes within zones. 3143 * Build zonelists ordered by zone and nodes within zones.
3144 * This results in conserving DMA zone[s] until all Normal memory is 3144 * This results in conserving DMA zone[s] until all Normal memory is
3145 * exhausted, but results in overflowing to remote node while memory 3145 * exhausted, but results in overflowing to remote node while memory
3146 * may still exist in local DMA zone. 3146 * may still exist in local DMA zone.
3147 */ 3147 */
3148 static int node_order[MAX_NUMNODES]; 3148 static int node_order[MAX_NUMNODES];
3149 3149
3150 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3150 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3151 { 3151 {
3152 int pos, j, node; 3152 int pos, j, node;
3153 int zone_type; /* needs to be signed */ 3153 int zone_type; /* needs to be signed */
3154 struct zone *z; 3154 struct zone *z;
3155 struct zonelist *zonelist; 3155 struct zonelist *zonelist;
3156 3156
3157 zonelist = &pgdat->node_zonelists[0]; 3157 zonelist = &pgdat->node_zonelists[0];
3158 pos = 0; 3158 pos = 0;
3159 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3159 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3160 for (j = 0; j < nr_nodes; j++) { 3160 for (j = 0; j < nr_nodes; j++) {
3161 node = node_order[j]; 3161 node = node_order[j];
3162 z = &NODE_DATA(node)->node_zones[zone_type]; 3162 z = &NODE_DATA(node)->node_zones[zone_type];
3163 if (populated_zone(z)) { 3163 if (populated_zone(z)) {
3164 zoneref_set_zone(z, 3164 zoneref_set_zone(z,
3165 &zonelist->_zonerefs[pos++]); 3165 &zonelist->_zonerefs[pos++]);
3166 check_highest_zone(zone_type); 3166 check_highest_zone(zone_type);
3167 } 3167 }
3168 } 3168 }
3169 } 3169 }
3170 zonelist->_zonerefs[pos].zone = NULL; 3170 zonelist->_zonerefs[pos].zone = NULL;
3171 zonelist->_zonerefs[pos].zone_idx = 0; 3171 zonelist->_zonerefs[pos].zone_idx = 0;
3172 } 3172 }
3173 3173
3174 static int default_zonelist_order(void) 3174 static int default_zonelist_order(void)
3175 { 3175 {
3176 int nid, zone_type; 3176 int nid, zone_type;
3177 unsigned long low_kmem_size,total_size; 3177 unsigned long low_kmem_size,total_size;
3178 struct zone *z; 3178 struct zone *z;
3179 int average_size; 3179 int average_size;
3180 /* 3180 /*
3181 * ZONE_DMA and ZONE_DMA32 can be very small area in the system. 3181 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3182 * If they are really small and used heavily, the system can fall 3182 * If they are really small and used heavily, the system can fall
3183 * into OOM very easily. 3183 * into OOM very easily.
3184 * This function detects the ZONE_DMA/DMA32 size and configures the zone order. 3184 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
3185 */ 3185 */
3186 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 3186 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3187 low_kmem_size = 0; 3187 low_kmem_size = 0;
3188 total_size = 0; 3188 total_size = 0;
3189 for_each_online_node(nid) { 3189 for_each_online_node(nid) {
3190 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3190 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3191 z = &NODE_DATA(nid)->node_zones[zone_type]; 3191 z = &NODE_DATA(nid)->node_zones[zone_type];
3192 if (populated_zone(z)) { 3192 if (populated_zone(z)) {
3193 if (zone_type < ZONE_NORMAL) 3193 if (zone_type < ZONE_NORMAL)
3194 low_kmem_size += z->present_pages; 3194 low_kmem_size += z->present_pages;
3195 total_size += z->present_pages; 3195 total_size += z->present_pages;
3196 } else if (zone_type == ZONE_NORMAL) { 3196 } else if (zone_type == ZONE_NORMAL) {
3197 /* 3197 /*
3198 * If any node has only lowmem, then node order 3198 * If any node has only lowmem, then node order
3199 * is preferred to allow kernel allocations 3199 * is preferred to allow kernel allocations
3200 * locally; otherwise, they can easily infringe 3200 * locally; otherwise, they can easily infringe
3201 * on other nodes when there is an abundance of 3201 * on other nodes when there is an abundance of
3202 * lowmem available to allocate from. 3202 * lowmem available to allocate from.
3203 */ 3203 */
3204 return ZONELIST_ORDER_NODE; 3204 return ZONELIST_ORDER_NODE;
3205 } 3205 }
3206 } 3206 }
3207 } 3207 }
3208 if (!low_kmem_size || /* there is no DMA area. */ 3208 if (!low_kmem_size || /* there is no DMA area. */
3209 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3209 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3210 return ZONELIST_ORDER_NODE; 3210 return ZONELIST_ORDER_NODE;
3211 /* 3211 /*
3212 * Look into each node's config. 3212 * Look into each node's config.
3213 * If there is a node whose DMA/DMA32 memory makes up a very large 3213 * If there is a node whose DMA/DMA32 memory makes up a very large
3214 * share of its local memory, NODE_ORDER may be suitable. 3214 * share of its local memory, NODE_ORDER may be suitable.
3215 */ 3215 */
3216 average_size = total_size / 3216 average_size = total_size /
3217 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 3217 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
3218 for_each_online_node(nid) { 3218 for_each_online_node(nid) {
3219 low_kmem_size = 0; 3219 low_kmem_size = 0;
3220 total_size = 0; 3220 total_size = 0;
3221 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3221 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3222 z = &NODE_DATA(nid)->node_zones[zone_type]; 3222 z = &NODE_DATA(nid)->node_zones[zone_type];
3223 if (populated_zone(z)) { 3223 if (populated_zone(z)) {
3224 if (zone_type < ZONE_NORMAL) 3224 if (zone_type < ZONE_NORMAL)
3225 low_kmem_size += z->present_pages; 3225 low_kmem_size += z->present_pages;
3226 total_size += z->present_pages; 3226 total_size += z->present_pages;
3227 } 3227 }
3228 } 3228 }
3229 if (low_kmem_size && 3229 if (low_kmem_size &&
3230 total_size > average_size && /* ignore small node */ 3230 total_size > average_size && /* ignore small node */
3231 low_kmem_size > total_size * 70/100) 3231 low_kmem_size > total_size * 70/100)
3232 return ZONELIST_ORDER_NODE; 3232 return ZONELIST_ORDER_NODE;
3233 } 3233 }
3234 return ZONELIST_ORDER_ZONE; 3234 return ZONELIST_ORDER_ZONE;
3235 } 3235 }
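
In short, default_zonelist_order() chooses node ordering whenever lowmem needs no protection (no DMA/DMA32 at all, DMA/DMA32 above half of total memory, a node with no populated ZONE_NORMAL, or any reasonably sized node that is more than 70% lowmem) and zone ordering otherwise. A condensed userspace sketch of the size-based checks only, with invented page counts and without the ZONE_NORMAL-presence shortcut:

#include <stdio.h>

/* Hypothetical per-node sizes in pages: lowmem (DMA/DMA32) and total. */
struct node_mem { unsigned long low, total; };

static const char *pick_order(const struct node_mem *nodes, int nr)
{
	unsigned long low = 0, total = 0, avg;
	int i;

	for (i = 0; i < nr; i++) {
		low += nodes[i].low;
		total += nodes[i].total;
	}
	if (!low || low > total / 2)	/* no DMA area, or DMA/DMA32 is big */
		return "Node";

	avg = total / (nr + 1);
	for (i = 0; i < nr; i++)	/* any big node that is mostly lowmem? */
		if (nodes[i].low && nodes[i].total > avg &&
		    nodes[i].low > nodes[i].total * 70 / 100)
			return "Node";
	return "Zone";
}

int main(void)
{
	/* Two 4 GiB nodes (4 KiB pages); node 0 is entirely DMA32. */
	struct node_mem ex[2] = { { 1048576, 1048576 }, { 0, 1048576 } };

	printf("zonelist order: %s\n", pick_order(ex, 2));	/* Node */
	return 0;
}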
3236 3236
3237 static void set_zonelist_order(void) 3237 static void set_zonelist_order(void)
3238 { 3238 {
3239 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3239 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3240 current_zonelist_order = default_zonelist_order(); 3240 current_zonelist_order = default_zonelist_order();
3241 else 3241 else
3242 current_zonelist_order = user_zonelist_order; 3242 current_zonelist_order = user_zonelist_order;
3243 } 3243 }
3244 3244
3245 static void build_zonelists(pg_data_t *pgdat) 3245 static void build_zonelists(pg_data_t *pgdat)
3246 { 3246 {
3247 int j, node, load; 3247 int j, node, load;
3248 enum zone_type i; 3248 enum zone_type i;
3249 nodemask_t used_mask; 3249 nodemask_t used_mask;
3250 int local_node, prev_node; 3250 int local_node, prev_node;
3251 struct zonelist *zonelist; 3251 struct zonelist *zonelist;
3252 int order = current_zonelist_order; 3252 int order = current_zonelist_order;
3253 3253
3254 /* initialize zonelists */ 3254 /* initialize zonelists */
3255 for (i = 0; i < MAX_ZONELISTS; i++) { 3255 for (i = 0; i < MAX_ZONELISTS; i++) {
3256 zonelist = pgdat->node_zonelists + i; 3256 zonelist = pgdat->node_zonelists + i;
3257 zonelist->_zonerefs[0].zone = NULL; 3257 zonelist->_zonerefs[0].zone = NULL;
3258 zonelist->_zonerefs[0].zone_idx = 0; 3258 zonelist->_zonerefs[0].zone_idx = 0;
3259 } 3259 }
3260 3260
3261 /* NUMA-aware ordering of nodes */ 3261 /* NUMA-aware ordering of nodes */
3262 local_node = pgdat->node_id; 3262 local_node = pgdat->node_id;
3263 load = nr_online_nodes; 3263 load = nr_online_nodes;
3264 prev_node = local_node; 3264 prev_node = local_node;
3265 nodes_clear(used_mask); 3265 nodes_clear(used_mask);
3266 3266
3267 memset(node_order, 0, sizeof(node_order)); 3267 memset(node_order, 0, sizeof(node_order));
3268 j = 0; 3268 j = 0;
3269 3269
3270 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3270 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3271 int distance = node_distance(local_node, node); 3271 int distance = node_distance(local_node, node);
3272 3272
3273 /* 3273 /*
3274 * If another node is sufficiently far away then it is better 3274 * If another node is sufficiently far away then it is better
3275 * to reclaim pages in a zone before going off node. 3275 * to reclaim pages in a zone before going off node.
3276 */ 3276 */
3277 if (distance > RECLAIM_DISTANCE) 3277 if (distance > RECLAIM_DISTANCE)
3278 zone_reclaim_mode = 1; 3278 zone_reclaim_mode = 1;
3279 3279
3280 /* 3280 /*
3281 * We don't want to pressure a particular node. 3281 * We don't want to pressure a particular node.
3282 * So we add a penalty to the first node in the same 3282 * So we add a penalty to the first node in the same
3283 * distance group to make the ordering round-robin. 3283 * distance group to make the ordering round-robin.
3284 */ 3284 */
3285 if (distance != node_distance(local_node, prev_node)) 3285 if (distance != node_distance(local_node, prev_node))
3286 node_load[node] = load; 3286 node_load[node] = load;
3287 3287
3288 prev_node = node; 3288 prev_node = node;
3289 load--; 3289 load--;
3290 if (order == ZONELIST_ORDER_NODE) 3290 if (order == ZONELIST_ORDER_NODE)
3291 build_zonelists_in_node_order(pgdat, node); 3291 build_zonelists_in_node_order(pgdat, node);
3292 else 3292 else
3293 node_order[j++] = node; /* remember order */ 3293 node_order[j++] = node; /* remember order */
3294 } 3294 }
3295 3295
3296 if (order == ZONELIST_ORDER_ZONE) { 3296 if (order == ZONELIST_ORDER_ZONE) {
3297 /* calculate node order -- i.e., DMA last! */ 3297 /* calculate node order -- i.e., DMA last! */
3298 build_zonelists_in_zone_order(pgdat, j); 3298 build_zonelists_in_zone_order(pgdat, j);
3299 } 3299 }
3300 3300
3301 build_thisnode_zonelists(pgdat); 3301 build_thisnode_zonelists(pgdat);
3302 } 3302 }
3303 3303
3304 /* Construct the zonelist performance cache - see further mmzone.h */ 3304 /* Construct the zonelist performance cache - see further mmzone.h */
3305 static void build_zonelist_cache(pg_data_t *pgdat) 3305 static void build_zonelist_cache(pg_data_t *pgdat)
3306 { 3306 {
3307 struct zonelist *zonelist; 3307 struct zonelist *zonelist;
3308 struct zonelist_cache *zlc; 3308 struct zonelist_cache *zlc;
3309 struct zoneref *z; 3309 struct zoneref *z;
3310 3310
3311 zonelist = &pgdat->node_zonelists[0]; 3311 zonelist = &pgdat->node_zonelists[0];
3312 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3312 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3313 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3313 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3314 for (z = zonelist->_zonerefs; z->zone; z++) 3314 for (z = zonelist->_zonerefs; z->zone; z++)
3315 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3315 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3316 } 3316 }
3317 3317
3318 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3318 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3319 /* 3319 /*
3320 * Return node id of node used for "local" allocations. 3320 * Return node id of node used for "local" allocations.
3321 * I.e., first node id of first zone in arg node's generic zonelist. 3321 * I.e., first node id of first zone in arg node's generic zonelist.
3322 * Used for initializing percpu 'numa_mem', which is used primarily 3322 * Used for initializing percpu 'numa_mem', which is used primarily
3323 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3323 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3324 */ 3324 */
3325 int local_memory_node(int node) 3325 int local_memory_node(int node)
3326 { 3326 {
3327 struct zone *zone; 3327 struct zone *zone;
3328 3328
3329 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3329 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3330 gfp_zone(GFP_KERNEL), 3330 gfp_zone(GFP_KERNEL),
3331 NULL, 3331 NULL,
3332 &zone); 3332 &zone);
3333 return zone->node; 3333 return zone->node;
3334 } 3334 }
3335 #endif 3335 #endif
3336 3336
3337 #else /* CONFIG_NUMA */ 3337 #else /* CONFIG_NUMA */
3338 3338
3339 static void set_zonelist_order(void) 3339 static void set_zonelist_order(void)
3340 { 3340 {
3341 current_zonelist_order = ZONELIST_ORDER_ZONE; 3341 current_zonelist_order = ZONELIST_ORDER_ZONE;
3342 } 3342 }
3343 3343
3344 static void build_zonelists(pg_data_t *pgdat) 3344 static void build_zonelists(pg_data_t *pgdat)
3345 { 3345 {
3346 int node, local_node; 3346 int node, local_node;
3347 enum zone_type j; 3347 enum zone_type j;
3348 struct zonelist *zonelist; 3348 struct zonelist *zonelist;
3349 3349
3350 local_node = pgdat->node_id; 3350 local_node = pgdat->node_id;
3351 3351
3352 zonelist = &pgdat->node_zonelists[0]; 3352 zonelist = &pgdat->node_zonelists[0];
3353 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3353 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3354 3354
3355 /* 3355 /*
3356 * Now we build the zonelist so that it contains the zones 3356 * Now we build the zonelist so that it contains the zones
3357 * of all the other nodes. 3357 * of all the other nodes.
3358 * We don't want to pressure a particular node, so when 3358 * We don't want to pressure a particular node, so when
3359 * building the zones for node N, we make sure that the 3359 * building the zones for node N, we make sure that the
3360 * zones coming right after the local ones are those from 3360 * zones coming right after the local ones are those from
3361 * node N+1 (modulo the number of nodes) 3361 * node N+1 (modulo the number of nodes)
3362 */ 3362 */
3363 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3363 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3364 if (!node_online(node)) 3364 if (!node_online(node))
3365 continue; 3365 continue;
3366 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3366 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3367 MAX_NR_ZONES - 1); 3367 MAX_NR_ZONES - 1);
3368 } 3368 }
3369 for (node = 0; node < local_node; node++) { 3369 for (node = 0; node < local_node; node++) {
3370 if (!node_online(node)) 3370 if (!node_online(node))
3371 continue; 3371 continue;
3372 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3372 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3373 MAX_NR_ZONES - 1); 3373 MAX_NR_ZONES - 1);
3374 } 3374 }
3375 3375
3376 zonelist->_zonerefs[j].zone = NULL; 3376 zonelist->_zonerefs[j].zone = NULL;
3377 zonelist->_zonerefs[j].zone_idx = 0; 3377 zonelist->_zonerefs[j].zone_idx = 0;
3378 } 3378 }
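
The wrap-around described in the comment above means node N's zonelist is followed by the zones of N + 1, N + 2, ... back around to N - 1, so overflow traffic is spread rather than always landing on node 0. A tiny sketch of the resulting node order, assuming four online nodes:

#include <stdio.h>

#define NR_NODES 4	/* assumed number of online nodes */

int main(void)
{
	int local_node, node;

	for (local_node = 0; local_node < NR_NODES; local_node++) {
		printf("node %d zonelist visits nodes:", local_node);
		printf(" %d", local_node);
		for (node = local_node + 1; node < NR_NODES; node++)
			printf(" %d", node);
		for (node = 0; node < local_node; node++)
			printf(" %d", node);
		printf("\n");	/* e.g. node 2 -> 2 3 0 1 */
	}
	return 0;
}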
3379 3379
3380 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3380 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3381 static void build_zonelist_cache(pg_data_t *pgdat) 3381 static void build_zonelist_cache(pg_data_t *pgdat)
3382 { 3382 {
3383 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3383 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3384 } 3384 }
3385 3385
3386 #endif /* CONFIG_NUMA */ 3386 #endif /* CONFIG_NUMA */
3387 3387
3388 /* 3388 /*
3389 * Boot pageset table. One per cpu which is going to be used for all 3389 * Boot pageset table. One per cpu which is going to be used for all
3390 * zones and all nodes. The parameters will be set in such a way 3390 * zones and all nodes. The parameters will be set in such a way
3391 * that an item put on a list will immediately be handed over to 3391 * that an item put on a list will immediately be handed over to
3392 * the buddy list. This is safe since pageset manipulation is done 3392 * the buddy list. This is safe since pageset manipulation is done
3393 * with interrupts disabled. 3393 * with interrupts disabled.
3394 * 3394 *
3395 * The boot_pagesets must be kept even after bootup is complete for 3395 * The boot_pagesets must be kept even after bootup is complete for
3396 * unused processors and/or zones. They do play a role for bootstrapping 3396 * unused processors and/or zones. They do play a role for bootstrapping
3397 * hotplugged processors. 3397 * hotplugged processors.
3398 * 3398 *
3399 * zoneinfo_show() and maybe other functions do 3399 * zoneinfo_show() and maybe other functions do
3400 * not check if the processor is online before following the pageset pointer. 3400 * not check if the processor is online before following the pageset pointer.
3401 * Other parts of the kernel may not check if the zone is available. 3401 * Other parts of the kernel may not check if the zone is available.
3402 */ 3402 */
3403 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3403 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3404 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3404 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3405 static void setup_zone_pageset(struct zone *zone); 3405 static void setup_zone_pageset(struct zone *zone);
3406 3406
3407 /* 3407 /*
3408 * Global mutex to protect against size modification of zonelists 3408 * Global mutex to protect against size modification of zonelists
3409 * as well as to serialize pageset setup for the new populated zone. 3409 * as well as to serialize pageset setup for the new populated zone.
3410 */ 3410 */
3411 DEFINE_MUTEX(zonelists_mutex); 3411 DEFINE_MUTEX(zonelists_mutex);
3412 3412
3413 /* The return value is int just for stop_machine() */ 3413 /* The return value is int just for stop_machine() */
3414 static __init_refok int __build_all_zonelists(void *data) 3414 static __init_refok int __build_all_zonelists(void *data)
3415 { 3415 {
3416 int nid; 3416 int nid;
3417 int cpu; 3417 int cpu;
3418 3418
3419 #ifdef CONFIG_NUMA 3419 #ifdef CONFIG_NUMA
3420 memset(node_load, 0, sizeof(node_load)); 3420 memset(node_load, 0, sizeof(node_load));
3421 #endif 3421 #endif
3422 for_each_online_node(nid) { 3422 for_each_online_node(nid) {
3423 pg_data_t *pgdat = NODE_DATA(nid); 3423 pg_data_t *pgdat = NODE_DATA(nid);
3424 3424
3425 build_zonelists(pgdat); 3425 build_zonelists(pgdat);
3426 build_zonelist_cache(pgdat); 3426 build_zonelist_cache(pgdat);
3427 } 3427 }
3428 3428
3429 /* 3429 /*
3430 * Initialize the boot_pagesets that are going to be used 3430 * Initialize the boot_pagesets that are going to be used
3431 * for bootstrapping processors. The real pagesets for 3431 * for bootstrapping processors. The real pagesets for
3432 * each zone will be allocated later when the per cpu 3432 * each zone will be allocated later when the per cpu
3433 * allocator is available. 3433 * allocator is available.
3434 * 3434 *
3435 * boot_pagesets are used also for bootstrapping offline 3435 * boot_pagesets are used also for bootstrapping offline
3436 * cpus if the system is already booted because the pagesets 3436 * cpus if the system is already booted because the pagesets
3437 * are needed to initialize allocators on a specific cpu too. 3437 * are needed to initialize allocators on a specific cpu too.
3438 * F.e. the percpu allocator needs the page allocator which 3438 * F.e. the percpu allocator needs the page allocator which
3439 * needs the percpu allocator in order to allocate its pagesets 3439 * needs the percpu allocator in order to allocate its pagesets
3440 * (a chicken-egg dilemma). 3440 * (a chicken-egg dilemma).
3441 */ 3441 */
3442 for_each_possible_cpu(cpu) { 3442 for_each_possible_cpu(cpu) {
3443 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3443 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3444 3444
3445 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3445 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3446 /* 3446 /*
3447 * We now know the "local memory node" for each node-- 3447 * We now know the "local memory node" for each node--
3448 * i.e., the node of the first zone in the generic zonelist. 3448 * i.e., the node of the first zone in the generic zonelist.
3449 * Set up numa_mem percpu variable for on-line cpus. During 3449 * Set up numa_mem percpu variable for on-line cpus. During
3450 * boot, only the boot cpu should be on-line; we'll init the 3450 * boot, only the boot cpu should be on-line; we'll init the
3451 * secondary cpus' numa_mem as they come on-line. During 3451 * secondary cpus' numa_mem as they come on-line. During
3452 * node/memory hotplug, we'll fixup all on-line cpus. 3452 * node/memory hotplug, we'll fixup all on-line cpus.
3453 */ 3453 */
3454 if (cpu_online(cpu)) 3454 if (cpu_online(cpu))
3455 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3455 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3456 #endif 3456 #endif
3457 } 3457 }
3458 3458
3459 return 0; 3459 return 0;
3460 } 3460 }
3461 3461
3462 /* 3462 /*
3463 * Called with zonelists_mutex held always 3463 * Called with zonelists_mutex held always
3464 * unless system_state == SYSTEM_BOOTING. 3464 * unless system_state == SYSTEM_BOOTING.
3465 */ 3465 */
3466 void __ref build_all_zonelists(void *data) 3466 void __ref build_all_zonelists(void *data)
3467 { 3467 {
3468 set_zonelist_order(); 3468 set_zonelist_order();
3469 3469
3470 if (system_state == SYSTEM_BOOTING) { 3470 if (system_state == SYSTEM_BOOTING) {
3471 __build_all_zonelists(NULL); 3471 __build_all_zonelists(NULL);
3472 mminit_verify_zonelist(); 3472 mminit_verify_zonelist();
3473 cpuset_init_current_mems_allowed(); 3473 cpuset_init_current_mems_allowed();
3474 } else { 3474 } else {
3475 /* we have to stop all cpus to guarantee there is no user 3475 /* we have to stop all cpus to guarantee there is no user
3476 of zonelist */ 3476 of zonelist */
3477 #ifdef CONFIG_MEMORY_HOTPLUG 3477 #ifdef CONFIG_MEMORY_HOTPLUG
3478 if (data) 3478 if (data)
3479 setup_zone_pageset((struct zone *)data); 3479 setup_zone_pageset((struct zone *)data);
3480 #endif 3480 #endif
3481 stop_machine(__build_all_zonelists, NULL, NULL); 3481 stop_machine(__build_all_zonelists, NULL, NULL);
3482 /* cpuset refresh routine should be here */ 3482 /* cpuset refresh routine should be here */
3483 } 3483 }
3484 vm_total_pages = nr_free_pagecache_pages(); 3484 vm_total_pages = nr_free_pagecache_pages();
3485 /* 3485 /*
3486 * Disable grouping by mobility if the number of pages in the 3486 * Disable grouping by mobility if the number of pages in the
3487 * system is too low to allow the mechanism to work. It would be 3487 * system is too low to allow the mechanism to work. It would be
3488 * more accurate, but expensive to check per-zone. This check is 3488 * more accurate, but expensive to check per-zone. This check is
3489 * made on memory-hotadd so a system can start with mobility 3489 * made on memory-hotadd so a system can start with mobility
3490 * disabled and enable it later 3490 * disabled and enable it later
3491 */ 3491 */
3492 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3492 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3493 page_group_by_mobility_disabled = 1; 3493 page_group_by_mobility_disabled = 1;
3494 else 3494 else
3495 page_group_by_mobility_disabled = 0; 3495 page_group_by_mobility_disabled = 0;
3496 3496
3497 printk("Built %i zonelists in %s order, mobility grouping %s. " 3497 printk("Built %i zonelists in %s order, mobility grouping %s. "
3498 "Total pages: %ld\n", 3498 "Total pages: %ld\n",
3499 nr_online_nodes, 3499 nr_online_nodes,
3500 zonelist_order_name[current_zonelist_order], 3500 zonelist_order_name[current_zonelist_order],
3501 page_group_by_mobility_disabled ? "off" : "on", 3501 page_group_by_mobility_disabled ? "off" : "on",
3502 vm_total_pages); 3502 vm_total_pages);
3503 #ifdef CONFIG_NUMA 3503 #ifdef CONFIG_NUMA
3504 printk("Policy zone: %s\n", zone_names[policy_zone]); 3504 printk("Policy zone: %s\n", zone_names[policy_zone]);
3505 #endif 3505 #endif
3506 } 3506 }
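
The cutoff used just above is pageblock_nr_pages * MIGRATE_TYPES free pages; below that, grouping by mobility cannot keep even one pageblock per migrate type and is switched off. With assumed x86-64 values (2 MiB pageblocks of 4 KiB pages and five migrate types, i.e. no CMA) the threshold is a modest 2560 pages:

#include <stdio.h>

int main(void)
{
	/* Assumed values, not taken from this diff. */
	unsigned long pageblock_nr_pages = 1UL << 9;	/* 512 x 4 KiB = 2 MiB */
	unsigned long migrate_types = 5;
	unsigned long threshold = pageblock_nr_pages * migrate_types;

	printf("mobility grouping disabled below %lu pages (%lu MiB)\n",
	       threshold, threshold * 4096 / (1024 * 1024));	/* 2560, 10 */
	return 0;
}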
3507 3507
3508 /* 3508 /*
3509 * Helper functions to size the waitqueue hash table. 3509 * Helper functions to size the waitqueue hash table.
3510 * Essentially these want to choose hash table sizes sufficiently 3510 * Essentially these want to choose hash table sizes sufficiently
3511 * large so that collisions trying to wait on pages are rare. 3511 * large so that collisions trying to wait on pages are rare.
3512 * But in fact, the number of active page waitqueues on typical 3512 * But in fact, the number of active page waitqueues on typical
3513 * systems is ridiculously low, less than 200. So this is even 3513 * systems is ridiculously low, less than 200. So this is even
3514 * conservative, even though it seems large. 3514 * conservative, even though it seems large.
3515 * 3515 *
3516 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3516 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3517 * waitqueues, i.e. the size of the waitq table given the number of pages. 3517 * waitqueues, i.e. the size of the waitq table given the number of pages.
3518 */ 3518 */
3519 #define PAGES_PER_WAITQUEUE 256 3519 #define PAGES_PER_WAITQUEUE 256
3520 3520
3521 #ifndef CONFIG_MEMORY_HOTPLUG 3521 #ifndef CONFIG_MEMORY_HOTPLUG
3522 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3522 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3523 { 3523 {
3524 unsigned long size = 1; 3524 unsigned long size = 1;
3525 3525
3526 pages /= PAGES_PER_WAITQUEUE; 3526 pages /= PAGES_PER_WAITQUEUE;
3527 3527
3528 while (size < pages) 3528 while (size < pages)
3529 size <<= 1; 3529 size <<= 1;
3530 3530
3531 /* 3531 /*
3532 * Once we have dozens or even hundreds of threads sleeping 3532 * Once we have dozens or even hundreds of threads sleeping
3533 * on IO we've got bigger problems than wait queue collision. 3533 * on IO we've got bigger problems than wait queue collision.
3534 * Limit the size of the wait table to a reasonable size. 3534 * Limit the size of the wait table to a reasonable size.
3535 */ 3535 */
3536 size = min(size, 4096UL); 3536 size = min(size, 4096UL);
3537 3537
3538 return max(size, 4UL); 3538 return max(size, 4UL);
3539 } 3539 }
3540 #else 3540 #else
3541 /* 3541 /*
3542 * A zone's size might be changed by hot-add, so it is not possible to determine 3542 * A zone's size might be changed by hot-add, so it is not possible to determine
3543 * a suitable size for its wait_table. So we use the maximum size now. 3543 * a suitable size for its wait_table. So we use the maximum size now.
3544 * 3544 *
3545 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3545 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3546 * 3546 *
3547 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3547 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3548 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3548 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3549 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3549 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3550 * 3550 *
3551 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3551 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3552 * or more by the traditional way. (See above). It equals: 3552 * or more by the traditional way. (See above). It equals:
3553 * 3553 *
3554 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3554 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3555 * ia64(16K page size) : = ( 8G + 4M)byte. 3555 * ia64(16K page size) : = ( 8G + 4M)byte.
3556 * powerpc (64K page size) : = (32G +16M)byte. 3556 * powerpc (64K page size) : = (32G +16M)byte.
3557 */ 3557 */
3558 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3558 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3559 { 3559 {
3560 return 4096UL; 3560 return 4096UL;
3561 } 3561 }
3562 #endif 3562 #endif
3563 3563
3564 /* 3564 /*
3565 * This is an integer logarithm so that shifts can be used later 3565 * This is an integer logarithm so that shifts can be used later
3566 * to extract the more random high bits from the multiplicative 3566 * to extract the more random high bits from the multiplicative
3567 * hash function before the remainder is taken. 3567 * hash function before the remainder is taken.
3568 */ 3568 */
3569 static inline unsigned long wait_table_bits(unsigned long size) 3569 static inline unsigned long wait_table_bits(unsigned long size)
3570 { 3570 {
3571 return ffz(~size); 3571 return ffz(~size);
3572 } 3572 }
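
Taken together, a zone gets the next power of two above pages / PAGES_PER_WAITQUEUE wait queue heads, clamped to the range 4..4096, and wait_table_bits() is simply log2 of that power of two (ffz of its complement). A standalone sketch of the same arithmetic for an assumed 1 GiB zone of 4 KiB pages:

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* Same sizing rule as the !CONFIG_MEMORY_HOTPLUG variant above. */
static unsigned long wait_table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)	/* min(size, 4096UL) */
		size = 4096UL;
	if (size < 4UL)		/* max(size, 4UL) */
		size = 4UL;
	return size;
}

/* ffz(~size) counts the trailing set bits of size: log2 for a power of two. */
static unsigned long wait_table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	/* Assumed example: a 1 GiB zone of 4 KiB pages = 262144 pages. */
	unsigned long pages = 262144, size = wait_table_entries(pages);

	printf("%lu pages -> %lu waitqueues, %lu hash bits\n",
	       pages, size, wait_table_bits(size));	/* 262144 -> 1024, 10 */
	return 0;
}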
3573 3573
3574 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3574 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3575 3575
3576 /* 3576 /*
3577 * Check if a pageblock contains reserved pages 3577 * Check if a pageblock contains reserved pages
3578 */ 3578 */
3579 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3579 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3580 { 3580 {
3581 unsigned long pfn; 3581 unsigned long pfn;
3582 3582
3583 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3583 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3584 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3584 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3585 return 1; 3585 return 1;
3586 } 3586 }
3587 return 0; 3587 return 0;
3588 } 3588 }
3589 3589
3590 /* 3590 /*
3591 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3591 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3592 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3592 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3593 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3593 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3594 * higher will lead to a bigger reserve which will get freed as contiguous 3594 * higher will lead to a bigger reserve which will get freed as contiguous
3595 * blocks as reclaim kicks in 3595 * blocks as reclaim kicks in
3596 */ 3596 */
3597 static void setup_zone_migrate_reserve(struct zone *zone) 3597 static void setup_zone_migrate_reserve(struct zone *zone)
3598 { 3598 {
3599 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3599 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3600 struct page *page; 3600 struct page *page;
3601 unsigned long block_migratetype; 3601 unsigned long block_migratetype;
3602 int reserve; 3602 int reserve;
3603 3603
3604 /* 3604 /*
3605 * Get the start pfn, end pfn and the number of blocks to reserve 3605 * Get the start pfn, end pfn and the number of blocks to reserve
3606 * We have to be careful to be aligned to pageblock_nr_pages to 3606 * We have to be careful to be aligned to pageblock_nr_pages to
3607 * make sure that we always check pfn_valid for the first page in 3607 * make sure that we always check pfn_valid for the first page in
3608 * the block. 3608 * the block.
3609 */ 3609 */
3610 start_pfn = zone->zone_start_pfn; 3610 start_pfn = zone->zone_start_pfn;
3611 end_pfn = start_pfn + zone->spanned_pages; 3611 end_pfn = start_pfn + zone->spanned_pages;
3612 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3612 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3613 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3613 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3614 pageblock_order; 3614 pageblock_order;
3615 3615
3616 /* 3616 /*
3617 * Reserve blocks are generally in place to help high-order atomic 3617 * Reserve blocks are generally in place to help high-order atomic
3618 * allocations that are short-lived. A min_free_kbytes value that 3618 * allocations that are short-lived. A min_free_kbytes value that
3619 * would result in more than 2 reserve blocks for atomic allocations 3619 * would result in more than 2 reserve blocks for atomic allocations
3620 * is assumed to be in place to help anti-fragmentation for the 3620 * is assumed to be in place to help anti-fragmentation for the
3621 * future allocation of hugepages at runtime. 3621 * future allocation of hugepages at runtime.
3622 */ 3622 */
3623 reserve = min(2, reserve); 3623 reserve = min(2, reserve);
3624 3624
3625 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3625 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3626 if (!pfn_valid(pfn)) 3626 if (!pfn_valid(pfn))
3627 continue; 3627 continue;
3628 page = pfn_to_page(pfn); 3628 page = pfn_to_page(pfn);
3629 3629
3630 /* Watch out for overlapping nodes */ 3630 /* Watch out for overlapping nodes */
3631 if (page_to_nid(page) != zone_to_nid(zone)) 3631 if (page_to_nid(page) != zone_to_nid(zone))
3632 continue; 3632 continue;
3633 3633
3634 block_migratetype = get_pageblock_migratetype(page); 3634 block_migratetype = get_pageblock_migratetype(page);
3635 3635
3636 /* Only test what is necessary when the reserves are not met */ 3636 /* Only test what is necessary when the reserves are not met */
3637 if (reserve > 0) { 3637 if (reserve > 0) {
3638 /* 3638 /*
3639 * Blocks with reserved pages will never free, skip 3639 * Blocks with reserved pages will never free, skip
3640 * them. 3640 * them.
3641 */ 3641 */
3642 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3642 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3643 if (pageblock_is_reserved(pfn, block_end_pfn)) 3643 if (pageblock_is_reserved(pfn, block_end_pfn))
3644 continue; 3644 continue;
3645 3645
3646 /* If this block is reserved, account for it */ 3646 /* If this block is reserved, account for it */
3647 if (block_migratetype == MIGRATE_RESERVE) { 3647 if (block_migratetype == MIGRATE_RESERVE) {
3648 reserve--; 3648 reserve--;
3649 continue; 3649 continue;
3650 } 3650 }
3651 3651
3652 /* Suitable for reserving if this block is movable */ 3652 /* Suitable for reserving if this block is movable */
3653 if (block_migratetype == MIGRATE_MOVABLE) { 3653 if (block_migratetype == MIGRATE_MOVABLE) {
3654 set_pageblock_migratetype(page, 3654 set_pageblock_migratetype(page,
3655 MIGRATE_RESERVE); 3655 MIGRATE_RESERVE);
3656 move_freepages_block(zone, page, 3656 move_freepages_block(zone, page,
3657 MIGRATE_RESERVE); 3657 MIGRATE_RESERVE);
3658 reserve--; 3658 reserve--;
3659 continue; 3659 continue;
3660 } 3660 }
3661 } 3661 }
3662 3662
3663 /* 3663 /*
3664 * If the reserve is met and this is a previous reserved block, 3664 * If the reserve is met and this is a previous reserved block,
3665 * take it back 3665 * take it back
3666 */ 3666 */
3667 if (block_migratetype == MIGRATE_RESERVE) { 3667 if (block_migratetype == MIGRATE_RESERVE) {
3668 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3668 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3669 move_freepages_block(zone, page, MIGRATE_MOVABLE); 3669 move_freepages_block(zone, page, MIGRATE_MOVABLE);
3670 } 3670 }
3671 } 3671 }
3672 } 3672 }
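
The reserve size works out to min_wmark_pages(zone) rounded up to whole pageblocks and then capped at two blocks. A small standalone sketch of that calculation, assuming 2 MiB pageblocks (pageblock_order 9) and a zone whose min watermark is 5632 pages; both numbers are illustrative, not taken from the diff:

#include <stdio.h>

/* Assumed x86-64-style values. */
#define PAGEBLOCK_ORDER    9
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)	/* 512 pages = 2 MiB */

static unsigned long roundup_pages(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

int main(void)
{
	unsigned long min_wmark = 5632;	/* assumed zone min watermark, in pages */
	unsigned long reserve;

	reserve = roundup_pages(min_wmark, PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;
	if (reserve > 2)	/* never reserve more than 2 blocks */
		reserve = 2;
	printf("MIGRATE_RESERVE pageblocks: %lu\n", reserve);	/* 2 */
	return 0;
}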
3673 3673
3674 /* 3674 /*
3675 * Initially all pages are reserved - free ones are freed 3675 * Initially all pages are reserved - free ones are freed
3676 * up by free_all_bootmem() once the early boot process is 3676 * up by free_all_bootmem() once the early boot process is
3677 * done. Non-atomic initialization, single-pass. 3677 * done. Non-atomic initialization, single-pass.
3678 */ 3678 */
3679 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 3679 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3680 unsigned long start_pfn, enum memmap_context context) 3680 unsigned long start_pfn, enum memmap_context context)
3681 { 3681 {
3682 struct page *page; 3682 struct page *page;
3683 unsigned long end_pfn = start_pfn + size; 3683 unsigned long end_pfn = start_pfn + size;
3684 unsigned long pfn; 3684 unsigned long pfn;
3685 struct zone *z; 3685 struct zone *z;
3686 3686
3687 if (highest_memmap_pfn < end_pfn - 1) 3687 if (highest_memmap_pfn < end_pfn - 1)
3688 highest_memmap_pfn = end_pfn - 1; 3688 highest_memmap_pfn = end_pfn - 1;
3689 3689
3690 z = &NODE_DATA(nid)->node_zones[zone]; 3690 z = &NODE_DATA(nid)->node_zones[zone];
3691 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3691 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3692 /* 3692 /*
3693 * There can be holes in boot-time mem_map[]s 3693 * There can be holes in boot-time mem_map[]s
3694 * handed to this function. They do not 3694 * handed to this function. They do not
3695 * exist on hotplugged memory. 3695 * exist on hotplugged memory.
3696 */ 3696 */
3697 if (context == MEMMAP_EARLY) { 3697 if (context == MEMMAP_EARLY) {
3698 if (!early_pfn_valid(pfn)) 3698 if (!early_pfn_valid(pfn))
3699 continue; 3699 continue;
3700 if (!early_pfn_in_nid(pfn, nid)) 3700 if (!early_pfn_in_nid(pfn, nid))
3701 continue; 3701 continue;
3702 } 3702 }
3703 page = pfn_to_page(pfn); 3703 page = pfn_to_page(pfn);
3704 set_page_links(page, zone, nid, pfn); 3704 set_page_links(page, zone, nid, pfn);
3705 mminit_verify_page_links(page, zone, nid, pfn); 3705 mminit_verify_page_links(page, zone, nid, pfn);
3706 init_page_count(page); 3706 init_page_count(page);
3707 reset_page_mapcount(page); 3707 reset_page_mapcount(page);
3708 SetPageReserved(page); 3708 SetPageReserved(page);
3709 /* 3709 /*
3710 * Mark the block movable so that blocks are reserved for 3710 * Mark the block movable so that blocks are reserved for
3711 * movable at startup. This will force kernel allocations 3711 * movable at startup. This will force kernel allocations
3712 * to reserve their blocks rather than leaking throughout 3712 * to reserve their blocks rather than leaking throughout
3713 * the address space during boot when many long-lived 3713 * the address space during boot when many long-lived
3714 * kernel allocations are made. Later some blocks near 3714 * kernel allocations are made. Later some blocks near
3715 * the start are marked MIGRATE_RESERVE by 3715 * the start are marked MIGRATE_RESERVE by
3716 * setup_zone_migrate_reserve() 3716 * setup_zone_migrate_reserve()
3717 * 3717 *
3718 * The bitmap is created for the zone's valid pfn range, but the 3718 * The bitmap is created for the zone's valid pfn range, but the
3719 * memmap may also cover invalid pages (for alignment), so 3719 * memmap may also cover invalid pages (for alignment), so
3720 * check here that set_pageblock_migratetype() is not called 3720 * check here that set_pageblock_migratetype() is not called
3721 * for a pfn outside the zone. 3721 * for a pfn outside the zone.
3722 */ 3722 */
3723 if ((z->zone_start_pfn <= pfn) 3723 if ((z->zone_start_pfn <= pfn)
3724 && (pfn < z->zone_start_pfn + z->spanned_pages) 3724 && (pfn < z->zone_start_pfn + z->spanned_pages)
3725 && !(pfn & (pageblock_nr_pages - 1))) 3725 && !(pfn & (pageblock_nr_pages - 1)))
3726 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3726 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3727 3727
3728 INIT_LIST_HEAD(&page->lru); 3728 INIT_LIST_HEAD(&page->lru);
3729 #ifdef WANT_PAGE_VIRTUAL 3729 #ifdef WANT_PAGE_VIRTUAL
3730 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 3730 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
3731 if (!is_highmem_idx(zone)) 3731 if (!is_highmem_idx(zone))
3732 set_page_address(page, __va(pfn << PAGE_SHIFT)); 3732 set_page_address(page, __va(pfn << PAGE_SHIFT));
3733 #endif 3733 #endif
3734 } 3734 }
3735 } 3735 }
3736 3736
3737 static void __meminit zone_init_free_lists(struct zone *zone) 3737 static void __meminit zone_init_free_lists(struct zone *zone)
3738 { 3738 {
3739 int order, t; 3739 int order, t;
3740 for_each_migratetype_order(order, t) { 3740 for_each_migratetype_order(order, t) {
3741 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 3741 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
3742 zone->free_area[order].nr_free = 0; 3742 zone->free_area[order].nr_free = 0;
3743 } 3743 }
3744 } 3744 }
3745 3745
3746 #ifndef __HAVE_ARCH_MEMMAP_INIT 3746 #ifndef __HAVE_ARCH_MEMMAP_INIT
3747 #define memmap_init(size, nid, zone, start_pfn) \ 3747 #define memmap_init(size, nid, zone, start_pfn) \
3748 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3748 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3749 #endif 3749 #endif
3750 3750
3751 static int zone_batchsize(struct zone *zone) 3751 static int zone_batchsize(struct zone *zone)
3752 { 3752 {
3753 #ifdef CONFIG_MMU 3753 #ifdef CONFIG_MMU
3754 int batch; 3754 int batch;
3755 3755
3756 /* 3756 /*
3757 * The per-cpu-pages pools are set to around 1000th of the 3757 * The per-cpu-pages pools are set to around 1000th of the
3758 * size of the zone. But no more than 1/2 of a meg. 3758 * size of the zone. But no more than 1/2 of a meg.
3759 * 3759 *
3760 * OK, so we don't know how big the cache is. So guess. 3760 * OK, so we don't know how big the cache is. So guess.
3761 */ 3761 */
3762 batch = zone->present_pages / 1024; 3762 batch = zone->present_pages / 1024;
3763 if (batch * PAGE_SIZE > 512 * 1024) 3763 if (batch * PAGE_SIZE > 512 * 1024)
3764 batch = (512 * 1024) / PAGE_SIZE; 3764 batch = (512 * 1024) / PAGE_SIZE;
3765 batch /= 4; /* We effectively *= 4 below */ 3765 batch /= 4; /* We effectively *= 4 below */
3766 if (batch < 1) 3766 if (batch < 1)
3767 batch = 1; 3767 batch = 1;
3768 3768
3769 /* 3769 /*
3770 * Clamp the batch to a 2^n - 1 value. Having a power 3770 * Clamp the batch to a 2^n - 1 value. Having a power
3771 * of 2 value was found to be more likely to have 3771 * of 2 value was found to be more likely to have
3772 * suboptimal cache aliasing properties in some cases. 3772 * suboptimal cache aliasing properties in some cases.
3773 * 3773 *
3774 * For example if 2 tasks are alternately allocating 3774 * For example if 2 tasks are alternately allocating
3775 * batches of pages, one task can end up with a lot 3775 * batches of pages, one task can end up with a lot
3776 * of pages of one half of the possible page colors 3776 * of pages of one half of the possible page colors
3777 * and the other with pages of the other colors. 3777 * and the other with pages of the other colors.
3778 */ 3778 */
3779 batch = rounddown_pow_of_two(batch + batch/2) - 1; 3779 batch = rounddown_pow_of_two(batch + batch/2) - 1;
3780 3780
3781 return batch; 3781 return batch;
3782 3782
3783 #else 3783 #else
3784 /* The deferral and batching of frees should be suppressed under NOMMU 3784 /* The deferral and batching of frees should be suppressed under NOMMU
3785 * conditions. 3785 * conditions.
3786 * 3786 *
3787 * The problem is that NOMMU needs to be able to allocate large chunks 3787 * The problem is that NOMMU needs to be able to allocate large chunks
3788 * of contiguous memory as there's no hardware page translation to 3788 * of contiguous memory as there's no hardware page translation to
3789 * assemble apparent contiguous memory from discontiguous pages. 3789 * assemble apparent contiguous memory from discontiguous pages.
3790 * 3790 *
3791 * Queueing large contiguous runs of pages for batching, however, 3791 * Queueing large contiguous runs of pages for batching, however,
3792 * causes the pages to actually be freed in smaller chunks. As there 3792 * causes the pages to actually be freed in smaller chunks. As there
3793 * can be a significant delay between the individual batches being 3793 * can be a significant delay between the individual batches being
3794 * recycled, this leads to the once large chunks of space being 3794 * recycled, this leads to the once large chunks of space being
3795 * fragmented and becoming unavailable for high-order allocations. 3795 * fragmented and becoming unavailable for high-order allocations.
3796 */ 3796 */
3797 return 0; 3797 return 0;
3798 #endif 3798 #endif
3799 } 3799 }
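
To make the arithmetic above concrete, here is a standalone user-space sketch of the same batch calculation; the 4 KiB page size and 1 GiB zone are assumed values for illustration, and rounddown_pow_of_two() is re-implemented locally rather than taken from the kernel:

#include <stdio.h>

/* Local stand-in for the kernel's rounddown_pow_of_two(). */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
        unsigned long p = 1;

        while (p * 2 <= n)
                p *= 2;
        return p;
}

int main(void)
{
        unsigned long page_size = 4096;         /* assumed PAGE_SIZE */
        unsigned long present_pages = 262144;   /* assumed 1 GiB zone */
        unsigned long batch;

        batch = present_pages / 1024;           /* 256 pages, ~1 MiB */
        if (batch * page_size > 512 * 1024)
                batch = (512 * 1024) / page_size;       /* capped at 128 */
        batch /= 4;                             /* 32 */
        if (batch < 1)
                batch = 1;

        /* 32 + 16 = 48, rounded down to 32, minus 1 -> 31 */
        batch = rounddown_pow_of_two(batch + batch / 2) - 1;

        printf("per-cpu batch: %lu pages\n", batch);    /* prints 31 */
        return 0;
}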
3800 3800
3801 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3801 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
3802 { 3802 {
3803 struct per_cpu_pages *pcp; 3803 struct per_cpu_pages *pcp;
3804 int migratetype; 3804 int migratetype;
3805 3805
3806 memset(p, 0, sizeof(*p)); 3806 memset(p, 0, sizeof(*p));
3807 3807
3808 pcp = &p->pcp; 3808 pcp = &p->pcp;
3809 pcp->count = 0; 3809 pcp->count = 0;
3810 pcp->high = 6 * batch; 3810 pcp->high = 6 * batch;
3811 pcp->batch = max(1UL, 1 * batch); 3811 pcp->batch = max(1UL, 1 * batch);
3812 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 3812 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3813 INIT_LIST_HEAD(&pcp->lists[migratetype]); 3813 INIT_LIST_HEAD(&pcp->lists[migratetype]);
3814 } 3814 }
3815 3815
3816 /* 3816 /*
3817 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 3817 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
3818 * to the value high for the pageset p. 3818 * to the value high for the pageset p.
3819 */ 3819 */
3820 3820
3821 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 3821 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3822 unsigned long high) 3822 unsigned long high)
3823 { 3823 {
3824 struct per_cpu_pages *pcp; 3824 struct per_cpu_pages *pcp;
3825 3825
3826 pcp = &p->pcp; 3826 pcp = &p->pcp;
3827 pcp->high = high; 3827 pcp->high = high;
3828 pcp->batch = max(1UL, high/4); 3828 pcp->batch = max(1UL, high/4);
3829 if ((high/4) > (PAGE_SHIFT * 8)) 3829 if ((high/4) > (PAGE_SHIFT * 8))
3830 pcp->batch = PAGE_SHIFT * 8; 3830 pcp->batch = PAGE_SHIFT * 8;
3831 } 3831 }
3832 3832
3833 static void setup_zone_pageset(struct zone *zone) 3833 static void setup_zone_pageset(struct zone *zone)
3834 { 3834 {
3835 int cpu; 3835 int cpu;
3836 3836
3837 zone->pageset = alloc_percpu(struct per_cpu_pageset); 3837 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3838 3838
3839 for_each_possible_cpu(cpu) { 3839 for_each_possible_cpu(cpu) {
3840 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 3840 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3841 3841
3842 setup_pageset(pcp, zone_batchsize(zone)); 3842 setup_pageset(pcp, zone_batchsize(zone));
3843 3843
3844 if (percpu_pagelist_fraction) 3844 if (percpu_pagelist_fraction)
3845 setup_pagelist_highmark(pcp, 3845 setup_pagelist_highmark(pcp,
3846 (zone->present_pages / 3846 (zone->present_pages /
3847 percpu_pagelist_fraction)); 3847 percpu_pagelist_fraction));
3848 } 3848 }
3849 } 3849 }
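
When the percpu_pagelist_fraction sysctl is non-zero, the call above overrides the defaults through setup_pagelist_highmark(); a rough sketch of the resulting per-cpu limits, using assumed values (1 GiB zone, 4 KiB pages, a fraction of 8):

#include <stdio.h>

int main(void)
{
        unsigned long page_shift = 12;          /* assumed PAGE_SHIFT */
        unsigned long present_pages = 262144;   /* assumed 1 GiB zone */
        unsigned long fraction = 8;             /* assumed percpu_pagelist_fraction */
        unsigned long high, batch;

        high = present_pages / fraction;        /* keep at most 32768 pages per CPU */
        batch = high / 4;                       /* 8192 */
        if (batch < 1)
                batch = 1;
        if (high / 4 > page_shift * 8)
                batch = page_shift * 8;         /* capped at 96 pages per refill */

        printf("pcp high=%lu batch=%lu\n", high, batch);        /* 32768 / 96 */
        return 0;
}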
3850 3850
3851 /* 3851 /*
3852 * Allocate per cpu pagesets and initialize them. 3852 * Allocate per cpu pagesets and initialize them.
3853 * Before this call only boot pagesets were available. 3853 * Before this call only boot pagesets were available.
3854 */ 3854 */
3855 void __init setup_per_cpu_pageset(void) 3855 void __init setup_per_cpu_pageset(void)
3856 { 3856 {
3857 struct zone *zone; 3857 struct zone *zone;
3858 3858
3859 for_each_populated_zone(zone) 3859 for_each_populated_zone(zone)
3860 setup_zone_pageset(zone); 3860 setup_zone_pageset(zone);
3861 } 3861 }
3862 3862
3863 static noinline __init_refok 3863 static noinline __init_refok
3864 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3864 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3865 { 3865 {
3866 int i; 3866 int i;
3867 struct pglist_data *pgdat = zone->zone_pgdat; 3867 struct pglist_data *pgdat = zone->zone_pgdat;
3868 size_t alloc_size; 3868 size_t alloc_size;
3869 3869
3870 /* 3870 /*
3871 * The per-page waitqueue mechanism uses hashed waitqueues 3871 * The per-page waitqueue mechanism uses hashed waitqueues
3872 * per zone. 3872 * per zone.
3873 */ 3873 */
3874 zone->wait_table_hash_nr_entries = 3874 zone->wait_table_hash_nr_entries =
3875 wait_table_hash_nr_entries(zone_size_pages); 3875 wait_table_hash_nr_entries(zone_size_pages);
3876 zone->wait_table_bits = 3876 zone->wait_table_bits =
3877 wait_table_bits(zone->wait_table_hash_nr_entries); 3877 wait_table_bits(zone->wait_table_hash_nr_entries);
3878 alloc_size = zone->wait_table_hash_nr_entries 3878 alloc_size = zone->wait_table_hash_nr_entries
3879 * sizeof(wait_queue_head_t); 3879 * sizeof(wait_queue_head_t);
3880 3880
3881 if (!slab_is_available()) { 3881 if (!slab_is_available()) {
3882 zone->wait_table = (wait_queue_head_t *) 3882 zone->wait_table = (wait_queue_head_t *)
3883 alloc_bootmem_node_nopanic(pgdat, alloc_size); 3883 alloc_bootmem_node_nopanic(pgdat, alloc_size);
3884 } else { 3884 } else {
3885 /* 3885 /*
3886 * This case means that a zone whose size was 0 gets new memory 3886 * This case means that a zone whose size was 0 gets new memory
3887 * via memory hot-add. 3887 * via memory hot-add.
3888 * But it may be the case that a new node was hot-added. In 3888 * But it may be the case that a new node was hot-added. In
3889 * this case vmalloc() will not be able to use this new node's 3889 * this case vmalloc() will not be able to use this new node's
3890 * memory - this wait_table must be initialized to use this new 3890 * memory - this wait_table must be initialized to use this new
3891 * node itself as well. 3891 * node itself as well.
3892 * To use this new node's memory, further consideration will be 3892 * To use this new node's memory, further consideration will be
3893 * necessary. 3893 * necessary.
3894 */ 3894 */
3895 zone->wait_table = vmalloc(alloc_size); 3895 zone->wait_table = vmalloc(alloc_size);
3896 } 3896 }
3897 if (!zone->wait_table) 3897 if (!zone->wait_table)
3898 return -ENOMEM; 3898 return -ENOMEM;
3899 3899
3900 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 3900 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
3901 init_waitqueue_head(zone->wait_table + i); 3901 init_waitqueue_head(zone->wait_table + i);
3902 3902
3903 return 0; 3903 return 0;
3904 } 3904 }
3905 3905
3906 static int __zone_pcp_update(void *data) 3906 static int __zone_pcp_update(void *data)
3907 { 3907 {
3908 struct zone *zone = data; 3908 struct zone *zone = data;
3909 int cpu; 3909 int cpu;
3910 unsigned long batch = zone_batchsize(zone), flags; 3910 unsigned long batch = zone_batchsize(zone), flags;
3911 3911
3912 for_each_possible_cpu(cpu) { 3912 for_each_possible_cpu(cpu) {
3913 struct per_cpu_pageset *pset; 3913 struct per_cpu_pageset *pset;
3914 struct per_cpu_pages *pcp; 3914 struct per_cpu_pages *pcp;
3915 3915
3916 pset = per_cpu_ptr(zone->pageset, cpu); 3916 pset = per_cpu_ptr(zone->pageset, cpu);
3917 pcp = &pset->pcp; 3917 pcp = &pset->pcp;
3918 3918
3919 local_irq_save(flags); 3919 local_irq_save(flags);
3920 if (pcp->count > 0) 3920 if (pcp->count > 0)
3921 free_pcppages_bulk(zone, pcp->count, pcp); 3921 free_pcppages_bulk(zone, pcp->count, pcp);
3922 setup_pageset(pset, batch); 3922 setup_pageset(pset, batch);
3923 local_irq_restore(flags); 3923 local_irq_restore(flags);
3924 } 3924 }
3925 return 0; 3925 return 0;
3926 } 3926 }
3927 3927
3928 void zone_pcp_update(struct zone *zone) 3928 void zone_pcp_update(struct zone *zone)
3929 { 3929 {
3930 stop_machine(__zone_pcp_update, zone, NULL); 3930 stop_machine(__zone_pcp_update, zone, NULL);
3931 } 3931 }
3932 3932
3933 static __meminit void zone_pcp_init(struct zone *zone) 3933 static __meminit void zone_pcp_init(struct zone *zone)
3934 { 3934 {
3935 /* 3935 /*
3936 * per cpu subsystem is not up at this point. The following code 3936 * per cpu subsystem is not up at this point. The following code
3937 * relies on the ability of the linker to provide the 3937 * relies on the ability of the linker to provide the
3938 * offset of a (static) per cpu variable into the per cpu area. 3938 * offset of a (static) per cpu variable into the per cpu area.
3939 */ 3939 */
3940 zone->pageset = &boot_pageset; 3940 zone->pageset = &boot_pageset;
3941 3941
3942 if (zone->present_pages) 3942 if (zone->present_pages)
3943 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 3943 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3944 zone->name, zone->present_pages, 3944 zone->name, zone->present_pages,
3945 zone_batchsize(zone)); 3945 zone_batchsize(zone));
3946 } 3946 }
3947 3947
3948 __meminit int init_currently_empty_zone(struct zone *zone, 3948 __meminit int init_currently_empty_zone(struct zone *zone,
3949 unsigned long zone_start_pfn, 3949 unsigned long zone_start_pfn,
3950 unsigned long size, 3950 unsigned long size,
3951 enum memmap_context context) 3951 enum memmap_context context)
3952 { 3952 {
3953 struct pglist_data *pgdat = zone->zone_pgdat; 3953 struct pglist_data *pgdat = zone->zone_pgdat;
3954 int ret; 3954 int ret;
3955 ret = zone_wait_table_init(zone, size); 3955 ret = zone_wait_table_init(zone, size);
3956 if (ret) 3956 if (ret)
3957 return ret; 3957 return ret;
3958 pgdat->nr_zones = zone_idx(zone) + 1; 3958 pgdat->nr_zones = zone_idx(zone) + 1;
3959 3959
3960 zone->zone_start_pfn = zone_start_pfn; 3960 zone->zone_start_pfn = zone_start_pfn;
3961 3961
3962 mminit_dprintk(MMINIT_TRACE, "memmap_init", 3962 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3963 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 3963 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
3964 pgdat->node_id, 3964 pgdat->node_id,
3965 (unsigned long)zone_idx(zone), 3965 (unsigned long)zone_idx(zone),
3966 zone_start_pfn, (zone_start_pfn + size)); 3966 zone_start_pfn, (zone_start_pfn + size));
3967 3967
3968 zone_init_free_lists(zone); 3968 zone_init_free_lists(zone);
3969 3969
3970 return 0; 3970 return 0;
3971 } 3971 }
3972 3972
3973 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 3973 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
3974 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 3974 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
3975 /* 3975 /*
3976 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 3976 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
3977 * Architectures may implement their own version but if add_active_range() 3977 * Architectures may implement their own version but if add_active_range()
3978 * was used and there are no special requirements, this is a convenient 3978 * was used and there are no special requirements, this is a convenient
3979 * alternative 3979 * alternative
3980 */ 3980 */
3981 int __meminit __early_pfn_to_nid(unsigned long pfn) 3981 int __meminit __early_pfn_to_nid(unsigned long pfn)
3982 { 3982 {
3983 unsigned long start_pfn, end_pfn; 3983 unsigned long start_pfn, end_pfn;
3984 int i, nid; 3984 int i, nid;
3985 3985
3986 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 3986 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
3987 if (start_pfn <= pfn && pfn < end_pfn) 3987 if (start_pfn <= pfn && pfn < end_pfn)
3988 return nid; 3988 return nid;
3989 /* This is a memory hole */ 3989 /* This is a memory hole */
3990 return -1; 3990 return -1;
3991 } 3991 }
3992 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 3992 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
3993 3993
3994 int __meminit early_pfn_to_nid(unsigned long pfn) 3994 int __meminit early_pfn_to_nid(unsigned long pfn)
3995 { 3995 {
3996 int nid; 3996 int nid;
3997 3997
3998 nid = __early_pfn_to_nid(pfn); 3998 nid = __early_pfn_to_nid(pfn);
3999 if (nid >= 0) 3999 if (nid >= 0)
4000 return nid; 4000 return nid;
4001 /* just returns 0 */ 4001 /* just returns 0 */
4002 return 0; 4002 return 0;
4003 } 4003 }
4004 4004
4005 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4005 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4006 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4006 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4007 { 4007 {
4008 int nid; 4008 int nid;
4009 4009
4010 nid = __early_pfn_to_nid(pfn); 4010 nid = __early_pfn_to_nid(pfn);
4011 if (nid >= 0 && nid != node) 4011 if (nid >= 0 && nid != node)
4012 return false; 4012 return false;
4013 return true; 4013 return true;
4014 } 4014 }
4015 #endif 4015 #endif
4016 4016
4017 /** 4017 /**
4018 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4018 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4019 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4019 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4020 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4020 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4021 * 4021 *
4022 * If an architecture guarantees that all ranges registered with 4022 * If an architecture guarantees that all ranges registered with
4023 * add_active_ranges() contain no holes and may be freed, 4023 * add_active_ranges() contain no holes and may be freed,
4024 * this function may be used instead of calling free_bootmem() manually. 4024 * this function may be used instead of calling free_bootmem() manually.
4025 */ 4025 */
4026 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4026 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4027 { 4027 {
4028 unsigned long start_pfn, end_pfn; 4028 unsigned long start_pfn, end_pfn;
4029 int i, this_nid; 4029 int i, this_nid;
4030 4030
4031 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4031 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4032 start_pfn = min(start_pfn, max_low_pfn); 4032 start_pfn = min(start_pfn, max_low_pfn);
4033 end_pfn = min(end_pfn, max_low_pfn); 4033 end_pfn = min(end_pfn, max_low_pfn);
4034 4034
4035 if (start_pfn < end_pfn) 4035 if (start_pfn < end_pfn)
4036 free_bootmem_node(NODE_DATA(this_nid), 4036 free_bootmem_node(NODE_DATA(this_nid),
4037 PFN_PHYS(start_pfn), 4037 PFN_PHYS(start_pfn),
4038 (end_pfn - start_pfn) << PAGE_SHIFT); 4038 (end_pfn - start_pfn) << PAGE_SHIFT);
4039 } 4039 }
4040 } 4040 }
4041 4041
4042 /** 4042 /**
4043 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4043 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4044 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4044 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4045 * 4045 *
4046 * If an architecture guarantees that all ranges registered with 4046 * If an architecture guarantees that all ranges registered with
4047 * add_active_ranges() contain no holes and may be freed, this 4047 * add_active_ranges() contain no holes and may be freed, this
4048 * function may be used instead of calling memory_present() manually. 4048 * function may be used instead of calling memory_present() manually.
4049 */ 4049 */
4050 void __init sparse_memory_present_with_active_regions(int nid) 4050 void __init sparse_memory_present_with_active_regions(int nid)
4051 { 4051 {
4052 unsigned long start_pfn, end_pfn; 4052 unsigned long start_pfn, end_pfn;
4053 int i, this_nid; 4053 int i, this_nid;
4054 4054
4055 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4055 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4056 memory_present(this_nid, start_pfn, end_pfn); 4056 memory_present(this_nid, start_pfn, end_pfn);
4057 } 4057 }
4058 4058
4059 /** 4059 /**
4060 * get_pfn_range_for_nid - Return the start and end page frames for a node 4060 * get_pfn_range_for_nid - Return the start and end page frames for a node
4061 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4061 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4062 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4062 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4063 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4063 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4064 * 4064 *
4065 * It returns the start and end page frame of a node based on information 4065 * It returns the start and end page frame of a node based on information
4066 * provided by an arch calling add_active_range(). If called for a node 4066 * provided by an arch calling add_active_range(). If called for a node
4067 * with no available memory, a warning is printed and the start and end 4067 * with no available memory, a warning is printed and the start and end
4068 * PFNs will be 0. 4068 * PFNs will be 0.
4069 */ 4069 */
4070 void __meminit get_pfn_range_for_nid(unsigned int nid, 4070 void __meminit get_pfn_range_for_nid(unsigned int nid,
4071 unsigned long *start_pfn, unsigned long *end_pfn) 4071 unsigned long *start_pfn, unsigned long *end_pfn)
4072 { 4072 {
4073 unsigned long this_start_pfn, this_end_pfn; 4073 unsigned long this_start_pfn, this_end_pfn;
4074 int i; 4074 int i;
4075 4075
4076 *start_pfn = -1UL; 4076 *start_pfn = -1UL;
4077 *end_pfn = 0; 4077 *end_pfn = 0;
4078 4078
4079 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4079 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4080 *start_pfn = min(*start_pfn, this_start_pfn); 4080 *start_pfn = min(*start_pfn, this_start_pfn);
4081 *end_pfn = max(*end_pfn, this_end_pfn); 4081 *end_pfn = max(*end_pfn, this_end_pfn);
4082 } 4082 }
4083 4083
4084 if (*start_pfn == -1UL) 4084 if (*start_pfn == -1UL)
4085 *start_pfn = 0; 4085 *start_pfn = 0;
4086 } 4086 }
4087 4087
4088 /* 4088 /*
4089 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4089 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4090 * assumption is made that zones within a node are ordered in monotonic 4090 * assumption is made that zones within a node are ordered in monotonic
4091 * increasing memory addresses so that the "highest" populated zone is used 4091 * increasing memory addresses so that the "highest" populated zone is used
4092 */ 4092 */
4093 static void __init find_usable_zone_for_movable(void) 4093 static void __init find_usable_zone_for_movable(void)
4094 { 4094 {
4095 int zone_index; 4095 int zone_index;
4096 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4096 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4097 if (zone_index == ZONE_MOVABLE) 4097 if (zone_index == ZONE_MOVABLE)
4098 continue; 4098 continue;
4099 4099
4100 if (arch_zone_highest_possible_pfn[zone_index] > 4100 if (arch_zone_highest_possible_pfn[zone_index] >
4101 arch_zone_lowest_possible_pfn[zone_index]) 4101 arch_zone_lowest_possible_pfn[zone_index])
4102 break; 4102 break;
4103 } 4103 }
4104 4104
4105 VM_BUG_ON(zone_index == -1); 4105 VM_BUG_ON(zone_index == -1);
4106 movable_zone = zone_index; 4106 movable_zone = zone_index;
4107 } 4107 }
4108 4108
4109 /* 4109 /*
4110 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4110 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4111 * because it is sized independent of architecture. Unlike the other zones, 4111 * because it is sized independent of architecture. Unlike the other zones,
4112 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4112 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4113 * in each node depending on the size of each node and how evenly kernelcore 4113 * in each node depending on the size of each node and how evenly kernelcore
4114 * is distributed. This helper function adjusts the zone ranges 4114 * is distributed. This helper function adjusts the zone ranges
4115 * provided by the architecture for a given node by using the end of the 4115 * provided by the architecture for a given node by using the end of the
4116 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4116 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4117 * zones within a node are in order of monotonically increasing memory addresses 4117 * zones within a node are in order of monotonically increasing memory addresses
4118 */ 4118 */
4119 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4119 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4120 unsigned long zone_type, 4120 unsigned long zone_type,
4121 unsigned long node_start_pfn, 4121 unsigned long node_start_pfn,
4122 unsigned long node_end_pfn, 4122 unsigned long node_end_pfn,
4123 unsigned long *zone_start_pfn, 4123 unsigned long *zone_start_pfn,
4124 unsigned long *zone_end_pfn) 4124 unsigned long *zone_end_pfn)
4125 { 4125 {
4126 /* Only adjust if ZONE_MOVABLE is on this node */ 4126 /* Only adjust if ZONE_MOVABLE is on this node */
4127 if (zone_movable_pfn[nid]) { 4127 if (zone_movable_pfn[nid]) {
4128 /* Size ZONE_MOVABLE */ 4128 /* Size ZONE_MOVABLE */
4129 if (zone_type == ZONE_MOVABLE) { 4129 if (zone_type == ZONE_MOVABLE) {
4130 *zone_start_pfn = zone_movable_pfn[nid]; 4130 *zone_start_pfn = zone_movable_pfn[nid];
4131 *zone_end_pfn = min(node_end_pfn, 4131 *zone_end_pfn = min(node_end_pfn,
4132 arch_zone_highest_possible_pfn[movable_zone]); 4132 arch_zone_highest_possible_pfn[movable_zone]);
4133 4133
4134 /* Adjust for ZONE_MOVABLE starting within this range */ 4134 /* Adjust for ZONE_MOVABLE starting within this range */
4135 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4135 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4136 *zone_end_pfn > zone_movable_pfn[nid]) { 4136 *zone_end_pfn > zone_movable_pfn[nid]) {
4137 *zone_end_pfn = zone_movable_pfn[nid]; 4137 *zone_end_pfn = zone_movable_pfn[nid];
4138 4138
4139 /* Check if this whole range is within ZONE_MOVABLE */ 4139 /* Check if this whole range is within ZONE_MOVABLE */
4140 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4140 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4141 *zone_start_pfn = *zone_end_pfn; 4141 *zone_start_pfn = *zone_end_pfn;
4142 } 4142 }
4143 } 4143 }
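
A self-contained sketch of the two non-ZONE_MOVABLE branches above, with the kernel globals replaced by a plain movable_start parameter; all pfn values are made up for illustration:

#include <stdio.h>

static void adjust_for_movable(unsigned long movable_start,
                               unsigned long *zone_start_pfn,
                               unsigned long *zone_end_pfn)
{
        if (!movable_start)
                return;         /* no ZONE_MOVABLE configured on this node */

        if (*zone_start_pfn < movable_start && *zone_end_pfn > movable_start)
                *zone_end_pfn = movable_start;          /* ZONE_MOVABLE starts inside */
        else if (*zone_start_pfn >= movable_start)
                *zone_start_pfn = *zone_end_pfn;        /* zone is entirely movable */
}

int main(void)
{
        unsigned long start = 0x10000, end = 0x40000;   /* hypothetical zone */

        adjust_for_movable(0x30000, &start, &end);
        printf("zone now spans %#lx-%#lx\n", start, end);       /* 0x10000-0x30000 */
        return 0;
}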
4144 4144
4145 /* 4145 /*
4146 * Return the number of pages a zone spans in a node, including holes 4146 * Return the number of pages a zone spans in a node, including holes
4147 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4147 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4148 */ 4148 */
4149 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4149 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4150 unsigned long zone_type, 4150 unsigned long zone_type,
4151 unsigned long *ignored) 4151 unsigned long *ignored)
4152 { 4152 {
4153 unsigned long node_start_pfn, node_end_pfn; 4153 unsigned long node_start_pfn, node_end_pfn;
4154 unsigned long zone_start_pfn, zone_end_pfn; 4154 unsigned long zone_start_pfn, zone_end_pfn;
4155 4155
4156 /* Get the start and end of the node and zone */ 4156 /* Get the start and end of the node and zone */
4157 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4157 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4158 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4158 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4159 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4159 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4160 adjust_zone_range_for_zone_movable(nid, zone_type, 4160 adjust_zone_range_for_zone_movable(nid, zone_type,
4161 node_start_pfn, node_end_pfn, 4161 node_start_pfn, node_end_pfn,
4162 &zone_start_pfn, &zone_end_pfn); 4162 &zone_start_pfn, &zone_end_pfn);
4163 4163
4164 /* Check that this node has pages within the zone's required range */ 4164 /* Check that this node has pages within the zone's required range */
4165 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4165 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4166 return 0; 4166 return 0;
4167 4167
4168 /* Move the zone boundaries inside the node if necessary */ 4168 /* Move the zone boundaries inside the node if necessary */
4169 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4169 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4170 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4170 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4171 4171
4172 /* Return the spanned pages */ 4172 /* Return the spanned pages */
4173 return zone_end_pfn - zone_start_pfn; 4173 return zone_end_pfn - zone_start_pfn;
4174 } 4174 }
4175 4175
4176 /* 4176 /*
4177 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4177 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4178 * then all holes in the requested range will be accounted for. 4178 * then all holes in the requested range will be accounted for.
4179 */ 4179 */
4180 unsigned long __meminit __absent_pages_in_range(int nid, 4180 unsigned long __meminit __absent_pages_in_range(int nid,
4181 unsigned long range_start_pfn, 4181 unsigned long range_start_pfn,
4182 unsigned long range_end_pfn) 4182 unsigned long range_end_pfn)
4183 { 4183 {
4184 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4184 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4185 unsigned long start_pfn, end_pfn; 4185 unsigned long start_pfn, end_pfn;
4186 int i; 4186 int i;
4187 4187
4188 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4188 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4189 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4189 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4190 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4190 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4191 nr_absent -= end_pfn - start_pfn; 4191 nr_absent -= end_pfn - start_pfn;
4192 } 4192 }
4193 return nr_absent; 4193 return nr_absent;
4194 } 4194 }
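
As a worked example of the hole accounting above: start from the full window size and subtract every registered range clamped to that window. The two memory ranges below are hypothetical:

#include <stdio.h>

static unsigned long clamp_pfn(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        unsigned long mem[][2] = { { 0x000, 0x400 }, { 0x600, 0xa00 } };
        unsigned long range_start = 0x000, range_end = 0x1000;
        unsigned long nr_absent = range_end - range_start;
        unsigned int i;

        for (i = 0; i < 2; i++) {
                unsigned long s = clamp_pfn(mem[i][0], range_start, range_end);
                unsigned long e = clamp_pfn(mem[i][1], range_start, range_end);

                nr_absent -= e - s;     /* subtract the part that is backed */
        }
        printf("page frames in holes: %#lx\n", nr_absent);      /* 0x800 */
        return 0;
}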
4195 4195
4196 /** 4196 /**
4197 * absent_pages_in_range - Return number of page frames in holes within a range 4197 * absent_pages_in_range - Return number of page frames in holes within a range
4198 * @start_pfn: The start PFN to start searching for holes 4198 * @start_pfn: The start PFN to start searching for holes
4199 * @end_pfn: The end PFN to stop searching for holes 4199 * @end_pfn: The end PFN to stop searching for holes
4200 * 4200 *
4201 * It returns the number of page frames in memory holes within a range. 4201 * It returns the number of page frames in memory holes within a range.
4202 */ 4202 */
4203 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4203 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4204 unsigned long end_pfn) 4204 unsigned long end_pfn)
4205 { 4205 {
4206 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4206 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4207 } 4207 }
4208 4208
4209 /* Return the number of page frames in holes in a zone on a node */ 4209 /* Return the number of page frames in holes in a zone on a node */
4210 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4210 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4211 unsigned long zone_type, 4211 unsigned long zone_type,
4212 unsigned long *ignored) 4212 unsigned long *ignored)
4213 { 4213 {
4214 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4214 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4215 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4215 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4216 unsigned long node_start_pfn, node_end_pfn; 4216 unsigned long node_start_pfn, node_end_pfn;
4217 unsigned long zone_start_pfn, zone_end_pfn; 4217 unsigned long zone_start_pfn, zone_end_pfn;
4218 4218
4219 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4219 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4220 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4220 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4221 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4221 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4222 4222
4223 adjust_zone_range_for_zone_movable(nid, zone_type, 4223 adjust_zone_range_for_zone_movable(nid, zone_type,
4224 node_start_pfn, node_end_pfn, 4224 node_start_pfn, node_end_pfn,
4225 &zone_start_pfn, &zone_end_pfn); 4225 &zone_start_pfn, &zone_end_pfn);
4226 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4226 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4227 } 4227 }
4228 4228
4229 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4229 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4230 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4230 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4231 unsigned long zone_type, 4231 unsigned long zone_type,
4232 unsigned long *zones_size) 4232 unsigned long *zones_size)
4233 { 4233 {
4234 return zones_size[zone_type]; 4234 return zones_size[zone_type];
4235 } 4235 }
4236 4236
4237 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4237 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4238 unsigned long zone_type, 4238 unsigned long zone_type,
4239 unsigned long *zholes_size) 4239 unsigned long *zholes_size)
4240 { 4240 {
4241 if (!zholes_size) 4241 if (!zholes_size)
4242 return 0; 4242 return 0;
4243 4243
4244 return zholes_size[zone_type]; 4244 return zholes_size[zone_type];
4245 } 4245 }
4246 4246
4247 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4247 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4248 4248
4249 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4249 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4250 unsigned long *zones_size, unsigned long *zholes_size) 4250 unsigned long *zones_size, unsigned long *zholes_size)
4251 { 4251 {
4252 unsigned long realtotalpages, totalpages = 0; 4252 unsigned long realtotalpages, totalpages = 0;
4253 enum zone_type i; 4253 enum zone_type i;
4254 4254
4255 for (i = 0; i < MAX_NR_ZONES; i++) 4255 for (i = 0; i < MAX_NR_ZONES; i++)
4256 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4256 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4257 zones_size); 4257 zones_size);
4258 pgdat->node_spanned_pages = totalpages; 4258 pgdat->node_spanned_pages = totalpages;
4259 4259
4260 realtotalpages = totalpages; 4260 realtotalpages = totalpages;
4261 for (i = 0; i < MAX_NR_ZONES; i++) 4261 for (i = 0; i < MAX_NR_ZONES; i++)
4262 realtotalpages -= 4262 realtotalpages -=
4263 zone_absent_pages_in_node(pgdat->node_id, i, 4263 zone_absent_pages_in_node(pgdat->node_id, i,
4264 zholes_size); 4264 zholes_size);
4265 pgdat->node_present_pages = realtotalpages; 4265 pgdat->node_present_pages = realtotalpages;
4266 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4266 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4267 realtotalpages); 4267 realtotalpages);
4268 } 4268 }
4269 4269
4270 #ifndef CONFIG_SPARSEMEM 4270 #ifndef CONFIG_SPARSEMEM
4271 /* 4271 /*
4272 * Calculate the size of the zone->blockflags rounded to an unsigned long 4272 * Calculate the size of the zone->blockflags rounded to an unsigned long
4273 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4273 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4274 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4274 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4275 * round what is now in bits to nearest long in bits, then return it in 4275 * round what is now in bits to nearest long in bits, then return it in
4276 * bytes. 4276 * bytes.
4277 */ 4277 */
4278 static unsigned long __init usemap_size(unsigned long zonesize) 4278 static unsigned long __init usemap_size(unsigned long zonesize)
4279 { 4279 {
4280 unsigned long usemapsize; 4280 unsigned long usemapsize;
4281 4281
4282 usemapsize = roundup(zonesize, pageblock_nr_pages); 4282 usemapsize = roundup(zonesize, pageblock_nr_pages);
4283 usemapsize = usemapsize >> pageblock_order; 4283 usemapsize = usemapsize >> pageblock_order;
4284 usemapsize *= NR_PAGEBLOCK_BITS; 4284 usemapsize *= NR_PAGEBLOCK_BITS;
4285 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4285 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4286 4286
4287 return usemapsize / 8; 4287 return usemapsize / 8;
4288 } 4288 }
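
For a feel for the sizes involved, a standalone sketch of the same calculation; the zone size, pageblock order and bits-per-pageblock values are all assumptions for illustration (the last one stands in for NR_PAGEBLOCK_BITS):

#include <stdio.h>

static unsigned long roundup_to(unsigned long v, unsigned long m)
{
        return ((v + m - 1) / m) * m;
}

int main(void)
{
        unsigned long zonesize = 262144;        /* assumed 1 GiB zone, 4 KiB pages */
        unsigned long pageblock_order = 9;      /* assumed 2 MiB pageblocks */
        unsigned long bits_per_block = 4;       /* assumed bits per pageblock */
        unsigned long pageblock_nr_pages = 1UL << pageblock_order;
        unsigned long usemapsize;

        usemapsize = roundup_to(zonesize, pageblock_nr_pages);  /* 262144 */
        usemapsize >>= pageblock_order;                         /* 512 pageblocks */
        usemapsize *= bits_per_block;                           /* 2048 bits */
        usemapsize = roundup_to(usemapsize, 8 * sizeof(unsigned long));

        printf("pageblock flags bitmap: %lu bytes\n", usemapsize / 8);  /* 256 */
        return 0;
}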
4289 4289
4290 static void __init setup_usemap(struct pglist_data *pgdat, 4290 static void __init setup_usemap(struct pglist_data *pgdat,
4291 struct zone *zone, unsigned long zonesize) 4291 struct zone *zone, unsigned long zonesize)
4292 { 4292 {
4293 unsigned long usemapsize = usemap_size(zonesize); 4293 unsigned long usemapsize = usemap_size(zonesize);
4294 zone->pageblock_flags = NULL; 4294 zone->pageblock_flags = NULL;
4295 if (usemapsize) 4295 if (usemapsize)
4296 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4296 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4297 usemapsize); 4297 usemapsize);
4298 } 4298 }
4299 #else 4299 #else
4300 static inline void setup_usemap(struct pglist_data *pgdat, 4300 static inline void setup_usemap(struct pglist_data *pgdat,
4301 struct zone *zone, unsigned long zonesize) {} 4301 struct zone *zone, unsigned long zonesize) {}
4302 #endif /* CONFIG_SPARSEMEM */ 4302 #endif /* CONFIG_SPARSEMEM */
4303 4303
4304 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4304 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4305 4305
4306 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4306 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4307 static inline void __init set_pageblock_order(void) 4307 static inline void __init set_pageblock_order(void)
4308 { 4308 {
4309 unsigned int order; 4309 unsigned int order;
4310 4310
4311 /* Check that pageblock_nr_pages has not already been setup */ 4311 /* Check that pageblock_nr_pages has not already been setup */
4312 if (pageblock_order) 4312 if (pageblock_order)
4313 return; 4313 return;
4314 4314
4315 if (HPAGE_SHIFT > PAGE_SHIFT) 4315 if (HPAGE_SHIFT > PAGE_SHIFT)
4316 order = HUGETLB_PAGE_ORDER; 4316 order = HUGETLB_PAGE_ORDER;
4317 else 4317 else
4318 order = MAX_ORDER - 1; 4318 order = MAX_ORDER - 1;
4319 4319
4320 /* 4320 /*
4321 * Assume the largest contiguous order of interest is a huge page. 4321 * Assume the largest contiguous order of interest is a huge page.
4322 * This value may be variable depending on boot parameters on IA64 and 4322 * This value may be variable depending on boot parameters on IA64 and
4323 * powerpc. 4323 * powerpc.
4324 */ 4324 */
4325 pageblock_order = order; 4325 pageblock_order = order;
4326 } 4326 }
4327 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4327 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4328 4328
4329 /* 4329 /*
4330 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4330 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4331 * is unused as pageblock_order is set at compile-time. See 4331 * is unused as pageblock_order is set at compile-time. See
4332 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4332 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4333 * the kernel config 4333 * the kernel config
4334 */ 4334 */
4335 static inline void set_pageblock_order(void) 4335 static inline void set_pageblock_order(void)
4336 { 4336 {
4337 } 4337 }
4338 4338
4339 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4339 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4340 4340
4341 /* 4341 /*
4342 * Set up the zone data structures: 4342 * Set up the zone data structures:
4343 * - mark all pages reserved 4343 * - mark all pages reserved
4344 * - mark all memory queues empty 4344 * - mark all memory queues empty
4345 * - clear the memory bitmaps 4345 * - clear the memory bitmaps
4346 */ 4346 */
4347 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4347 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4348 unsigned long *zones_size, unsigned long *zholes_size) 4348 unsigned long *zones_size, unsigned long *zholes_size)
4349 { 4349 {
4350 enum zone_type j; 4350 enum zone_type j;
4351 int nid = pgdat->node_id; 4351 int nid = pgdat->node_id;
4352 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4352 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4353 int ret; 4353 int ret;
4354 4354
4355 pgdat_resize_init(pgdat); 4355 pgdat_resize_init(pgdat);
4356 pgdat->nr_zones = 0; 4356 pgdat->nr_zones = 0;
4357 init_waitqueue_head(&pgdat->kswapd_wait); 4357 init_waitqueue_head(&pgdat->kswapd_wait);
4358 pgdat->kswapd_max_order = 0; 4358 pgdat->kswapd_max_order = 0;
4359 pgdat_page_cgroup_init(pgdat); 4359 pgdat_page_cgroup_init(pgdat);
4360 4360
4361 for (j = 0; j < MAX_NR_ZONES; j++) { 4361 for (j = 0; j < MAX_NR_ZONES; j++) {
4362 struct zone *zone = pgdat->node_zones + j; 4362 struct zone *zone = pgdat->node_zones + j;
4363 unsigned long size, realsize, memmap_pages; 4363 unsigned long size, realsize, memmap_pages;
4364 4364
4365 size = zone_spanned_pages_in_node(nid, j, zones_size); 4365 size = zone_spanned_pages_in_node(nid, j, zones_size);
4366 realsize = size - zone_absent_pages_in_node(nid, j, 4366 realsize = size - zone_absent_pages_in_node(nid, j,
4367 zholes_size); 4367 zholes_size);
4368 4368
4369 /* 4369 /*
4370 * Adjust realsize so that it accounts for how much memory 4370 * Adjust realsize so that it accounts for how much memory
4371 * is used by this zone for memmap. This affects the watermark 4371 * is used by this zone for memmap. This affects the watermark
4372 * and per-cpu initialisations 4372 * and per-cpu initialisations
4373 */ 4373 */
4374 memmap_pages = 4374 memmap_pages =
4375 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 4375 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
4376 if (realsize >= memmap_pages) { 4376 if (realsize >= memmap_pages) {
4377 realsize -= memmap_pages; 4377 realsize -= memmap_pages;
4378 if (memmap_pages) 4378 if (memmap_pages)
4379 printk(KERN_DEBUG 4379 printk(KERN_DEBUG
4380 " %s zone: %lu pages used for memmap\n", 4380 " %s zone: %lu pages used for memmap\n",
4381 zone_names[j], memmap_pages); 4381 zone_names[j], memmap_pages);
4382 } else 4382 } else
4383 printk(KERN_WARNING 4383 printk(KERN_WARNING
4384 " %s zone: %lu pages exceeds realsize %lu\n", 4384 " %s zone: %lu pages exceeds realsize %lu\n",
4385 zone_names[j], memmap_pages, realsize); 4385 zone_names[j], memmap_pages, realsize);
4386 4386
4387 /* Account for reserved pages */ 4387 /* Account for reserved pages */
4388 if (j == 0 && realsize > dma_reserve) { 4388 if (j == 0 && realsize > dma_reserve) {
4389 realsize -= dma_reserve; 4389 realsize -= dma_reserve;
4390 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4390 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4391 zone_names[0], dma_reserve); 4391 zone_names[0], dma_reserve);
4392 } 4392 }
4393 4393
4394 if (!is_highmem_idx(j)) 4394 if (!is_highmem_idx(j))
4395 nr_kernel_pages += realsize; 4395 nr_kernel_pages += realsize;
4396 nr_all_pages += realsize; 4396 nr_all_pages += realsize;
4397 4397
4398 zone->spanned_pages = size; 4398 zone->spanned_pages = size;
4399 zone->present_pages = realsize; 4399 zone->present_pages = realsize;
4400 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
4401 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4402 zone->spanned_pages;
4403 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4404 #endif
4400 #ifdef CONFIG_NUMA 4405 #ifdef CONFIG_NUMA
4401 zone->node = nid; 4406 zone->node = nid;
4402 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4407 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
4403 / 100; 4408 / 100;
4404 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 4409 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
4405 #endif 4410 #endif
4406 zone->name = zone_names[j]; 4411 zone->name = zone_names[j];
4407 spin_lock_init(&zone->lock); 4412 spin_lock_init(&zone->lock);
4408 spin_lock_init(&zone->lru_lock); 4413 spin_lock_init(&zone->lru_lock);
4409 zone_seqlock_init(zone); 4414 zone_seqlock_init(zone);
4410 zone->zone_pgdat = pgdat; 4415 zone->zone_pgdat = pgdat;
4411 4416
4412 zone_pcp_init(zone); 4417 zone_pcp_init(zone);
4413 lruvec_init(&zone->lruvec, zone); 4418 lruvec_init(&zone->lruvec, zone);
4414 zap_zone_vm_stats(zone); 4419 zap_zone_vm_stats(zone);
4415 zone->flags = 0; 4420 zone->flags = 0;
4416 if (!size) 4421 if (!size)
4417 continue; 4422 continue;
4418 4423
4419 set_pageblock_order(); 4424 set_pageblock_order();
4420 setup_usemap(pgdat, zone, size); 4425 setup_usemap(pgdat, zone, size);
4421 ret = init_currently_empty_zone(zone, zone_start_pfn, 4426 ret = init_currently_empty_zone(zone, zone_start_pfn,
4422 size, MEMMAP_EARLY); 4427 size, MEMMAP_EARLY);
4423 BUG_ON(ret); 4428 BUG_ON(ret);
4424 memmap_init(size, nid, j, zone_start_pfn); 4429 memmap_init(size, nid, j, zone_start_pfn);
4425 zone_start_pfn += size; 4430 zone_start_pfn += size;
4426 } 4431 }
4427 } 4432 }
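
The lines added to this function seed zone->compact_cached_free_pfn at the end of the zone, rounded down to a pageblock boundary, which is where the free-page scanner resumes on the next order > 0 compaction. A quick sketch of that rounding with made-up numbers (the pageblock order and pfns are assumptions):

#include <stdio.h>

int main(void)
{
        unsigned long pageblock_nr_pages = 1UL << 9;    /* assumed 512-page blocks */
        unsigned long zone_start_pfn = 0x100;           /* hypothetical zone */
        unsigned long spanned_pages = 0x8765;
        unsigned long free_pfn;

        free_pfn = zone_start_pfn + spanned_pages;      /* 0x8865, one past the zone */
        free_pfn &= ~(pageblock_nr_pages - 1);          /* 0x8800, pageblock aligned */

        printf("free scanner cached at pfn %#lx\n", free_pfn);
        return 0;
}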
4428 4433
4429 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4434 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4430 { 4435 {
4431 /* Skip empty nodes */ 4436 /* Skip empty nodes */
4432 if (!pgdat->node_spanned_pages) 4437 if (!pgdat->node_spanned_pages)
4433 return; 4438 return;
4434 4439
4435 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4440 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4436 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4441 /* ia64 gets its own node_mem_map, before this, without bootmem */
4437 if (!pgdat->node_mem_map) { 4442 if (!pgdat->node_mem_map) {
4438 unsigned long size, start, end; 4443 unsigned long size, start, end;
4439 struct page *map; 4444 struct page *map;
4440 4445
4441 /* 4446 /*
4442 * The zone's endpoints aren't required to be MAX_ORDER 4447 * The zone's endpoints aren't required to be MAX_ORDER
4443 * aligned but the node_mem_map endpoints must be in order 4448 * aligned but the node_mem_map endpoints must be in order
4444 * for the buddy allocator to function correctly. 4449 * for the buddy allocator to function correctly.
4445 */ 4450 */
4446 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4451 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4447 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 4452 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
4448 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4453 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4449 size = (end - start) * sizeof(struct page); 4454 size = (end - start) * sizeof(struct page);
4450 map = alloc_remap(pgdat->node_id, size); 4455 map = alloc_remap(pgdat->node_id, size);
4451 if (!map) 4456 if (!map)
4452 map = alloc_bootmem_node_nopanic(pgdat, size); 4457 map = alloc_bootmem_node_nopanic(pgdat, size);
4453 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4458 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4454 } 4459 }
4455 #ifndef CONFIG_NEED_MULTIPLE_NODES 4460 #ifndef CONFIG_NEED_MULTIPLE_NODES
4456 /* 4461 /*
4457 * With no DISCONTIG, the global mem_map is just set as node 0's 4462 * With no DISCONTIG, the global mem_map is just set as node 0's
4458 */ 4463 */
4459 if (pgdat == NODE_DATA(0)) { 4464 if (pgdat == NODE_DATA(0)) {
4460 mem_map = NODE_DATA(0)->node_mem_map; 4465 mem_map = NODE_DATA(0)->node_mem_map;
4461 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4466 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4462 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4467 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4463 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4468 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4464 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4469 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4465 } 4470 }
4466 #endif 4471 #endif
4467 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4472 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4468 } 4473 }
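
A numeric sketch of the alignment above; MAX_ORDER_NR_PAGES of 1024 matches the default MAX_ORDER of 11, while the node pfns and the 64-byte struct page size are assumptions for illustration:

#include <stdio.h>

int main(void)
{
        unsigned long max_order_nr_pages = 1024;        /* 1 << (MAX_ORDER - 1) */
        unsigned long node_start_pfn = 0x10203;         /* hypothetical node */
        unsigned long node_spanned_pages = 0x7f000;
        unsigned long struct_page_size = 64;            /* assumed sizeof(struct page) */
        unsigned long start, end;

        start = node_start_pfn & ~(max_order_nr_pages - 1);                     /* 0x10000 */
        end = node_start_pfn + node_spanned_pages;                              /* 0x8f203 */
        end = (end + max_order_nr_pages - 1) & ~(max_order_nr_pages - 1);       /* 0x8f400 */

        printf("node_mem_map spans pfns %#lx-%#lx, %lu bytes\n",
               start, end, (end - start) * struct_page_size);
        return 0;
}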
4469 4474
4470 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4475 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4471 unsigned long node_start_pfn, unsigned long *zholes_size) 4476 unsigned long node_start_pfn, unsigned long *zholes_size)
4472 { 4477 {
4473 pg_data_t *pgdat = NODE_DATA(nid); 4478 pg_data_t *pgdat = NODE_DATA(nid);
4474 4479
4475 pgdat->node_id = nid; 4480 pgdat->node_id = nid;
4476 pgdat->node_start_pfn = node_start_pfn; 4481 pgdat->node_start_pfn = node_start_pfn;
4477 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4482 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4478 4483
4479 alloc_node_mem_map(pgdat); 4484 alloc_node_mem_map(pgdat);
4480 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4485 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4481 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4486 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4482 nid, (unsigned long)pgdat, 4487 nid, (unsigned long)pgdat,
4483 (unsigned long)pgdat->node_mem_map); 4488 (unsigned long)pgdat->node_mem_map);
4484 #endif 4489 #endif
4485 4490
4486 free_area_init_core(pgdat, zones_size, zholes_size); 4491 free_area_init_core(pgdat, zones_size, zholes_size);
4487 } 4492 }
4488 4493
4489 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4494 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4490 4495
4491 #if MAX_NUMNODES > 1 4496 #if MAX_NUMNODES > 1
4492 /* 4497 /*
4493 * Figure out the number of possible node ids. 4498 * Figure out the number of possible node ids.
4494 */ 4499 */
4495 static void __init setup_nr_node_ids(void) 4500 static void __init setup_nr_node_ids(void)
4496 { 4501 {
4497 unsigned int node; 4502 unsigned int node;
4498 unsigned int highest = 0; 4503 unsigned int highest = 0;
4499 4504
4500 for_each_node_mask(node, node_possible_map) 4505 for_each_node_mask(node, node_possible_map)
4501 highest = node; 4506 highest = node;
4502 nr_node_ids = highest + 1; 4507 nr_node_ids = highest + 1;
4503 } 4508 }
4504 #else 4509 #else
4505 static inline void setup_nr_node_ids(void) 4510 static inline void setup_nr_node_ids(void)
4506 { 4511 {
4507 } 4512 }
4508 #endif 4513 #endif
4509 4514
4510 /** 4515 /**
4511 * node_map_pfn_alignment - determine the maximum internode alignment 4516 * node_map_pfn_alignment - determine the maximum internode alignment
4512 * 4517 *
4513 * This function should be called after node map is populated and sorted. 4518 * This function should be called after node map is populated and sorted.
4514 * It calculates the maximum power of two alignment which can distinguish 4519 * It calculates the maximum power of two alignment which can distinguish
4515 * all the nodes. 4520 * all the nodes.
4516 * 4521 *
4517 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4522 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4518 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4523 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4519 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4524 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4520 * shifted, 1GiB is enough and this function will indicate so. 4525 * shifted, 1GiB is enough and this function will indicate so.
4521 * 4526 *
4522 * This is used to test whether pfn -> nid mapping of the chosen memory 4527 * This is used to test whether pfn -> nid mapping of the chosen memory
4523 * model has fine enough granularity to avoid incorrect mapping for the 4528 * model has fine enough granularity to avoid incorrect mapping for the
4524 * populated node map. 4529 * populated node map.
4525 * 4530 *
4526 * Returns the determined alignment in pfn's. 0 if there is no alignment 4531 * Returns the determined alignment in pfn's. 0 if there is no alignment
4527 * requirement (single node). 4532 * requirement (single node).
4528 */ 4533 */
4529 unsigned long __init node_map_pfn_alignment(void) 4534 unsigned long __init node_map_pfn_alignment(void)
4530 { 4535 {
4531 unsigned long accl_mask = 0, last_end = 0; 4536 unsigned long accl_mask = 0, last_end = 0;
4532 unsigned long start, end, mask; 4537 unsigned long start, end, mask;
4533 int last_nid = -1; 4538 int last_nid = -1;
4534 int i, nid; 4539 int i, nid;
4535 4540
4536 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4541 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4537 if (!start || last_nid < 0 || last_nid == nid) { 4542 if (!start || last_nid < 0 || last_nid == nid) {
4538 last_nid = nid; 4543 last_nid = nid;
4539 last_end = end; 4544 last_end = end;
4540 continue; 4545 continue;
4541 } 4546 }
4542 4547
4543 /* 4548 /*
4544 * Start with a mask granular enough to pin-point to the 4549 * Start with a mask granular enough to pin-point to the
4545 * start pfn and tick off bits one-by-one until it becomes 4550 * start pfn and tick off bits one-by-one until it becomes
4546 * too coarse to separate the current node from the last. 4551 * too coarse to separate the current node from the last.
4547 */ 4552 */
4548 mask = ~((1 << __ffs(start)) - 1); 4553 mask = ~((1 << __ffs(start)) - 1);
4549 while (mask && last_end <= (start & (mask << 1))) 4554 while (mask && last_end <= (start & (mask << 1)))
4550 mask <<= 1; 4555 mask <<= 1;
4551 4556
4552 /* accumulate all internode masks */ 4557 /* accumulate all internode masks */
4553 accl_mask |= mask; 4558 accl_mask |= mask;
4554 } 4559 }
4555 4560
4556 /* convert mask to number of pages */ 4561 /* convert mask to number of pages */
4557 return ~accl_mask + 1; 4562 return ~accl_mask + 1;
4558 } 4563 }
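
To see the mask walk above in action, here is a standalone sketch with two hypothetical nodes of 1 GiB each (4 KiB pages, so 0x40000 pfns per node); ffs_index() is a local stand-in for the kernel's __ffs():

#include <stdio.h>

/* Index of the lowest set bit; n must be non-zero. */
static unsigned long ffs_index(unsigned long n)
{
        unsigned long i = 0;

        while (!(n & 1)) {
                n >>= 1;
                i++;
        }
        return i;
}

int main(void)
{
        unsigned long ranges[][2] = { { 0x00000, 0x40000 }, { 0x40000, 0x80000 } };
        int nids[] = { 0, 1 };
        unsigned long accl_mask = 0, last_end = 0, mask;
        int last_nid = -1, i;

        for (i = 0; i < 2; i++) {
                unsigned long start = ranges[i][0];

                if (!start || last_nid < 0 || last_nid == nids[i]) {
                        last_nid = nids[i];
                        last_end = ranges[i][1];
                        continue;
                }

                /* widen the mask until it can no longer separate the nodes */
                mask = ~((1UL << ffs_index(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;
                accl_mask |= mask;
        }

        printf("internode alignment: %#lx pfns\n", ~accl_mask + 1);     /* 0x40000 */
        return 0;
}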
4559 4564
4560 /* Find the lowest pfn for a node */ 4565 /* Find the lowest pfn for a node */
4561 static unsigned long __init find_min_pfn_for_node(int nid) 4566 static unsigned long __init find_min_pfn_for_node(int nid)
4562 { 4567 {
4563 unsigned long min_pfn = ULONG_MAX; 4568 unsigned long min_pfn = ULONG_MAX;
4564 unsigned long start_pfn; 4569 unsigned long start_pfn;
4565 int i; 4570 int i;
4566 4571
4567 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 4572 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4568 min_pfn = min(min_pfn, start_pfn); 4573 min_pfn = min(min_pfn, start_pfn);
4569 4574
4570 if (min_pfn == ULONG_MAX) { 4575 if (min_pfn == ULONG_MAX) {
4571 printk(KERN_WARNING 4576 printk(KERN_WARNING
4572 "Could not find start_pfn for node %d\n", nid); 4577 "Could not find start_pfn for node %d\n", nid);
4573 return 0; 4578 return 0;
4574 } 4579 }
4575 4580
4576 return min_pfn; 4581 return min_pfn;
4577 } 4582 }
4578 4583
4579 /** 4584 /**
4580 * find_min_pfn_with_active_regions - Find the minimum PFN registered 4585 * find_min_pfn_with_active_regions - Find the minimum PFN registered
4581 * 4586 *
4582 * It returns the minimum PFN based on information provided via 4587 * It returns the minimum PFN based on information provided via
4583 * add_active_range(). 4588 * add_active_range().
4584 */ 4589 */
4585 unsigned long __init find_min_pfn_with_active_regions(void) 4590 unsigned long __init find_min_pfn_with_active_regions(void)
4586 { 4591 {
4587 return find_min_pfn_for_node(MAX_NUMNODES); 4592 return find_min_pfn_for_node(MAX_NUMNODES);
4588 } 4593 }
4589 4594
4590 /* 4595 /*
4591 * early_calculate_totalpages() 4596 * early_calculate_totalpages()
4592 * Sum pages in active regions for movable zone. 4597 * Sum pages in active regions for movable zone.
4593 * Populate N_HIGH_MEMORY for calculating usable_nodes. 4598 * Populate N_HIGH_MEMORY for calculating usable_nodes.
4594 */ 4599 */
4595 static unsigned long __init early_calculate_totalpages(void) 4600 static unsigned long __init early_calculate_totalpages(void)
4596 { 4601 {
4597 unsigned long totalpages = 0; 4602 unsigned long totalpages = 0;
4598 unsigned long start_pfn, end_pfn; 4603 unsigned long start_pfn, end_pfn;
4599 int i, nid; 4604 int i, nid;
4600 4605
4601 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 4606 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4602 unsigned long pages = end_pfn - start_pfn; 4607 unsigned long pages = end_pfn - start_pfn;
4603 4608
4604 totalpages += pages; 4609 totalpages += pages;
4605 if (pages) 4610 if (pages)
4606 node_set_state(nid, N_HIGH_MEMORY); 4611 node_set_state(nid, N_HIGH_MEMORY);
4607 } 4612 }
4608 return totalpages; 4613 return totalpages;
4609 } 4614 }
4610 4615
4611 /* 4616 /*
4612 * Find the PFN the Movable zone begins in each node. Kernel memory 4617 * Find the PFN the Movable zone begins in each node. Kernel memory
4613 * is spread evenly between nodes as long as the nodes have enough 4618 * is spread evenly between nodes as long as the nodes have enough
4614 * memory. When they don't, some nodes will have more kernelcore than 4619 * memory. When they don't, some nodes will have more kernelcore than
4615 * others 4620 * others
4616 */ 4621 */
4617 static void __init find_zone_movable_pfns_for_nodes(void) 4622 static void __init find_zone_movable_pfns_for_nodes(void)
4618 { 4623 {
4619 int i, nid; 4624 int i, nid;
4620 unsigned long usable_startpfn; 4625 unsigned long usable_startpfn;
4621 unsigned long kernelcore_node, kernelcore_remaining; 4626 unsigned long kernelcore_node, kernelcore_remaining;
4622 /* save the state before borrowing the nodemask */ 4627 /* save the state before borrowing the nodemask */
4623 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; 4628 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
4624 unsigned long totalpages = early_calculate_totalpages(); 4629 unsigned long totalpages = early_calculate_totalpages();
4625 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4630 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
4626 4631
4627 /* 4632 /*
4628 * If movablecore was specified, calculate what size of 4633 * If movablecore was specified, calculate what size of
4629 * kernelcore that corresponds so that memory usable for 4634 * kernelcore that corresponds so that memory usable for
4630 * any allocation type is evenly spread. If both kernelcore 4635 * any allocation type is evenly spread. If both kernelcore
4631 * and movablecore are specified, then the value of kernelcore 4636 * and movablecore are specified, then the value of kernelcore
4632 * will be used for required_kernelcore if it's greater than 4637 * will be used for required_kernelcore if it's greater than
4633 * what movablecore would have allowed. 4638 * what movablecore would have allowed.
4634 */ 4639 */
4635 if (required_movablecore) { 4640 if (required_movablecore) {
4636 unsigned long corepages; 4641 unsigned long corepages;
4637 4642
4638 /* 4643 /*
4639 * Round-up so that ZONE_MOVABLE is at least as large as what 4644 * Round-up so that ZONE_MOVABLE is at least as large as what
4640 * was requested by the user 4645 * was requested by the user
4641 */ 4646 */
4642 required_movablecore = 4647 required_movablecore =
4643 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 4648 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
4644 corepages = totalpages - required_movablecore; 4649 corepages = totalpages - required_movablecore;
4645 4650
4646 required_kernelcore = max(required_kernelcore, corepages); 4651 required_kernelcore = max(required_kernelcore, corepages);
4647 } 4652 }
4648 4653
4649 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4654 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
4650 if (!required_kernelcore) 4655 if (!required_kernelcore)
4651 goto out; 4656 goto out;
4652 4657
4653 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4658 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4654 find_usable_zone_for_movable(); 4659 find_usable_zone_for_movable();
4655 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 4660 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4656 4661
4657 restart: 4662 restart:
4658 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4663 /* Spread kernelcore memory as evenly as possible throughout nodes */
4659 kernelcore_node = required_kernelcore / usable_nodes; 4664 kernelcore_node = required_kernelcore / usable_nodes;
4660 for_each_node_state(nid, N_HIGH_MEMORY) { 4665 for_each_node_state(nid, N_HIGH_MEMORY) {
4661 unsigned long start_pfn, end_pfn; 4666 unsigned long start_pfn, end_pfn;
4662 4667
4663 /* 4668 /*
4664 * Recalculate kernelcore_node if the division per node 4669 * Recalculate kernelcore_node if the division per node
4665 * now exceeds what is necessary to satisfy the requested 4670 * now exceeds what is necessary to satisfy the requested
4666 * amount of memory for the kernel 4671 * amount of memory for the kernel
4667 */ 4672 */
4668 if (required_kernelcore < kernelcore_node) 4673 if (required_kernelcore < kernelcore_node)
4669 kernelcore_node = required_kernelcore / usable_nodes; 4674 kernelcore_node = required_kernelcore / usable_nodes;
4670 4675
4671 /* 4676 /*
4672 * As the map is walked, we track how much memory is usable 4677 * As the map is walked, we track how much memory is usable
4673 * by the kernel using kernelcore_remaining. When it is 4678 * by the kernel using kernelcore_remaining. When it is
4674 * 0, the rest of the node is usable by ZONE_MOVABLE 4679 * 0, the rest of the node is usable by ZONE_MOVABLE
4675 */ 4680 */
4676 kernelcore_remaining = kernelcore_node; 4681 kernelcore_remaining = kernelcore_node;
4677 4682
4678 /* Go through each range of PFNs within this node */ 4683 /* Go through each range of PFNs within this node */
4679 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4684 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4680 unsigned long size_pages; 4685 unsigned long size_pages;
4681 4686
4682 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 4687 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4683 if (start_pfn >= end_pfn) 4688 if (start_pfn >= end_pfn)
4684 continue; 4689 continue;
4685 4690
4686 /* Account for what is only usable for kernelcore */ 4691 /* Account for what is only usable for kernelcore */
4687 if (start_pfn < usable_startpfn) { 4692 if (start_pfn < usable_startpfn) {
4688 unsigned long kernel_pages; 4693 unsigned long kernel_pages;
4689 kernel_pages = min(end_pfn, usable_startpfn) 4694 kernel_pages = min(end_pfn, usable_startpfn)
4690 - start_pfn; 4695 - start_pfn;
4691 4696
4692 kernelcore_remaining -= min(kernel_pages, 4697 kernelcore_remaining -= min(kernel_pages,
4693 kernelcore_remaining); 4698 kernelcore_remaining);
4694 required_kernelcore -= min(kernel_pages, 4699 required_kernelcore -= min(kernel_pages,
4695 required_kernelcore); 4700 required_kernelcore);
4696 4701
4697 /* Continue if range is now fully accounted */ 4702 /* Continue if range is now fully accounted */
4698 if (end_pfn <= usable_startpfn) { 4703 if (end_pfn <= usable_startpfn) {
4699 4704
4700 /* 4705 /*
4701 * Push zone_movable_pfn to the end so 4706 * Push zone_movable_pfn to the end so
4702 * that if we have to rebalance 4707 * that if we have to rebalance
4703 * kernelcore across nodes, we will 4708 * kernelcore across nodes, we will
4704 * not double account here 4709 * not double account here
4705 */ 4710 */
4706 zone_movable_pfn[nid] = end_pfn; 4711 zone_movable_pfn[nid] = end_pfn;
4707 continue; 4712 continue;
4708 } 4713 }
4709 start_pfn = usable_startpfn; 4714 start_pfn = usable_startpfn;
4710 } 4715 }
4711 4716
4712 /* 4717 /*
4713 * The usable PFN range for ZONE_MOVABLE is from 4718 * The usable PFN range for ZONE_MOVABLE is from
4714 * start_pfn->end_pfn. Calculate size_pages as the 4719 * start_pfn->end_pfn. Calculate size_pages as the
4715 * number of pages used as kernelcore 4720 * number of pages used as kernelcore
4716 */ 4721 */
4717 size_pages = end_pfn - start_pfn; 4722 size_pages = end_pfn - start_pfn;
4718 if (size_pages > kernelcore_remaining) 4723 if (size_pages > kernelcore_remaining)
4719 size_pages = kernelcore_remaining; 4724 size_pages = kernelcore_remaining;
4720 zone_movable_pfn[nid] = start_pfn + size_pages; 4725 zone_movable_pfn[nid] = start_pfn + size_pages;
4721 4726
4722 /* 4727 /*
4723 * Some kernelcore has been met, update counts and 4728 * Some kernelcore has been met, update counts and
4724 * break if the kernelcore for this node has been 4729 * break if the kernelcore for this node has been
4725 * satisfied 4730 * satisfied
4726 */ 4731 */
4727 required_kernelcore -= min(required_kernelcore, 4732 required_kernelcore -= min(required_kernelcore,
4728 size_pages); 4733 size_pages);
4729 kernelcore_remaining -= size_pages; 4734 kernelcore_remaining -= size_pages;
4730 if (!kernelcore_remaining) 4735 if (!kernelcore_remaining)
4731 break; 4736 break;
4732 } 4737 }
4733 } 4738 }
4734 4739
4735 /* 4740 /*
4736 * If there is still required_kernelcore, we do another pass with one 4741 * If there is still required_kernelcore, we do another pass with one
4737 * less node in the count. This will push zone_movable_pfn[nid] further 4742 * less node in the count. This will push zone_movable_pfn[nid] further
4738 * along on the nodes that still have memory until kernelcore is 4743 * along on the nodes that still have memory until kernelcore is
4739 * satisfied 4744 * satisfied
4740 */ 4745 */
4741 usable_nodes--; 4746 usable_nodes--;
4742 if (usable_nodes && required_kernelcore > usable_nodes) 4747 if (usable_nodes && required_kernelcore > usable_nodes)
4743 goto restart; 4748 goto restart;
4744 4749
4745 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 4750 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4746 for (nid = 0; nid < MAX_NUMNODES; nid++) 4751 for (nid = 0; nid < MAX_NUMNODES; nid++)
4747 zone_movable_pfn[nid] = 4752 zone_movable_pfn[nid] =
4748 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4753 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4749 4754
4750 out: 4755 out:
4751 /* restore the node_state */ 4756 /* restore the node_state */
4752 node_states[N_HIGH_MEMORY] = saved_node_state; 4757 node_states[N_HIGH_MEMORY] = saved_node_state;
4753 } 4758 }
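As a rough worked example (all numbers are invented, not taken from this commit): with kernelcore=2G on a two-node machine with 4GB per node, usable_nodes is 2 and kernelcore_node starts out at 1GB per node. The first pass walks each node's PFN ranges, charges roughly 1GB per node to the kernel, and leaves zone_movable_pfn[nid] pointing about 1GB into each node (later rounded up to MAX_ORDER_NR_PAGES), so the remaining ~3GB per node becomes ZONE_MOVABLE. Only when a node is too small to take its full share does the restart label run another pass with one fewer usable node.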
4754 4759
4755 /* Any regular memory on that node? */ 4760 /* Any regular memory on that node? */
4756 static void check_for_regular_memory(pg_data_t *pgdat) 4761 static void check_for_regular_memory(pg_data_t *pgdat)
4757 { 4762 {
4758 #ifdef CONFIG_HIGHMEM 4763 #ifdef CONFIG_HIGHMEM
4759 enum zone_type zone_type; 4764 enum zone_type zone_type;
4760 4765
4761 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4766 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
4762 struct zone *zone = &pgdat->node_zones[zone_type]; 4767 struct zone *zone = &pgdat->node_zones[zone_type];
4763 if (zone->present_pages) { 4768 if (zone->present_pages) {
4764 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4769 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
4765 break; 4770 break;
4766 } 4771 }
4767 } 4772 }
4768 #endif 4773 #endif
4769 } 4774 }
4770 4775
4771 /** 4776 /**
4772 * free_area_init_nodes - Initialise all pg_data_t and zone data 4777 * free_area_init_nodes - Initialise all pg_data_t and zone data
4773 * @max_zone_pfn: an array of max PFNs for each zone 4778 * @max_zone_pfn: an array of max PFNs for each zone
4774 * 4779 *
4775 * This will call free_area_init_node() for each active node in the system. 4780 * This will call free_area_init_node() for each active node in the system.
4776 * Using the page ranges provided by add_active_range(), the size of each 4781 * Using the page ranges provided by add_active_range(), the size of each
4777 * zone in each node and their holes is calculated. If the maximum PFN 4782 * zone in each node and their holes is calculated. If the maximum PFN
4778 * between two adjacent zones match, it is assumed that the zone is empty. 4783 * between two adjacent zones match, it is assumed that the zone is empty.
4779 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 4784 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
4780 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 4785 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
4781 * starts where the previous one ended. For example, ZONE_DMA32 starts 4786 * starts where the previous one ended. For example, ZONE_DMA32 starts
4782 * at arch_max_dma_pfn. 4787 * at arch_max_dma_pfn.
4783 */ 4788 */
4784 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 4789 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4785 { 4790 {
4786 unsigned long start_pfn, end_pfn; 4791 unsigned long start_pfn, end_pfn;
4787 int i, nid; 4792 int i, nid;
4788 4793
4789 /* Record where the zone boundaries are */ 4794 /* Record where the zone boundaries are */
4790 memset(arch_zone_lowest_possible_pfn, 0, 4795 memset(arch_zone_lowest_possible_pfn, 0,
4791 sizeof(arch_zone_lowest_possible_pfn)); 4796 sizeof(arch_zone_lowest_possible_pfn));
4792 memset(arch_zone_highest_possible_pfn, 0, 4797 memset(arch_zone_highest_possible_pfn, 0,
4793 sizeof(arch_zone_highest_possible_pfn)); 4798 sizeof(arch_zone_highest_possible_pfn));
4794 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 4799 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
4795 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 4800 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
4796 for (i = 1; i < MAX_NR_ZONES; i++) { 4801 for (i = 1; i < MAX_NR_ZONES; i++) {
4797 if (i == ZONE_MOVABLE) 4802 if (i == ZONE_MOVABLE)
4798 continue; 4803 continue;
4799 arch_zone_lowest_possible_pfn[i] = 4804 arch_zone_lowest_possible_pfn[i] =
4800 arch_zone_highest_possible_pfn[i-1]; 4805 arch_zone_highest_possible_pfn[i-1];
4801 arch_zone_highest_possible_pfn[i] = 4806 arch_zone_highest_possible_pfn[i] =
4802 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 4807 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
4803 } 4808 }
4804 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 4809 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
4805 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 4810 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
4806 4811
4807 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 4812 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4808 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 4813 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4809 find_zone_movable_pfns_for_nodes(); 4814 find_zone_movable_pfns_for_nodes();
4810 4815
4811 /* Print out the zone ranges */ 4816 /* Print out the zone ranges */
4812 printk("Zone ranges:\n"); 4817 printk("Zone ranges:\n");
4813 for (i = 0; i < MAX_NR_ZONES; i++) { 4818 for (i = 0; i < MAX_NR_ZONES; i++) {
4814 if (i == ZONE_MOVABLE) 4819 if (i == ZONE_MOVABLE)
4815 continue; 4820 continue;
4816 printk(KERN_CONT " %-8s ", zone_names[i]); 4821 printk(KERN_CONT " %-8s ", zone_names[i]);
4817 if (arch_zone_lowest_possible_pfn[i] == 4822 if (arch_zone_lowest_possible_pfn[i] ==
4818 arch_zone_highest_possible_pfn[i]) 4823 arch_zone_highest_possible_pfn[i])
4819 printk(KERN_CONT "empty\n"); 4824 printk(KERN_CONT "empty\n");
4820 else 4825 else
4821 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 4826 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
4822 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 4827 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
4823 (arch_zone_highest_possible_pfn[i] 4828 (arch_zone_highest_possible_pfn[i]
4824 << PAGE_SHIFT) - 1); 4829 << PAGE_SHIFT) - 1);
4825 } 4830 }
4826 4831
4827 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 4832 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
4828 printk("Movable zone start for each node\n"); 4833 printk("Movable zone start for each node\n");
4829 for (i = 0; i < MAX_NUMNODES; i++) { 4834 for (i = 0; i < MAX_NUMNODES; i++) {
4830 if (zone_movable_pfn[i]) 4835 if (zone_movable_pfn[i])
4831 printk(" Node %d: %#010lx\n", i, 4836 printk(" Node %d: %#010lx\n", i,
4832 zone_movable_pfn[i] << PAGE_SHIFT); 4837 zone_movable_pfn[i] << PAGE_SHIFT);
4833 } 4838 }
4834 4839
4835 /* Print out the early_node_map[] */ 4840 /* Print out the early_node_map[] */
4836 printk("Early memory node ranges\n"); 4841 printk("Early memory node ranges\n");
4837 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4842 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4838 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 4843 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
4839 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 4844 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
4840 4845
4841 /* Initialise every node */ 4846 /* Initialise every node */
4842 mminit_verify_pageflags_layout(); 4847 mminit_verify_pageflags_layout();
4843 setup_nr_node_ids(); 4848 setup_nr_node_ids();
4844 for_each_online_node(nid) { 4849 for_each_online_node(nid) {
4845 pg_data_t *pgdat = NODE_DATA(nid); 4850 pg_data_t *pgdat = NODE_DATA(nid);
4846 free_area_init_node(nid, NULL, 4851 free_area_init_node(nid, NULL,
4847 find_min_pfn_for_node(nid), NULL); 4852 find_min_pfn_for_node(nid), NULL);
4848 4853
4849 /* Any memory on that node */ 4854 /* Any memory on that node */
4850 if (pgdat->node_present_pages) 4855 if (pgdat->node_present_pages)
4851 node_set_state(nid, N_HIGH_MEMORY); 4856 node_set_state(nid, N_HIGH_MEMORY);
4852 check_for_regular_memory(pgdat); 4857 check_for_regular_memory(pgdat);
4853 } 4858 }
4854 } 4859 }
4855 4860
4856 static int __init cmdline_parse_core(char *p, unsigned long *core) 4861 static int __init cmdline_parse_core(char *p, unsigned long *core)
4857 { 4862 {
4858 unsigned long long coremem; 4863 unsigned long long coremem;
4859 if (!p) 4864 if (!p)
4860 return -EINVAL; 4865 return -EINVAL;
4861 4866
4862 coremem = memparse(p, &p); 4867 coremem = memparse(p, &p);
4863 *core = coremem >> PAGE_SHIFT; 4868 *core = coremem >> PAGE_SHIFT;
4864 4869
4865 /* Paranoid check that UL is enough for the coremem value */ 4870 /* Paranoid check that UL is enough for the coremem value */
4866 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 4871 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
4867 4872
4868 return 0; 4873 return 0;
4869 } 4874 }
4870 4875
4871 /* 4876 /*
4872 * kernelcore=size sets the amount of memory for use for allocations that 4877 * kernelcore=size sets the amount of memory for use for allocations that
4873 * cannot be reclaimed or migrated. 4878 * cannot be reclaimed or migrated.
4874 */ 4879 */
4875 static int __init cmdline_parse_kernelcore(char *p) 4880 static int __init cmdline_parse_kernelcore(char *p)
4876 { 4881 {
4877 return cmdline_parse_core(p, &required_kernelcore); 4882 return cmdline_parse_core(p, &required_kernelcore);
4878 } 4883 }
4879 4884
4880 /* 4885 /*
4881 * movablecore=size sets the amount of memory for use for allocations that 4886 * movablecore=size sets the amount of memory for use for allocations that
4882 * can be reclaimed or migrated. 4887 * can be reclaimed or migrated.
4883 */ 4888 */
4884 static int __init cmdline_parse_movablecore(char *p) 4889 static int __init cmdline_parse_movablecore(char *p)
4885 { 4890 {
4886 return cmdline_parse_core(p, &required_movablecore); 4891 return cmdline_parse_core(p, &required_movablecore);
4887 } 4892 }
4888 4893
4889 early_param("kernelcore", cmdline_parse_kernelcore); 4894 early_param("kernelcore", cmdline_parse_kernelcore);
4890 early_param("movablecore", cmdline_parse_movablecore); 4895 early_param("movablecore", cmdline_parse_movablecore);
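For illustration only (the sizes are arbitrary): booting with kernelcore=512M asks that roughly 512MB, spread across the nodes, remain usable for unmovable kernel allocations, with the rest of each node ending up in ZONE_MOVABLE; booting with movablecore=2G instead asks for at least 2GB of ZONE_MOVABLE and lets find_zone_movable_pfns_for_nodes() derive the implied kernelcore from total memory. memparse() accepts the usual K/M/G suffixes, and cmdline_parse_core() converts the byte count to pages.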
4891 4896
4892 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4897 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4893 4898
4894 /** 4899 /**
4895 * set_dma_reserve - set the specified number of pages reserved in the first zone 4900 * set_dma_reserve - set the specified number of pages reserved in the first zone
4896 * @new_dma_reserve: The number of pages to mark reserved 4901 * @new_dma_reserve: The number of pages to mark reserved
4897 * 4902 *
4898 * The per-cpu batchsize and zone watermarks are determined by present_pages. 4903 * The per-cpu batchsize and zone watermarks are determined by present_pages.
4899 * In the DMA zone, a significant percentage may be consumed by kernel image 4904 * In the DMA zone, a significant percentage may be consumed by kernel image
4900 * and other unfreeable allocations which can skew the watermarks badly. This 4905 * and other unfreeable allocations which can skew the watermarks badly. This
4901 * function may optionally be used to account for unfreeable pages in the 4906 * function may optionally be used to account for unfreeable pages in the
4902 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 4907 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
4903 * smaller per-cpu batchsize. 4908 * smaller per-cpu batchsize.
4904 */ 4909 */
4905 void __init set_dma_reserve(unsigned long new_dma_reserve) 4910 void __init set_dma_reserve(unsigned long new_dma_reserve)
4906 { 4911 {
4907 dma_reserve = new_dma_reserve; 4912 dma_reserve = new_dma_reserve;
4908 } 4913 }
4909 4914
4910 void __init free_area_init(unsigned long *zones_size) 4915 void __init free_area_init(unsigned long *zones_size)
4911 { 4916 {
4912 free_area_init_node(0, zones_size, 4917 free_area_init_node(0, zones_size,
4913 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4918 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4914 } 4919 }
4915 4920
4916 static int page_alloc_cpu_notify(struct notifier_block *self, 4921 static int page_alloc_cpu_notify(struct notifier_block *self,
4917 unsigned long action, void *hcpu) 4922 unsigned long action, void *hcpu)
4918 { 4923 {
4919 int cpu = (unsigned long)hcpu; 4924 int cpu = (unsigned long)hcpu;
4920 4925
4921 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 4926 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4922 lru_add_drain_cpu(cpu); 4927 lru_add_drain_cpu(cpu);
4923 drain_pages(cpu); 4928 drain_pages(cpu);
4924 4929
4925 /* 4930 /*
4926 * Spill the event counters of the dead processor 4931 * Spill the event counters of the dead processor
4927 * into the current processors event counters. 4932 * into the current processors event counters.
4928 * This artificially elevates the count of the current 4933 * This artificially elevates the count of the current
4929 * processor. 4934 * processor.
4930 */ 4935 */
4931 vm_events_fold_cpu(cpu); 4936 vm_events_fold_cpu(cpu);
4932 4937
4933 /* 4938 /*
4934 * Zero the differential counters of the dead processor 4939 * Zero the differential counters of the dead processor
4935 * so that the vm statistics are consistent. 4940 * so that the vm statistics are consistent.
4936 * 4941 *
4937 * This is only okay since the processor is dead and cannot 4942 * This is only okay since the processor is dead and cannot
4938 * race with what we are doing. 4943 * race with what we are doing.
4939 */ 4944 */
4940 refresh_cpu_vm_stats(cpu); 4945 refresh_cpu_vm_stats(cpu);
4941 } 4946 }
4942 return NOTIFY_OK; 4947 return NOTIFY_OK;
4943 } 4948 }
4944 4949
4945 void __init page_alloc_init(void) 4950 void __init page_alloc_init(void)
4946 { 4951 {
4947 hotcpu_notifier(page_alloc_cpu_notify, 0); 4952 hotcpu_notifier(page_alloc_cpu_notify, 0);
4948 } 4953 }
4949 4954
4950 /* 4955 /*
4951 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 4956 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
4952 * or min_free_kbytes changes. 4957 * or min_free_kbytes changes.
4953 */ 4958 */
4954 static void calculate_totalreserve_pages(void) 4959 static void calculate_totalreserve_pages(void)
4955 { 4960 {
4956 struct pglist_data *pgdat; 4961 struct pglist_data *pgdat;
4957 unsigned long reserve_pages = 0; 4962 unsigned long reserve_pages = 0;
4958 enum zone_type i, j; 4963 enum zone_type i, j;
4959 4964
4960 for_each_online_pgdat(pgdat) { 4965 for_each_online_pgdat(pgdat) {
4961 for (i = 0; i < MAX_NR_ZONES; i++) { 4966 for (i = 0; i < MAX_NR_ZONES; i++) {
4962 struct zone *zone = pgdat->node_zones + i; 4967 struct zone *zone = pgdat->node_zones + i;
4963 unsigned long max = 0; 4968 unsigned long max = 0;
4964 4969
4965 /* Find valid and maximum lowmem_reserve in the zone */ 4970 /* Find valid and maximum lowmem_reserve in the zone */
4966 for (j = i; j < MAX_NR_ZONES; j++) { 4971 for (j = i; j < MAX_NR_ZONES; j++) {
4967 if (zone->lowmem_reserve[j] > max) 4972 if (zone->lowmem_reserve[j] > max)
4968 max = zone->lowmem_reserve[j]; 4973 max = zone->lowmem_reserve[j];
4969 } 4974 }
4970 4975
4971 /* we treat the high watermark as reserved pages. */ 4976 /* we treat the high watermark as reserved pages. */
4972 max += high_wmark_pages(zone); 4977 max += high_wmark_pages(zone);
4973 4978
4974 if (max > zone->present_pages) 4979 if (max > zone->present_pages)
4975 max = zone->present_pages; 4980 max = zone->present_pages;
4976 reserve_pages += max; 4981 reserve_pages += max;
4977 /* 4982 /*
4978 * Lowmem reserves are not available to 4983 * Lowmem reserves are not available to
4979 * GFP_HIGHUSER page cache allocations and 4984 * GFP_HIGHUSER page cache allocations and
4980 * kswapd tries to balance zones to their high 4985 * kswapd tries to balance zones to their high
4981 * watermark. As a result, neither should be 4986 * watermark. As a result, neither should be
4982 * regarded as dirtyable memory, to prevent a 4987 * regarded as dirtyable memory, to prevent a
4983 * situation where reclaim has to clean pages 4988 * situation where reclaim has to clean pages
4984 * in order to balance the zones. 4989 * in order to balance the zones.
4985 */ 4990 */
4986 zone->dirty_balance_reserve = max; 4991 zone->dirty_balance_reserve = max;
4987 } 4992 }
4988 } 4993 }
4989 dirty_balance_reserve = reserve_pages; 4994 dirty_balance_reserve = reserve_pages;
4990 totalreserve_pages = reserve_pages; 4995 totalreserve_pages = reserve_pages;
4991 } 4996 }
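A quick worked example with invented numbers: if a zone's largest lowmem_reserve[] entry is 16240 pages and its high watermark is 2304 pages, and its present_pages is larger than their sum, the zone contributes 16240 + 2304 = 18544 pages to totalreserve_pages, and that same figure becomes its dirty_balance_reserve.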
4992 4997
4993 /* 4998 /*
4994 * setup_per_zone_lowmem_reserve - called whenever 4999 * setup_per_zone_lowmem_reserve - called whenever
4995 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 5000 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
4996 * has a correct pages reserved value, so an adequate number of 5001 * has a correct pages reserved value, so an adequate number of
4997 * pages are left in the zone after a successful __alloc_pages(). 5002 * pages are left in the zone after a successful __alloc_pages().
4998 */ 5003 */
4999 static void setup_per_zone_lowmem_reserve(void) 5004 static void setup_per_zone_lowmem_reserve(void)
5000 { 5005 {
5001 struct pglist_data *pgdat; 5006 struct pglist_data *pgdat;
5002 enum zone_type j, idx; 5007 enum zone_type j, idx;
5003 5008
5004 for_each_online_pgdat(pgdat) { 5009 for_each_online_pgdat(pgdat) {
5005 for (j = 0; j < MAX_NR_ZONES; j++) { 5010 for (j = 0; j < MAX_NR_ZONES; j++) {
5006 struct zone *zone = pgdat->node_zones + j; 5011 struct zone *zone = pgdat->node_zones + j;
5007 unsigned long present_pages = zone->present_pages; 5012 unsigned long present_pages = zone->present_pages;
5008 5013
5009 zone->lowmem_reserve[j] = 0; 5014 zone->lowmem_reserve[j] = 0;
5010 5015
5011 idx = j; 5016 idx = j;
5012 while (idx) { 5017 while (idx) {
5013 struct zone *lower_zone; 5018 struct zone *lower_zone;
5014 5019
5015 idx--; 5020 idx--;
5016 5021
5017 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5022 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5018 sysctl_lowmem_reserve_ratio[idx] = 1; 5023 sysctl_lowmem_reserve_ratio[idx] = 1;
5019 5024
5020 lower_zone = pgdat->node_zones + idx; 5025 lower_zone = pgdat->node_zones + idx;
5021 lower_zone->lowmem_reserve[j] = present_pages / 5026 lower_zone->lowmem_reserve[j] = present_pages /
5022 sysctl_lowmem_reserve_ratio[idx]; 5027 sysctl_lowmem_reserve_ratio[idx];
5023 present_pages += lower_zone->present_pages; 5028 present_pages += lower_zone->present_pages;
5024 } 5029 }
5025 } 5030 }
5026 } 5031 }
5027 5032
5028 /* update totalreserve_pages */ 5033 /* update totalreserve_pages */
5029 calculate_totalreserve_pages(); 5034 calculate_totalreserve_pages();
5030 } 5035 }
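For readers who want to see the reserve arithmetic in isolation, the following standalone user-space sketch (not kernel code; the zone sizes and ratios below are invented for illustration) mirrors the nested loop in setup_per_zone_lowmem_reserve():

#include <stdio.h>

int main(void)
{
	/* invented example zones: DMA, DMA32, Normal present pages */
	unsigned long present[3] = { 3998, 487424, 3670016 };
	/* example sysctl_lowmem_reserve_ratio values */
	long ratio[3] = { 256, 256, 32 };
	const char *name[3] = { "DMA", "DMA32", "Normal" };

	for (int j = 0; j < 3; j++) {
		unsigned long pages = present[j];	/* zone j's own pages */

		for (int idx = j - 1; idx >= 0; idx--) {
			/* lower_zone->lowmem_reserve[j] = present_pages / ratio[idx] */
			printf("%-6s reserves %7lu pages against %s allocations\n",
			       name[idx], pages / ratio[idx], name[j]);
			pages += present[idx];	/* accumulate, as the kernel loop does */
		}
	}
	return 0;
}

Run as an ordinary C program, it prints, for each higher zone, how many pages every lower zone keeps back from allocations that could also have been satisfied from the higher zone.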
5031 5036
5032 static void __setup_per_zone_wmarks(void) 5037 static void __setup_per_zone_wmarks(void)
5033 { 5038 {
5034 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5039 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5035 unsigned long lowmem_pages = 0; 5040 unsigned long lowmem_pages = 0;
5036 struct zone *zone; 5041 struct zone *zone;
5037 unsigned long flags; 5042 unsigned long flags;
5038 5043
5039 /* Calculate total number of !ZONE_HIGHMEM pages */ 5044 /* Calculate total number of !ZONE_HIGHMEM pages */
5040 for_each_zone(zone) { 5045 for_each_zone(zone) {
5041 if (!is_highmem(zone)) 5046 if (!is_highmem(zone))
5042 lowmem_pages += zone->present_pages; 5047 lowmem_pages += zone->present_pages;
5043 } 5048 }
5044 5049
5045 for_each_zone(zone) { 5050 for_each_zone(zone) {
5046 u64 tmp; 5051 u64 tmp;
5047 5052
5048 spin_lock_irqsave(&zone->lock, flags); 5053 spin_lock_irqsave(&zone->lock, flags);
5049 tmp = (u64)pages_min * zone->present_pages; 5054 tmp = (u64)pages_min * zone->present_pages;
5050 do_div(tmp, lowmem_pages); 5055 do_div(tmp, lowmem_pages);
5051 if (is_highmem(zone)) { 5056 if (is_highmem(zone)) {
5052 /* 5057 /*
5053 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5058 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5054 * need highmem pages, so cap pages_min to a small 5059 * need highmem pages, so cap pages_min to a small
5055 * value here. 5060 * value here.
5056 * 5061 *
5057 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5062 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5058 * deltas control asynchronous page reclaim, and so should 5063 * deltas control asynchronous page reclaim, and so should
5059 * not be capped for highmem. 5064 * not be capped for highmem.
5060 */ 5065 */
5061 int min_pages; 5066 int min_pages;
5062 5067
5063 min_pages = zone->present_pages / 1024; 5068 min_pages = zone->present_pages / 1024;
5064 if (min_pages < SWAP_CLUSTER_MAX) 5069 if (min_pages < SWAP_CLUSTER_MAX)
5065 min_pages = SWAP_CLUSTER_MAX; 5070 min_pages = SWAP_CLUSTER_MAX;
5066 if (min_pages > 128) 5071 if (min_pages > 128)
5067 min_pages = 128; 5072 min_pages = 128;
5068 zone->watermark[WMARK_MIN] = min_pages; 5073 zone->watermark[WMARK_MIN] = min_pages;
5069 } else { 5074 } else {
5070 /* 5075 /*
5071 * If it's a lowmem zone, reserve a number of pages 5076 * If it's a lowmem zone, reserve a number of pages
5072 * proportionate to the zone's size. 5077 * proportionate to the zone's size.
5073 */ 5078 */
5074 zone->watermark[WMARK_MIN] = tmp; 5079 zone->watermark[WMARK_MIN] = tmp;
5075 } 5080 }
5076 5081
5077 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5082 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5078 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5083 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5079 5084
5080 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); 5085 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5081 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); 5086 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5082 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); 5087 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5083 5088
5084 setup_zone_migrate_reserve(zone); 5089 setup_zone_migrate_reserve(zone);
5085 spin_unlock_irqrestore(&zone->lock, flags); 5090 spin_unlock_irqrestore(&zone->lock, flags);
5086 } 5091 }
5087 5092
5088 /* update totalreserve_pages */ 5093 /* update totalreserve_pages */
5089 calculate_totalreserve_pages(); 5094 calculate_totalreserve_pages();
5090 } 5095 }
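A minimal user-space sketch of the lowmem branch of the calculation above (min_free_kbytes and the zone sizes are invented; the highmem capping and cma_wmark_pages() adjustments are ignored):

#include <stdio.h>

int main(void)
{
	unsigned long min_free_kbytes = 8192;	/* example sysctl value */
	unsigned long page_shift = 12;		/* 4KB pages */
	unsigned long pages_min = min_free_kbytes >> (page_shift - 10);
	unsigned long lowmem_pages = 1048576;	/* all !highmem pages, ~4GB */
	unsigned long zone_present = 786432;	/* this zone's pages, ~3GB */

	/* tmp = pages_min * zone->present_pages / lowmem_pages */
	unsigned long long tmp = (unsigned long long)pages_min * zone_present / lowmem_pages;

	printf("WMARK_MIN  = %llu pages\n", tmp);		/* 1536 */
	printf("WMARK_LOW  = %llu pages\n", tmp + (tmp >> 2));	/* min + min/4 = 1920 */
	printf("WMARK_HIGH = %llu pages\n", tmp + (tmp >> 1));	/* min + min/2 = 2304 */
	return 0;
}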
5091 5096
5092 /** 5097 /**
5093 * setup_per_zone_wmarks - called when min_free_kbytes changes 5098 * setup_per_zone_wmarks - called when min_free_kbytes changes
5094 * or when memory is hot-{added|removed} 5099 * or when memory is hot-{added|removed}
5095 * 5100 *
5096 * Ensures that the watermark[min,low,high] values for each zone are set 5101 * Ensures that the watermark[min,low,high] values for each zone are set
5097 * correctly with respect to min_free_kbytes. 5102 * correctly with respect to min_free_kbytes.
5098 */ 5103 */
5099 void setup_per_zone_wmarks(void) 5104 void setup_per_zone_wmarks(void)
5100 { 5105 {
5101 mutex_lock(&zonelists_mutex); 5106 mutex_lock(&zonelists_mutex);
5102 __setup_per_zone_wmarks(); 5107 __setup_per_zone_wmarks();
5103 mutex_unlock(&zonelists_mutex); 5108 mutex_unlock(&zonelists_mutex);
5104 } 5109 }
5105 5110
5106 /* 5111 /*
5107 * The inactive anon list should be small enough that the VM never has to 5112 * The inactive anon list should be small enough that the VM never has to
5108 * do too much work, but large enough that each inactive page has a chance 5113 * do too much work, but large enough that each inactive page has a chance
5109 * to be referenced again before it is swapped out. 5114 * to be referenced again before it is swapped out.
5110 * 5115 *
5111 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5116 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5112 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5117 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5113 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5118 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5114 * the anonymous pages are kept on the inactive list. 5119 * the anonymous pages are kept on the inactive list.
5115 * 5120 *
5116 * total target max 5121 * total target max
5117 * memory ratio inactive anon 5122 * memory ratio inactive anon
5118 * ------------------------------------- 5123 * -------------------------------------
5119 * 10MB 1 5MB 5124 * 10MB 1 5MB
5120 * 100MB 1 50MB 5125 * 100MB 1 50MB
5121 * 1GB 3 250MB 5126 * 1GB 3 250MB
5122 * 10GB 10 0.9GB 5127 * 10GB 10 0.9GB
5123 * 100GB 31 3GB 5128 * 100GB 31 3GB
5124 * 1TB 101 10GB 5129 * 1TB 101 10GB
5125 * 10TB 320 32GB 5130 * 10TB 320 32GB
5126 */ 5131 */
5127 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5132 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5128 { 5133 {
5129 unsigned int gb, ratio; 5134 unsigned int gb, ratio;
5130 5135
5131 /* Zone size in gigabytes */ 5136 /* Zone size in gigabytes */
5132 gb = zone->present_pages >> (30 - PAGE_SHIFT); 5137 gb = zone->present_pages >> (30 - PAGE_SHIFT);
5133 if (gb) 5138 if (gb)
5134 ratio = int_sqrt(10 * gb); 5139 ratio = int_sqrt(10 * gb);
5135 else 5140 else
5136 ratio = 1; 5141 ratio = 1;
5137 5142
5138 zone->inactive_ratio = ratio; 5143 zone->inactive_ratio = ratio;
5139 } 5144 }
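For instance, a 16GB zone gives gb = 16 and ratio = int_sqrt(160) = 12, so roughly one anonymous page in thirteen is kept on the inactive list; the 1TB row of the table above works the same way, int_sqrt(10 * 1024) = 101.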
5140 5145
5141 static void __meminit setup_per_zone_inactive_ratio(void) 5146 static void __meminit setup_per_zone_inactive_ratio(void)
5142 { 5147 {
5143 struct zone *zone; 5148 struct zone *zone;
5144 5149
5145 for_each_zone(zone) 5150 for_each_zone(zone)
5146 calculate_zone_inactive_ratio(zone); 5151 calculate_zone_inactive_ratio(zone);
5147 } 5152 }
5148 5153
5149 /* 5154 /*
5150 * Initialise min_free_kbytes. 5155 * Initialise min_free_kbytes.
5151 * 5156 *
5152 * For small machines we want it small (128k min). For large machines 5157 * For small machines we want it small (128k min). For large machines
5153 * we want it large (64MB max). But it is not linear, because network 5158 * we want it large (64MB max). But it is not linear, because network
5154 * bandwidth does not increase linearly with machine size. We use 5159 * bandwidth does not increase linearly with machine size. We use
5155 * 5160 *
5156 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5161 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5157 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5162 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5158 * 5163 *
5159 * which yields 5164 * which yields
5160 * 5165 *
5161 * 16MB: 512k 5166 * 16MB: 512k
5162 * 32MB: 724k 5167 * 32MB: 724k
5163 * 64MB: 1024k 5168 * 64MB: 1024k
5164 * 128MB: 1448k 5169 * 128MB: 1448k
5165 * 256MB: 2048k 5170 * 256MB: 2048k
5166 * 512MB: 2896k 5171 * 512MB: 2896k
5167 * 1024MB: 4096k 5172 * 1024MB: 4096k
5168 * 2048MB: 5792k 5173 * 2048MB: 5792k
5169 * 4096MB: 8192k 5174 * 4096MB: 8192k
5170 * 8192MB: 11584k 5175 * 8192MB: 11584k
5171 * 16384MB: 16384k 5176 * 16384MB: 16384k
5172 */ 5177 */
5173 int __meminit init_per_zone_wmark_min(void) 5178 int __meminit init_per_zone_wmark_min(void)
5174 { 5179 {
5175 unsigned long lowmem_kbytes; 5180 unsigned long lowmem_kbytes;
5176 5181
5177 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5182 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5178 5183
5179 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5184 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5180 if (min_free_kbytes < 128) 5185 if (min_free_kbytes < 128)
5181 min_free_kbytes = 128; 5186 min_free_kbytes = 128;
5182 if (min_free_kbytes > 65536) 5187 if (min_free_kbytes > 65536)
5183 min_free_kbytes = 65536; 5188 min_free_kbytes = 65536;
5184 setup_per_zone_wmarks(); 5189 setup_per_zone_wmarks();
5185 refresh_zone_stat_thresholds(); 5190 refresh_zone_stat_thresholds();
5186 setup_per_zone_lowmem_reserve(); 5191 setup_per_zone_lowmem_reserve();
5187 setup_per_zone_inactive_ratio(); 5192 setup_per_zone_inactive_ratio();
5188 return 0; 5193 return 0;
5189 } 5194 }
5190 module_init(init_per_zone_wmark_min) 5195 module_init(init_per_zone_wmark_min)
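Plugging in one of the rows above: with 4GB of lowmem, lowmem_kbytes = 4 * 1024 * 1024 = 4194304 and int_sqrt(4194304 * 16) = int_sqrt(67108864) = 8192, i.e. min_free_kbytes = 8192k, matching the 4096MB row. The result is then clamped to the [128, 65536] range before the watermarks, per-cpu stat thresholds, lowmem reserves and inactive ratios are recomputed.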
5191 5196
5192 /* 5197 /*
5193 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5198 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5194 * that we can call two helper functions whenever min_free_kbytes 5199 * that we can call two helper functions whenever min_free_kbytes
5195 * changes. 5200 * changes.
5196 */ 5201 */
5197 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5202 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5198 void __user *buffer, size_t *length, loff_t *ppos) 5203 void __user *buffer, size_t *length, loff_t *ppos)
5199 { 5204 {
5200 proc_dointvec(table, write, buffer, length, ppos); 5205 proc_dointvec(table, write, buffer, length, ppos);
5201 if (write) 5206 if (write)
5202 setup_per_zone_wmarks(); 5207 setup_per_zone_wmarks();
5203 return 0; 5208 return 0;
5204 } 5209 }
5205 5210
5206 #ifdef CONFIG_NUMA 5211 #ifdef CONFIG_NUMA
5207 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5212 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5208 void __user *buffer, size_t *length, loff_t *ppos) 5213 void __user *buffer, size_t *length, loff_t *ppos)
5209 { 5214 {
5210 struct zone *zone; 5215 struct zone *zone;
5211 int rc; 5216 int rc;
5212 5217
5213 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5218 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5214 if (rc) 5219 if (rc)
5215 return rc; 5220 return rc;
5216 5221
5217 for_each_zone(zone) 5222 for_each_zone(zone)
5218 zone->min_unmapped_pages = (zone->present_pages * 5223 zone->min_unmapped_pages = (zone->present_pages *
5219 sysctl_min_unmapped_ratio) / 100; 5224 sysctl_min_unmapped_ratio) / 100;
5220 return 0; 5225 return 0;
5221 } 5226 }
5222 5227
5223 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5228 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5224 void __user *buffer, size_t *length, loff_t *ppos) 5229 void __user *buffer, size_t *length, loff_t *ppos)
5225 { 5230 {
5226 struct zone *zone; 5231 struct zone *zone;
5227 int rc; 5232 int rc;
5228 5233
5229 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5234 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5230 if (rc) 5235 if (rc)
5231 return rc; 5236 return rc;
5232 5237
5233 for_each_zone(zone) 5238 for_each_zone(zone)
5234 zone->min_slab_pages = (zone->present_pages * 5239 zone->min_slab_pages = (zone->present_pages *
5235 sysctl_min_slab_ratio) / 100; 5240 sysctl_min_slab_ratio) / 100;
5236 return 0; 5241 return 0;
5237 } 5242 }
5238 #endif 5243 #endif
5239 5244
5240 /* 5245 /*
5241 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5246 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5242 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5247 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5243 * whenever sysctl_lowmem_reserve_ratio changes. 5248 * whenever sysctl_lowmem_reserve_ratio changes.
5244 * 5249 *
5245 * The reserve ratio obviously has absolutely no relation with the 5250 * The reserve ratio obviously has absolutely no relation with the
5246 * minimum watermarks. The lowmem reserve ratio can only make sense 5251 * minimum watermarks. The lowmem reserve ratio can only make sense
5247 * as a function of the boot-time zone sizes. 5252 * as a function of the boot-time zone sizes.
5248 */ 5253 */
5249 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5254 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5250 void __user *buffer, size_t *length, loff_t *ppos) 5255 void __user *buffer, size_t *length, loff_t *ppos)
5251 { 5256 {
5252 proc_dointvec_minmax(table, write, buffer, length, ppos); 5257 proc_dointvec_minmax(table, write, buffer, length, ppos);
5253 setup_per_zone_lowmem_reserve(); 5258 setup_per_zone_lowmem_reserve();
5254 return 0; 5259 return 0;
5255 } 5260 }
5256 5261
5257 /* 5262 /*
5258 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5263 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5259 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 5264 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
5260 * can have before it gets flushed back to the buddy allocator. 5265 * can have before it gets flushed back to the buddy allocator.
5261 */ 5266 */
5262 5267
5263 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5268 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5264 void __user *buffer, size_t *length, loff_t *ppos) 5269 void __user *buffer, size_t *length, loff_t *ppos)
5265 { 5270 {
5266 struct zone *zone; 5271 struct zone *zone;
5267 unsigned int cpu; 5272 unsigned int cpu;
5268 int ret; 5273 int ret;
5269 5274
5270 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5275 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5271 if (!write || (ret < 0)) 5276 if (!write || (ret < 0))
5272 return ret; 5277 return ret;
5273 for_each_populated_zone(zone) { 5278 for_each_populated_zone(zone) {
5274 for_each_possible_cpu(cpu) { 5279 for_each_possible_cpu(cpu) {
5275 unsigned long high; 5280 unsigned long high;
5276 high = zone->present_pages / percpu_pagelist_fraction; 5281 high = zone->present_pages / percpu_pagelist_fraction;
5277 setup_pagelist_highmark( 5282 setup_pagelist_highmark(
5278 per_cpu_ptr(zone->pageset, cpu), high); 5283 per_cpu_ptr(zone->pageset, cpu), high);
5279 } 5284 }
5280 } 5285 }
5281 return 0; 5286 return 0;
5282 } 5287 }
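As a worked example (the fraction and zone size are invented): writing 100 to /proc/sys/vm/percpu_pagelist_fraction on a zone with 262144 present pages (1GB with 4KB pages) makes this handler call setup_pagelist_highmark() with high = 262144 / 100 = 2621 pages, roughly 10MB, for every possible CPU's hot pagelist in that zone.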
5283 5288
5284 int hashdist = HASHDIST_DEFAULT; 5289 int hashdist = HASHDIST_DEFAULT;
5285 5290
5286 #ifdef CONFIG_NUMA 5291 #ifdef CONFIG_NUMA
5287 static int __init set_hashdist(char *str) 5292 static int __init set_hashdist(char *str)
5288 { 5293 {
5289 if (!str) 5294 if (!str)
5290 return 0; 5295 return 0;
5291 hashdist = simple_strtoul(str, &str, 0); 5296 hashdist = simple_strtoul(str, &str, 0);
5292 return 1; 5297 return 1;
5293 } 5298 }
5294 __setup("hashdist=", set_hashdist); 5299 __setup("hashdist=", set_hashdist);
5295 #endif 5300 #endif
5296 5301
5297 /* 5302 /*
5298 * allocate a large system hash table from bootmem 5303 * allocate a large system hash table from bootmem
5299 * - it is assumed that the hash table must contain an exact power-of-2 5304 * - it is assumed that the hash table must contain an exact power-of-2
5300 * quantity of entries 5305 * quantity of entries
5301 * - limit is the number of hash buckets, not the total allocation size 5306 * - limit is the number of hash buckets, not the total allocation size
5302 */ 5307 */
5303 void *__init alloc_large_system_hash(const char *tablename, 5308 void *__init alloc_large_system_hash(const char *tablename,
5304 unsigned long bucketsize, 5309 unsigned long bucketsize,
5305 unsigned long numentries, 5310 unsigned long numentries,
5306 int scale, 5311 int scale,
5307 int flags, 5312 int flags,
5308 unsigned int *_hash_shift, 5313 unsigned int *_hash_shift,
5309 unsigned int *_hash_mask, 5314 unsigned int *_hash_mask,
5310 unsigned long low_limit, 5315 unsigned long low_limit,
5311 unsigned long high_limit) 5316 unsigned long high_limit)
5312 { 5317 {
5313 unsigned long long max = high_limit; 5318 unsigned long long max = high_limit;
5314 unsigned long log2qty, size; 5319 unsigned long log2qty, size;
5315 void *table = NULL; 5320 void *table = NULL;
5316 5321
5317 /* allow the kernel cmdline to have a say */ 5322 /* allow the kernel cmdline to have a say */
5318 if (!numentries) { 5323 if (!numentries) {
5319 /* round applicable memory size up to nearest megabyte */ 5324 /* round applicable memory size up to nearest megabyte */
5320 numentries = nr_kernel_pages; 5325 numentries = nr_kernel_pages;
5321 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 5326 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
5322 numentries >>= 20 - PAGE_SHIFT; 5327 numentries >>= 20 - PAGE_SHIFT;
5323 numentries <<= 20 - PAGE_SHIFT; 5328 numentries <<= 20 - PAGE_SHIFT;
5324 5329
5325 /* limit to 1 bucket per 2^scale bytes of low memory */ 5330 /* limit to 1 bucket per 2^scale bytes of low memory */
5326 if (scale > PAGE_SHIFT) 5331 if (scale > PAGE_SHIFT)
5327 numentries >>= (scale - PAGE_SHIFT); 5332 numentries >>= (scale - PAGE_SHIFT);
5328 else 5333 else
5329 numentries <<= (PAGE_SHIFT - scale); 5334 numentries <<= (PAGE_SHIFT - scale);
5330 5335
5331 /* Make sure we've got at least a 0-order allocation.. */ 5336 /* Make sure we've got at least a 0-order allocation.. */
5332 if (unlikely(flags & HASH_SMALL)) { 5337 if (unlikely(flags & HASH_SMALL)) {
5333 /* Makes no sense without HASH_EARLY */ 5338 /* Makes no sense without HASH_EARLY */
5334 WARN_ON(!(flags & HASH_EARLY)); 5339 WARN_ON(!(flags & HASH_EARLY));
5335 if (!(numentries >> *_hash_shift)) { 5340 if (!(numentries >> *_hash_shift)) {
5336 numentries = 1UL << *_hash_shift; 5341 numentries = 1UL << *_hash_shift;
5337 BUG_ON(!numentries); 5342 BUG_ON(!numentries);
5338 } 5343 }
5339 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5344 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5340 numentries = PAGE_SIZE / bucketsize; 5345 numentries = PAGE_SIZE / bucketsize;
5341 } 5346 }
5342 numentries = roundup_pow_of_two(numentries); 5347 numentries = roundup_pow_of_two(numentries);
5343 5348
5344 /* limit allocation size to 1/16 total memory by default */ 5349 /* limit allocation size to 1/16 total memory by default */
5345 if (max == 0) { 5350 if (max == 0) {
5346 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5351 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5347 do_div(max, bucketsize); 5352 do_div(max, bucketsize);
5348 } 5353 }
5349 max = min(max, 0x80000000ULL); 5354 max = min(max, 0x80000000ULL);
5350 5355
5351 if (numentries < low_limit) 5356 if (numentries < low_limit)
5352 numentries = low_limit; 5357 numentries = low_limit;
5353 if (numentries > max) 5358 if (numentries > max)
5354 numentries = max; 5359 numentries = max;
5355 5360
5356 log2qty = ilog2(numentries); 5361 log2qty = ilog2(numentries);
5357 5362
5358 do { 5363 do {
5359 size = bucketsize << log2qty; 5364 size = bucketsize << log2qty;
5360 if (flags & HASH_EARLY) 5365 if (flags & HASH_EARLY)
5361 table = alloc_bootmem_nopanic(size); 5366 table = alloc_bootmem_nopanic(size);
5362 else if (hashdist) 5367 else if (hashdist)
5363 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5368 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5364 else { 5369 else {
5365 /* 5370 /*
5366 * If bucketsize is not a power-of-two, we may free 5371 * If bucketsize is not a power-of-two, we may free
5367 * some pages at the end of hash table which 5372 * some pages at the end of hash table which
5368 * alloc_pages_exact() automatically does 5373 * alloc_pages_exact() automatically does
5369 */ 5374 */
5370 if (get_order(size) < MAX_ORDER) { 5375 if (get_order(size) < MAX_ORDER) {
5371 table = alloc_pages_exact(size, GFP_ATOMIC); 5376 table = alloc_pages_exact(size, GFP_ATOMIC);
5372 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5377 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5373 } 5378 }
5374 } 5379 }
5375 } while (!table && size > PAGE_SIZE && --log2qty); 5380 } while (!table && size > PAGE_SIZE && --log2qty);
5376 5381
5377 if (!table) 5382 if (!table)
5378 panic("Failed to allocate %s hash table\n", tablename); 5383 panic("Failed to allocate %s hash table\n", tablename);
5379 5384
5380 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5385 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5381 tablename, 5386 tablename,
5382 (1UL << log2qty), 5387 (1UL << log2qty),
5383 ilog2(size) - PAGE_SHIFT, 5388 ilog2(size) - PAGE_SHIFT,
5384 size); 5389 size);
5385 5390
5386 if (_hash_shift) 5391 if (_hash_shift)
5387 *_hash_shift = log2qty; 5392 *_hash_shift = log2qty;
5388 if (_hash_mask) 5393 if (_hash_mask)
5389 *_hash_mask = (1 << log2qty) - 1; 5394 *_hash_mask = (1 << log2qty) - 1;
5390 5395
5391 return table; 5396 return table;
5392 } 5397 }
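To see how the sizing above plays out, here is a standalone sketch of the arithmetic for a hypothetical table (every input value is invented, and this is a simplification rather than the kernel function itself; the HASH_SMALL case, the low_limit/max clamping and the retry-on-failure loop are skipped):

#include <stdio.h>

/* local helper for the sketch: round up to the next power of two */
static unsigned long round_up_pow2(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned long page_shift = 12;			/* 4KB pages */
	unsigned long nr_kernel_pages = 1UL << 20;	/* ~4GB of lowmem, invented */
	unsigned long bucketsize = 64;			/* bytes per bucket, invented */
	int scale = 14;					/* 1 bucket per 16KB, invented */

	/* round applicable memory size up to the nearest megabyte */
	unsigned long numentries = nr_kernel_pages;
	numentries += (1UL << (20 - page_shift)) - 1;
	numentries >>= 20 - page_shift;
	numentries <<= 20 - page_shift;

	/* limit to 1 bucket per 2^scale bytes of low memory */
	if (scale > (int)page_shift)
		numentries >>= (scale - page_shift);
	else
		numentries <<= (page_shift - scale);

	numentries = round_up_pow2(numentries);

	unsigned long log2qty = 0;
	while ((1UL << (log2qty + 1)) <= numentries)
		log2qty++;

	printf("entries: %lu (2^%lu), table size: %lu bytes\n",
	       numentries, log2qty, bucketsize << log2qty);	/* 262144, 2^18, 16MB */
	return 0;
}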
5393 5398
5394 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5399 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5395 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5400 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5396 unsigned long pfn) 5401 unsigned long pfn)
5397 { 5402 {
5398 #ifdef CONFIG_SPARSEMEM 5403 #ifdef CONFIG_SPARSEMEM
5399 return __pfn_to_section(pfn)->pageblock_flags; 5404 return __pfn_to_section(pfn)->pageblock_flags;
5400 #else 5405 #else
5401 return zone->pageblock_flags; 5406 return zone->pageblock_flags;
5402 #endif /* CONFIG_SPARSEMEM */ 5407 #endif /* CONFIG_SPARSEMEM */
5403 } 5408 }
5404 5409
5405 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5410 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5406 { 5411 {
5407 #ifdef CONFIG_SPARSEMEM 5412 #ifdef CONFIG_SPARSEMEM
5408 pfn &= (PAGES_PER_SECTION-1); 5413 pfn &= (PAGES_PER_SECTION-1);
5409 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5414 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5410 #else 5415 #else
5411 pfn = pfn - zone->zone_start_pfn; 5416 pfn = pfn - zone->zone_start_pfn;
5412 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5417 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5413 #endif /* CONFIG_SPARSEMEM */ 5418 #endif /* CONFIG_SPARSEMEM */
5414 } 5419 }
5415 5420
5416 /** 5421 /**
5417 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 5422 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
5418 * @page: The page within the block of interest 5423 * @page: The page within the block of interest
5419 * @start_bitidx: The first bit of interest to retrieve 5424 * @start_bitidx: The first bit of interest to retrieve
5420 * @end_bitidx: The last bit of interest 5425 * @end_bitidx: The last bit of interest
5421 * returns pageblock_bits flags 5426 * returns pageblock_bits flags
5422 */ 5427 */
5423 unsigned long get_pageblock_flags_group(struct page *page, 5428 unsigned long get_pageblock_flags_group(struct page *page,
5424 int start_bitidx, int end_bitidx) 5429 int start_bitidx, int end_bitidx)
5425 { 5430 {
5426 struct zone *zone; 5431 struct zone *zone;
5427 unsigned long *bitmap; 5432 unsigned long *bitmap;
5428 unsigned long pfn, bitidx; 5433 unsigned long pfn, bitidx;
5429 unsigned long flags = 0; 5434 unsigned long flags = 0;
5430 unsigned long value = 1; 5435 unsigned long value = 1;
5431 5436
5432 zone = page_zone(page); 5437 zone = page_zone(page);
5433 pfn = page_to_pfn(page); 5438 pfn = page_to_pfn(page);
5434 bitmap = get_pageblock_bitmap(zone, pfn); 5439 bitmap = get_pageblock_bitmap(zone, pfn);
5435 bitidx = pfn_to_bitidx(zone, pfn); 5440 bitidx = pfn_to_bitidx(zone, pfn);
5436 5441
5437 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5442 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5438 if (test_bit(bitidx + start_bitidx, bitmap)) 5443 if (test_bit(bitidx + start_bitidx, bitmap))
5439 flags |= value; 5444 flags |= value;
5440 5445
5441 return flags; 5446 return flags;
5442 } 5447 }
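To make the indexing concrete (the pageblock_order is illustrative; order 9 corresponds to 2MB pageblocks with 4KB pages): under CONFIG_SPARSEMEM the pfn is first masked down to its offset within its memory section, and pfn_to_bitidx() returns (pfn >> 9) * NR_PAGEBLOCK_BITS, the position of that pageblock's group of flag bits inside the section's pageblock_flags bitmap. get_pageblock_flags_group() then reads the requested bits of that group one by one, and set_pageblock_flags_group() below writes them with the non-atomic __set_bit()/__clear_bit() helpers.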
5443 5448
5444 /** 5449 /**
5445 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 5450 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
5446 * @page: The page within the block of interest 5451 * @page: The page within the block of interest
5447 * @start_bitidx: The first bit of interest 5452 * @start_bitidx: The first bit of interest
5448 * @end_bitidx: The last bit of interest 5453 * @end_bitidx: The last bit of interest
5449 * @flags: The flags to set 5454 * @flags: The flags to set
5450 */ 5455 */
5451 void set_pageblock_flags_group(struct page *page, unsigned long flags, 5456 void set_pageblock_flags_group(struct page *page, unsigned long flags,
5452 int start_bitidx, int end_bitidx) 5457 int start_bitidx, int end_bitidx)
5453 { 5458 {
5454 struct zone *zone; 5459 struct zone *zone;
5455 unsigned long *bitmap; 5460 unsigned long *bitmap;
5456 unsigned long pfn, bitidx; 5461 unsigned long pfn, bitidx;
5457 unsigned long value = 1; 5462 unsigned long value = 1;
5458 5463
5459 zone = page_zone(page); 5464 zone = page_zone(page);
5460 pfn = page_to_pfn(page); 5465 pfn = page_to_pfn(page);
5461 bitmap = get_pageblock_bitmap(zone, pfn); 5466 bitmap = get_pageblock_bitmap(zone, pfn);
5462 bitidx = pfn_to_bitidx(zone, pfn); 5467 bitidx = pfn_to_bitidx(zone, pfn);
5463 VM_BUG_ON(pfn < zone->zone_start_pfn); 5468 VM_BUG_ON(pfn < zone->zone_start_pfn);
5464 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); 5469 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5465 5470
5466 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5471 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5467 if (flags & value) 5472 if (flags & value)
5468 __set_bit(bitidx + start_bitidx, bitmap); 5473 __set_bit(bitidx + start_bitidx, bitmap);
5469 else 5474 else
5470 __clear_bit(bitidx + start_bitidx, bitmap); 5475 __clear_bit(bitidx + start_bitidx, bitmap);
5471 } 5476 }
5472 5477
5473 /* 5478 /*
5474 * This function checks whether pageblock includes unmovable pages or not. 5479 * This function checks whether pageblock includes unmovable pages or not.
5475 * If @count is not zero, it is okay to include up to @count unmovable pages 5480 * If @count is not zero, it is okay to include up to @count unmovable pages
5476 * 5481 *
5477 * A PageLRU check without isolation or the lru_lock could race, so a 5482 * A PageLRU check without isolation or the lru_lock could race, so a
5478 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5483 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5479 * expect this function to be exact. 5484 * expect this function to be exact.
5480 */ 5485 */
5481 static bool 5486 static bool
5482 __has_unmovable_pages(struct zone *zone, struct page *page, int count) 5487 __has_unmovable_pages(struct zone *zone, struct page *page, int count)
5483 { 5488 {
5484 unsigned long pfn, iter, found; 5489 unsigned long pfn, iter, found;
5485 int mt; 5490 int mt;
5486 5491
5487 /* 5492 /*
5488 * To avoid noisy data, lru_add_drain_all() should be called. 5493 * To avoid noisy data, lru_add_drain_all() should be called.
5489 * If the zone is ZONE_MOVABLE, it never contains unmovable pages. 5494 * If the zone is ZONE_MOVABLE, it never contains unmovable pages.
5490 */ 5495 */
5491 if (zone_idx(zone) == ZONE_MOVABLE) 5496 if (zone_idx(zone) == ZONE_MOVABLE)
5492 return false; 5497 return false;
5493 mt = get_pageblock_migratetype(page); 5498 mt = get_pageblock_migratetype(page);
5494 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5499 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5495 return false; 5500 return false;
5496 5501
5497 pfn = page_to_pfn(page); 5502 pfn = page_to_pfn(page);
5498 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5503 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5499 unsigned long check = pfn + iter; 5504 unsigned long check = pfn + iter;
5500 5505
5501 if (!pfn_valid_within(check)) 5506 if (!pfn_valid_within(check))
5502 continue; 5507 continue;
5503 5508
5504 page = pfn_to_page(check); 5509 page = pfn_to_page(check);
5505 /* 5510 /*
5506 * We can't use page_count without pinning the page 5511 * We can't use page_count without pinning the page
5507 * because another CPU can free the compound page. 5512 * because another CPU can free the compound page.
5508 * This check already skips compound tails of THP 5513 * This check already skips compound tails of THP
5509 * because their page->_count is zero at all times. 5514 * because their page->_count is zero at all times.
5510 */ 5515 */
5511 if (!atomic_read(&page->_count)) { 5516 if (!atomic_read(&page->_count)) {
5512 if (PageBuddy(page)) 5517 if (PageBuddy(page))
5513 iter += (1 << page_order(page)) - 1; 5518 iter += (1 << page_order(page)) - 1;
5514 continue; 5519 continue;
5515 } 5520 }
5516 5521
5517 if (!PageLRU(page)) 5522 if (!PageLRU(page))
5518 found++; 5523 found++;
5519 /* 5524 /*
5520 * If there are RECLAIMABLE pages, we need to check them too. 5525 * If there are RECLAIMABLE pages, we need to check them too.
5521 * But for now, memory offline itself doesn't call shrink_slab() 5526 * But for now, memory offline itself doesn't call shrink_slab()
5522 * and this still needs to be fixed. 5527 * and this still needs to be fixed.
5523 */ 5528 */
5524 /* 5529 /*
5525 * If the page is not RAM, page_count() should be 0, so 5530 * If the page is not RAM, page_count() should be 0, so
5526 * we don't need any further checks. This is a _used_ non-movable page. 5531 * we don't need any further checks. This is a _used_ non-movable page.
5527 * 5532 *
5528 * The problematic case here is PG_reserved pages. PG_reserved 5533 * The problematic case here is PG_reserved pages. PG_reserved
5529 * is set on both memory hole pages and _used_ kernel 5534 * is set on both memory hole pages and _used_ kernel
5530 * pages at boot. 5535 * pages at boot.
5531 */ 5536 */
5532 if (found > count) 5537 if (found > count)
5533 return true; 5538 return true;
5534 } 5539 }
5535 return false; 5540 return false;
5536 } 5541 }
5537 5542
5538 bool is_pageblock_removable_nolock(struct page *page) 5543 bool is_pageblock_removable_nolock(struct page *page)
5539 { 5544 {
5540 struct zone *zone; 5545 struct zone *zone;
5541 unsigned long pfn; 5546 unsigned long pfn;
5542 5547
5543 /* 5548 /*
5544 * We have to be careful here because we are iterating over memory 5549 * We have to be careful here because we are iterating over memory
5545 * sections which are not zone aware so we might end up outside of 5550 * sections which are not zone aware so we might end up outside of
5546 * the zone but still within the section. 5551 * the zone but still within the section.
5547 * We have to take care about the node as well. If the node is offline 5552 * We have to take care about the node as well. If the node is offline
5548 * its NODE_DATA will be NULL - see page_zone. 5553 * its NODE_DATA will be NULL - see page_zone.
5549 */ 5554 */
5550 if (!node_online(page_to_nid(page))) 5555 if (!node_online(page_to_nid(page)))
5551 return false; 5556 return false;
5552 5557
5553 zone = page_zone(page); 5558 zone = page_zone(page);
5554 pfn = page_to_pfn(page); 5559 pfn = page_to_pfn(page);
5555 if (zone->zone_start_pfn > pfn || 5560 if (zone->zone_start_pfn > pfn ||
5556 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5561 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5557 return false; 5562 return false;
5558 5563
5559 return !__has_unmovable_pages(zone, page, 0); 5564 return !__has_unmovable_pages(zone, page, 0);
5560 } 5565 }
5561 5566
5562 int set_migratetype_isolate(struct page *page) 5567 int set_migratetype_isolate(struct page *page)
5563 { 5568 {
5564 struct zone *zone; 5569 struct zone *zone;
5565 unsigned long flags, pfn; 5570 unsigned long flags, pfn;
5566 struct memory_isolate_notify arg; 5571 struct memory_isolate_notify arg;
5567 int notifier_ret; 5572 int notifier_ret;
5568 int ret = -EBUSY; 5573 int ret = -EBUSY;
5569 5574
5570 zone = page_zone(page); 5575 zone = page_zone(page);
5571 5576
5572 spin_lock_irqsave(&zone->lock, flags); 5577 spin_lock_irqsave(&zone->lock, flags);
5573 5578
5574 pfn = page_to_pfn(page); 5579 pfn = page_to_pfn(page);
5575 arg.start_pfn = pfn; 5580 arg.start_pfn = pfn;
5576 arg.nr_pages = pageblock_nr_pages; 5581 arg.nr_pages = pageblock_nr_pages;
5577 arg.pages_found = 0; 5582 arg.pages_found = 0;
5578 5583
5579 /* 5584 /*
5580 * It may be possible to isolate a pageblock even if the 5585 * It may be possible to isolate a pageblock even if the
5581 * migratetype is not MIGRATE_MOVABLE. The memory isolation 5586 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5582 * notifier chain is used by balloon drivers to return the 5587 * notifier chain is used by balloon drivers to return the
5583 * number of pages in a range that are held by the balloon 5588 * number of pages in a range that are held by the balloon
5584 * driver to shrink memory. If all the pages are accounted for 5589 * driver to shrink memory. If all the pages are accounted for
5585 * by balloons, are free, or on the LRU, isolation can continue. 5590 * by balloons, are free, or on the LRU, isolation can continue.
5586 * Later, for example, when memory hotplug notifier runs, these 5591 * Later, for example, when memory hotplug notifier runs, these
5587 * pages reported as "can be isolated" should be isolated (freed) 5592 * pages reported as "can be isolated" should be isolated (freed)
5588 * by the balloon driver through the memory notifier chain. 5593 * by the balloon driver through the memory notifier chain.
5589 */ 5594 */
5590 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5595 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5591 notifier_ret = notifier_to_errno(notifier_ret); 5596 notifier_ret = notifier_to_errno(notifier_ret);
5592 if (notifier_ret) 5597 if (notifier_ret)
5593 goto out; 5598 goto out;
5594 /* 5599 /*
5595 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 5600 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5596 * We just check MOVABLE pages. 5601 * We just check MOVABLE pages.
5597 */ 5602 */
5598 if (!__has_unmovable_pages(zone, page, arg.pages_found)) 5603 if (!__has_unmovable_pages(zone, page, arg.pages_found))
5599 ret = 0; 5604 ret = 0;
5600 /* 5605 /*
5601 * Unmovable means "not-on-LRU" pages. If there are more unmovable 5606 * Unmovable means "not-on-LRU" pages. If there are more unmovable
5602 * pages than removable-by-driver pages reported by the notifier, 5607 * pages than removable-by-driver pages reported by the notifier,
5603 * we fail. 5608 * we fail.
5604 */ 5609 */
5605 5610
5606 out: 5611 out:
5607 if (!ret) { 5612 if (!ret) {
5608 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5613 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5609 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5614 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5610 } 5615 }
5611 5616
5612 spin_unlock_irqrestore(&zone->lock, flags); 5617 spin_unlock_irqrestore(&zone->lock, flags);
5613 if (!ret) 5618 if (!ret)
5614 drain_all_pages(); 5619 drain_all_pages();
5615 return ret; 5620 return ret;
5616 } 5621 }
5617 5622
5618 void unset_migratetype_isolate(struct page *page, unsigned migratetype) 5623 void unset_migratetype_isolate(struct page *page, unsigned migratetype)
5619 { 5624 {
5620 struct zone *zone; 5625 struct zone *zone;
5621 unsigned long flags; 5626 unsigned long flags;
5622 zone = page_zone(page); 5627 zone = page_zone(page);
5623 spin_lock_irqsave(&zone->lock, flags); 5628 spin_lock_irqsave(&zone->lock, flags);
5624 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 5629 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
5625 goto out; 5630 goto out;
5626 set_pageblock_migratetype(page, migratetype); 5631 set_pageblock_migratetype(page, migratetype);
5627 move_freepages_block(zone, page, migratetype); 5632 move_freepages_block(zone, page, migratetype);
5628 out: 5633 out:
5629 spin_unlock_irqrestore(&zone->lock, flags); 5634 spin_unlock_irqrestore(&zone->lock, flags);
5630 } 5635 }
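Editor's note: set_migratetype_isolate() and unset_migratetype_isolate() are meant to be used as a pair around whatever work needs a pageblock kept away from the allocator. The sketch below is a minimal, hypothetical caller, not code from this commit; the function name and the MIGRATE_MOVABLE restore type are assumptions for illustration.

/* Hypothetical sketch: isolate one pageblock, do some work, restore it. */
static int with_isolated_pageblock(struct page *page)
{
        int ret;

        /* Fails with -EBUSY if the block holds too many unmovable pages. */
        ret = set_migratetype_isolate(page);
        if (ret)
                return ret;

        /* ... migrate, offline or otherwise drain the pageblock here ... */

        /* Put the block back; MIGRATE_MOVABLE is an assumed original type. */
        unset_migratetype_isolate(page, MIGRATE_MOVABLE);
        return 0;
}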
5631 5636
5632 #ifdef CONFIG_CMA 5637 #ifdef CONFIG_CMA
5633 5638
5634 static unsigned long pfn_max_align_down(unsigned long pfn) 5639 static unsigned long pfn_max_align_down(unsigned long pfn)
5635 { 5640 {
5636 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 5641 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
5637 pageblock_nr_pages) - 1); 5642 pageblock_nr_pages) - 1);
5638 } 5643 }
5639 5644
5640 static unsigned long pfn_max_align_up(unsigned long pfn) 5645 static unsigned long pfn_max_align_up(unsigned long pfn)
5641 { 5646 {
5642 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 5647 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
5643 pageblock_nr_pages)); 5648 pageblock_nr_pages));
5644 } 5649 }
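Editor's note: the two helpers above simply round a PFN down or up to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages. The self-contained userspace sketch below re-expresses that arithmetic on plain integers; the example step sizes (1024 and 512 pages) are assumptions for illustration and depend on the kernel configuration in practice.

#include <stdio.h>

#define EXAMPLE_MAX_ORDER_NR_PAGES      1024UL  /* assumed, e.g. MAX_ORDER 11 */
#define EXAMPLE_PAGEBLOCK_NR_PAGES       512UL  /* assumed pageblock size */

static unsigned long step(void)
{
        return EXAMPLE_MAX_ORDER_NR_PAGES > EXAMPLE_PAGEBLOCK_NR_PAGES ?
               EXAMPLE_MAX_ORDER_NR_PAGES : EXAMPLE_PAGEBLOCK_NR_PAGES;
}

/* Mirrors pfn_max_align_down(): clear the low bits covered by the step mask. */
static unsigned long max_align_down(unsigned long pfn)
{
        return pfn & ~(step() - 1);
}

/* Mirrors pfn_max_align_up() / ALIGN(): round up to the next step boundary. */
static unsigned long max_align_up(unsigned long pfn)
{
        return (pfn + step() - 1) & ~(step() - 1);
}

int main(void)
{
        /* With a 1024-page step, PFN 3000 rounds down to 2048 and up to 3072. */
        printf("%lu %lu\n", max_align_down(3000), max_align_up(3000));
        return 0;
}

This aligned-down/aligned-up pair is exactly the range that start_isolate_page_range() and undo_isolate_page_range() are handed in alloc_contig_range() below, even when the requested [start, end) is unaligned.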
5645 5650
5646 static struct page * 5651 static struct page *
5647 __alloc_contig_migrate_alloc(struct page *page, unsigned long private, 5652 __alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5648 int **resultp) 5653 int **resultp)
5649 { 5654 {
5650 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; 5655 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5651 5656
5652 if (PageHighMem(page)) 5657 if (PageHighMem(page))
5653 gfp_mask |= __GFP_HIGHMEM; 5658 gfp_mask |= __GFP_HIGHMEM;
5654 5659
5655 return alloc_page(gfp_mask); 5660 return alloc_page(gfp_mask);
5656 } 5661 }
5657 5662
5658 /* [start, end) must belong to a single zone. */ 5663 /* [start, end) must belong to a single zone. */
5659 static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) 5664 static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5660 { 5665 {
5661 /* This function is based on compact_zone() from compaction.c. */ 5666 /* This function is based on compact_zone() from compaction.c. */
5662 5667
5663 unsigned long pfn = start; 5668 unsigned long pfn = start;
5664 unsigned int tries = 0; 5669 unsigned int tries = 0;
5665 int ret = 0; 5670 int ret = 0;
5666 5671
5667 struct compact_control cc = { 5672 struct compact_control cc = {
5668 .nr_migratepages = 0, 5673 .nr_migratepages = 0,
5669 .order = -1, 5674 .order = -1,
5670 .zone = page_zone(pfn_to_page(start)), 5675 .zone = page_zone(pfn_to_page(start)),
5671 .sync = true, 5676 .sync = true,
5672 }; 5677 };
5673 INIT_LIST_HEAD(&cc.migratepages); 5678 INIT_LIST_HEAD(&cc.migratepages);
5674 5679
5675 migrate_prep_local(); 5680 migrate_prep_local();
5676 5681
5677 while (pfn < end || !list_empty(&cc.migratepages)) { 5682 while (pfn < end || !list_empty(&cc.migratepages)) {
5678 if (fatal_signal_pending(current)) { 5683 if (fatal_signal_pending(current)) {
5679 ret = -EINTR; 5684 ret = -EINTR;
5680 break; 5685 break;
5681 } 5686 }
5682 5687
5683 if (list_empty(&cc.migratepages)) { 5688 if (list_empty(&cc.migratepages)) {
5684 cc.nr_migratepages = 0; 5689 cc.nr_migratepages = 0;
5685 pfn = isolate_migratepages_range(cc.zone, &cc, 5690 pfn = isolate_migratepages_range(cc.zone, &cc,
5686 pfn, end); 5691 pfn, end);
5687 if (!pfn) { 5692 if (!pfn) {
5688 ret = -EINTR; 5693 ret = -EINTR;
5689 break; 5694 break;
5690 } 5695 }
5691 tries = 0; 5696 tries = 0;
5692 } else if (++tries == 5) { 5697 } else if (++tries == 5) {
5693 ret = ret < 0 ? ret : -EBUSY; 5698 ret = ret < 0 ? ret : -EBUSY;
5694 break; 5699 break;
5695 } 5700 }
5696 5701
5697 ret = migrate_pages(&cc.migratepages, 5702 ret = migrate_pages(&cc.migratepages,
5698 __alloc_contig_migrate_alloc, 5703 __alloc_contig_migrate_alloc,
5699 0, false, MIGRATE_SYNC); 5704 0, false, MIGRATE_SYNC);
5700 } 5705 }
5701 5706
5702 putback_lru_pages(&cc.migratepages); 5707 putback_lru_pages(&cc.migratepages);
5703 return ret > 0 ? 0 : ret; 5708 return ret > 0 ? 0 : ret;
5704 } 5709 }
5705 5710
5706 /* 5711 /*
5707 * Update zone's cma pages counter used for watermark level calculation. 5712 * Update zone's cma pages counter used for watermark level calculation.
5708 */ 5713 */
5709 static inline void __update_cma_watermarks(struct zone *zone, int count) 5714 static inline void __update_cma_watermarks(struct zone *zone, int count)
5710 { 5715 {
5711 unsigned long flags; 5716 unsigned long flags;
5712 spin_lock_irqsave(&zone->lock, flags); 5717 spin_lock_irqsave(&zone->lock, flags);
5713 zone->min_cma_pages += count; 5718 zone->min_cma_pages += count;
5714 spin_unlock_irqrestore(&zone->lock, flags); 5719 spin_unlock_irqrestore(&zone->lock, flags);
5715 setup_per_zone_wmarks(); 5720 setup_per_zone_wmarks();
5716 } 5721 }
5717 5722
5718 /* 5723 /*
5719 * Trigger memory pressure bump to reclaim some pages in order to be able to 5724 * Trigger memory pressure bump to reclaim some pages in order to be able to
5720 * allocate 'count' pages in single page units. Does work similar to 5725 * allocate 'count' pages in single page units. Does work similar to
5721 * the __alloc_pages_slowpath() function. 5726 * the __alloc_pages_slowpath() function.
5722 */ 5727 */
5723 static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) 5728 static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5724 { 5729 {
5725 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 5730 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5726 struct zonelist *zonelist = node_zonelist(0, gfp_mask); 5731 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5727 int did_some_progress = 0; 5732 int did_some_progress = 0;
5728 int order = 1; 5733 int order = 1;
5729 5734
5730 /* 5735 /*
5731 * Raise the watermarks to force kswapd to do its job and 5736 * Raise the watermarks to force kswapd to do its job and
5732 * stabilise at the new watermark level. 5737 * stabilise at the new watermark level.
5733 */ 5738 */
5734 __update_cma_watermarks(zone, count); 5739 __update_cma_watermarks(zone, count);
5735 5740
5736 /* Obey watermarks as if the page was being allocated */ 5741 /* Obey watermarks as if the page was being allocated */
5737 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { 5742 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5738 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); 5743 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5739 5744
5740 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 5745 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5741 NULL); 5746 NULL);
5742 if (!did_some_progress) { 5747 if (!did_some_progress) {
5743 /* Exhausted what can be done so it's blamo time */ 5748 /* Exhausted what can be done so it's blamo time */
5744 out_of_memory(zonelist, gfp_mask, order, NULL, false); 5749 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5745 } 5750 }
5746 } 5751 }
5747 5752
5748 /* Restore original watermark levels. */ 5753 /* Restore original watermark levels. */
5749 __update_cma_watermarks(zone, -count); 5754 __update_cma_watermarks(zone, -count);
5750 5755
5751 return count; 5756 return count;
5752 } 5757 }
5753 5758
5754 /** 5759 /**
5755 * alloc_contig_range() -- tries to allocate given range of pages 5760 * alloc_contig_range() -- tries to allocate given range of pages
5756 * @start: start PFN to allocate 5761 * @start: start PFN to allocate
5757 * @end: one-past-the-last PFN to allocate 5762 * @end: one-past-the-last PFN to allocate
5758 * @migratetype: migratetype of the underlying pageblocks (either 5763 * @migratetype: migratetype of the underlying pageblocks (either
5759 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 5764 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
5760 * in range must have the same migratetype and it must 5765 * in range must have the same migratetype and it must
5761 * be either of the two. 5766 * be either of the two.
5762 * 5767 *
5763 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 5768 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
5764 * aligned, however it's the caller's responsibility to guarantee that 5769 * aligned, however it's the caller's responsibility to guarantee that
5765 * we are the only thread that changes migrate type of pageblocks the 5770 * we are the only thread that changes migrate type of pageblocks the
5766 * pages fall in. 5771 * pages fall in.
5767 * 5772 *
5768 * The PFN range must belong to a single zone. 5773 * The PFN range must belong to a single zone.
5769 * 5774 *
5770 * Returns zero on success or negative error code. On success all 5775 * Returns zero on success or negative error code. On success all
5771 * pages which PFN is in [start, end) are allocated for the caller and 5776 * pages which PFN is in [start, end) are allocated for the caller and
5772 * need to be freed with free_contig_range(). 5777 * need to be freed with free_contig_range().
5773 */ 5778 */
5774 int alloc_contig_range(unsigned long start, unsigned long end, 5779 int alloc_contig_range(unsigned long start, unsigned long end,
5775 unsigned migratetype) 5780 unsigned migratetype)
5776 { 5781 {
5777 struct zone *zone = page_zone(pfn_to_page(start)); 5782 struct zone *zone = page_zone(pfn_to_page(start));
5778 unsigned long outer_start, outer_end; 5783 unsigned long outer_start, outer_end;
5779 int ret = 0, order; 5784 int ret = 0, order;
5780 5785
5781 /* 5786 /*
5782 * What we do here is we mark all pageblocks in range as 5787 * What we do here is we mark all pageblocks in range as
5783 * MIGRATE_ISOLATE. Because pageblock and max order pages may 5788 * MIGRATE_ISOLATE. Because pageblock and max order pages may
5784 * have different sizes, and due to the way the page allocator 5789 * have different sizes, and due to the way the page allocator
5785 * works, we align the range to the larger of the two so 5790 * works, we align the range to the larger of the two so
5786 * that the page allocator won't try to merge buddies from 5791 * that the page allocator won't try to merge buddies from
5787 * different pageblocks and change MIGRATE_ISOLATE to some 5792 * different pageblocks and change MIGRATE_ISOLATE to some
5788 * other migration type. 5793 * other migration type.
5789 * 5794 *
5790 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 5795 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
5791 * migrate the pages from an unaligned range (ie. pages that 5796 * migrate the pages from an unaligned range (ie. pages that
5792 * we are interested in). This will put all the pages in 5797 * we are interested in). This will put all the pages in
5793 * range back to page allocator as MIGRATE_ISOLATE. 5798 * range back to page allocator as MIGRATE_ISOLATE.
5794 * 5799 *
5795 * When this is done, we take the pages in range from page 5800 * When this is done, we take the pages in range from page
5796 * allocator removing them from the buddy system. This way 5801 * allocator removing them from the buddy system. This way
5797 * page allocator will never consider using them. 5802 * page allocator will never consider using them.
5798 * 5803 *
5799 * This lets us mark the pageblocks back as 5804 * This lets us mark the pageblocks back as
5800 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 5805 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
5801 * aligned range but not in the unaligned, original range are 5806 * aligned range but not in the unaligned, original range are
5802 * put back to page allocator so that buddy can use them. 5807 * put back to page allocator so that buddy can use them.
5803 */ 5808 */
5804 5809
5805 ret = start_isolate_page_range(pfn_max_align_down(start), 5810 ret = start_isolate_page_range(pfn_max_align_down(start),
5806 pfn_max_align_up(end), migratetype); 5811 pfn_max_align_up(end), migratetype);
5807 if (ret) 5812 if (ret)
5808 goto done; 5813 goto done;
5809 5814
5810 ret = __alloc_contig_migrate_range(start, end); 5815 ret = __alloc_contig_migrate_range(start, end);
5811 if (ret) 5816 if (ret)
5812 goto done; 5817 goto done;
5813 5818
5814 /* 5819 /*
5815 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 5820 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
5816 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 5821 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
5817 * more, all pages in [start, end) are free in page allocator. 5822 * more, all pages in [start, end) are free in page allocator.
5818 * What we are going to do is to allocate all pages from 5823 * What we are going to do is to allocate all pages from
5819 * [start, end) (that is remove them from page allocator). 5824 * [start, end) (that is remove them from page allocator).
5820 * 5825 *
5821 * The only problem is that pages at the beginning and at the 5826 * The only problem is that pages at the beginning and at the
5822 * end of the range of interest may not be aligned with the pages the 5827 * end of the range of interest may not be aligned with the pages the
5823 * page allocator holds, i.e. they can be part of higher-order 5828 * page allocator holds, i.e. they can be part of higher-order
5824 * pages. Because of this, we reserve the bigger range and 5829 * pages. Because of this, we reserve the bigger range and
5825 * once this is done free the pages we are not interested in. 5830 * once this is done free the pages we are not interested in.
5826 * 5831 *
5827 * We don't have to hold zone->lock here because the pages are 5832 * We don't have to hold zone->lock here because the pages are
5828 * isolated thus they won't get removed from buddy. 5833 * isolated thus they won't get removed from buddy.
5829 */ 5834 */
5830 5835
5831 lru_add_drain_all(); 5836 lru_add_drain_all();
5832 drain_all_pages(); 5837 drain_all_pages();
5833 5838
5834 order = 0; 5839 order = 0;
5835 outer_start = start; 5840 outer_start = start;
5836 while (!PageBuddy(pfn_to_page(outer_start))) { 5841 while (!PageBuddy(pfn_to_page(outer_start))) {
5837 if (++order >= MAX_ORDER) { 5842 if (++order >= MAX_ORDER) {
5838 ret = -EBUSY; 5843 ret = -EBUSY;
5839 goto done; 5844 goto done;
5840 } 5845 }
5841 outer_start &= ~0UL << order; 5846 outer_start &= ~0UL << order;
5842 } 5847 }
5843 5848
5844 /* Make sure the range is really isolated. */ 5849 /* Make sure the range is really isolated. */
5845 if (test_pages_isolated(outer_start, end)) { 5850 if (test_pages_isolated(outer_start, end)) {
5846 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5851 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5847 outer_start, end); 5852 outer_start, end);
5848 ret = -EBUSY; 5853 ret = -EBUSY;
5849 goto done; 5854 goto done;
5850 } 5855 }
5851 5856
5852 /* 5857 /*
5853 * Reclaim enough pages to make sure that contiguous allocation 5858 * Reclaim enough pages to make sure that contiguous allocation
5854 * will not starve the system. 5859 * will not starve the system.
5855 */ 5860 */
5856 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); 5861 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5857 5862
5858 /* Grab isolated pages from freelists. */ 5863 /* Grab isolated pages from freelists. */
5859 outer_end = isolate_freepages_range(outer_start, end); 5864 outer_end = isolate_freepages_range(outer_start, end);
5860 if (!outer_end) { 5865 if (!outer_end) {
5861 ret = -EBUSY; 5866 ret = -EBUSY;
5862 goto done; 5867 goto done;
5863 } 5868 }
5864 5869
5865 /* Free head and tail (if any) */ 5870 /* Free head and tail (if any) */
5866 if (start != outer_start) 5871 if (start != outer_start)
5867 free_contig_range(outer_start, start - outer_start); 5872 free_contig_range(outer_start, start - outer_start);
5868 if (end != outer_end) 5873 if (end != outer_end)
5869 free_contig_range(end, outer_end - end); 5874 free_contig_range(end, outer_end - end);
5870 5875
5871 done: 5876 done:
5872 undo_isolate_page_range(pfn_max_align_down(start), 5877 undo_isolate_page_range(pfn_max_align_down(start),
5873 pfn_max_align_up(end), migratetype); 5878 pfn_max_align_up(end), migratetype);
5874 return ret; 5879 return ret;
5875 } 5880 }
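Editor's note: the docblock above spells out the contract; for readability, here is a minimal, hypothetical caller pairing alloc_contig_range() with free_contig_range(). The helper names, the PFN values and the choice of MIGRATE_CMA are assumptions for illustration only, not code from this commit.

/* Hypothetical sketch: grab and release 1024 physically contiguous pages. */
static struct page *grab_contig_block(unsigned long base_pfn)
{
        unsigned long nr = 1024;

        /* All pageblocks in [base_pfn, base_pfn + nr) are assumed MIGRATE_CMA. */
        if (alloc_contig_range(base_pfn, base_pfn + nr, MIGRATE_CMA))
                return NULL;

        return pfn_to_page(base_pfn);
}

static void release_contig_block(unsigned long base_pfn)
{
        /* Must mirror the allocation exactly. */
        free_contig_range(base_pfn, 1024);
}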
5876 5881
5877 void free_contig_range(unsigned long pfn, unsigned nr_pages) 5882 void free_contig_range(unsigned long pfn, unsigned nr_pages)
5878 { 5883 {
5879 for (; nr_pages--; ++pfn) 5884 for (; nr_pages--; ++pfn)
5880 __free_page(pfn_to_page(pfn)); 5885 __free_page(pfn_to_page(pfn));
5881 } 5886 }
5882 #endif 5887 #endif
5883 5888
5884 #ifdef CONFIG_MEMORY_HOTREMOVE 5889 #ifdef CONFIG_MEMORY_HOTREMOVE
5885 /* 5890 /*
5886 * All pages in the range must be isolated before calling this. 5891 * All pages in the range must be isolated before calling this.
5887 */ 5892 */
5888 void 5893 void
5889 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 5894 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5890 { 5895 {
5891 struct page *page; 5896 struct page *page;
5892 struct zone *zone; 5897 struct zone *zone;
5893 int order, i; 5898 int order, i;
5894 unsigned long pfn; 5899 unsigned long pfn;
5895 unsigned long flags; 5900 unsigned long flags;
5896 /* find the first valid pfn */ 5901 /* find the first valid pfn */
5897 for (pfn = start_pfn; pfn < end_pfn; pfn++) 5902 for (pfn = start_pfn; pfn < end_pfn; pfn++)
5898 if (pfn_valid(pfn)) 5903 if (pfn_valid(pfn))
5899 break; 5904 break;
5900 if (pfn == end_pfn) 5905 if (pfn == end_pfn)
5901 return; 5906 return;
5902 zone = page_zone(pfn_to_page(pfn)); 5907 zone = page_zone(pfn_to_page(pfn));
5903 spin_lock_irqsave(&zone->lock, flags); 5908 spin_lock_irqsave(&zone->lock, flags);
5904 pfn = start_pfn; 5909 pfn = start_pfn;
5905 while (pfn < end_pfn) { 5910 while (pfn < end_pfn) {
5906 if (!pfn_valid(pfn)) { 5911 if (!pfn_valid(pfn)) {
5907 pfn++; 5912 pfn++;
5908 continue; 5913 continue;
5909 } 5914 }
5910 page = pfn_to_page(pfn); 5915 page = pfn_to_page(pfn);
5911 BUG_ON(page_count(page)); 5916 BUG_ON(page_count(page));
5912 BUG_ON(!PageBuddy(page)); 5917 BUG_ON(!PageBuddy(page));
5913 order = page_order(page); 5918 order = page_order(page);
5914 #ifdef CONFIG_DEBUG_VM 5919 #ifdef CONFIG_DEBUG_VM
5915 printk(KERN_INFO "remove from free list %lx %d %lx\n", 5920 printk(KERN_INFO "remove from free list %lx %d %lx\n",
5916 pfn, 1 << order, end_pfn); 5921 pfn, 1 << order, end_pfn);
5917 #endif 5922 #endif
5918 list_del(&page->lru); 5923 list_del(&page->lru);
5919 rmv_page_order(page); 5924 rmv_page_order(page);
5920 zone->free_area[order].nr_free--; 5925 zone->free_area[order].nr_free--;
5921 __mod_zone_page_state(zone, NR_FREE_PAGES, 5926 __mod_zone_page_state(zone, NR_FREE_PAGES,
5922 - (1UL << order)); 5927 - (1UL << order));
5923 for (i = 0; i < (1 << order); i++) 5928 for (i = 0; i < (1 << order); i++)
5924 SetPageReserved((page+i)); 5929 SetPageReserved((page+i));
5925 pfn += (1 << order); 5930 pfn += (1 << order);
5926 } 5931 }
5927 spin_unlock_irqrestore(&zone->lock, flags); 5932 spin_unlock_irqrestore(&zone->lock, flags);
5928 } 5933 }
5929 #endif 5934 #endif
5930 5935
5931 #ifdef CONFIG_MEMORY_FAILURE 5936 #ifdef CONFIG_MEMORY_FAILURE
5932 bool is_free_buddy_page(struct page *page) 5937 bool is_free_buddy_page(struct page *page)
5933 { 5938 {
5934 struct zone *zone = page_zone(page); 5939 struct zone *zone = page_zone(page);
5935 unsigned long pfn = page_to_pfn(page); 5940 unsigned long pfn = page_to_pfn(page);
5936 unsigned long flags; 5941 unsigned long flags;
5937 int order; 5942 int order;
5938 5943
5939 spin_lock_irqsave(&zone->lock, flags); 5944 spin_lock_irqsave(&zone->lock, flags);
5940 for (order = 0; order < MAX_ORDER; order++) { 5945 for (order = 0; order < MAX_ORDER; order++) {
5941 struct page *page_head = page - (pfn & ((1 << order) - 1)); 5946 struct page *page_head = page - (pfn & ((1 << order) - 1));
5942 5947
5943 if (PageBuddy(page_head) && page_order(page_head) >= order) 5948 if (PageBuddy(page_head) && page_order(page_head) >= order)
5944 break; 5949 break;
5945 } 5950 }
5946 spin_unlock_irqrestore(&zone->lock, flags); 5951 spin_unlock_irqrestore(&zone->lock, flags);
5947 5952
5948 return order < MAX_ORDER; 5953 return order < MAX_ORDER;
5949 } 5954 }
5950 #endif 5955 #endif
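Editor's note: is_free_buddy_page() relies on free buddy blocks starting at order-aligned PFNs, so the candidate block head is found by clearing the low `order` bits of the PFN. The self-contained sketch below shows that arithmetic in isolation; the sample PFN is an arbitrary illustration.

#include <stdio.h>

/* Mirrors "page - (pfn & ((1 << order) - 1))" above, on raw PFNs
 * instead of struct page pointers. */
static unsigned long buddy_head_pfn(unsigned long pfn, int order)
{
        return pfn & ~((1UL << order) - 1);
}

int main(void)
{
        unsigned long pfn = 0x12345;    /* arbitrary example PFN */
        int order;

        /* Candidate heads for orders 0..3: 0x12345, 0x12344, 0x12344, 0x12340 */
        for (order = 0; order <= 3; order++)
                printf("order %d -> head pfn %#lx\n",
                       order, buddy_head_pfn(pfn, order));
        return 0;
}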
5951 5956
5952 static const struct trace_print_flags pageflag_names[] = { 5957 static const struct trace_print_flags pageflag_names[] = {
5953 {1UL << PG_locked, "locked" }, 5958 {1UL << PG_locked, "locked" },
5954 {1UL << PG_error, "error" }, 5959 {1UL << PG_error, "error" },
5955 {1UL << PG_referenced, "referenced" }, 5960 {1UL << PG_referenced, "referenced" },
5956 {1UL << PG_uptodate, "uptodate" }, 5961 {1UL << PG_uptodate, "uptodate" },
5957 {1UL << PG_dirty, "dirty" }, 5962 {1UL << PG_dirty, "dirty" },
5958 {1UL << PG_lru, "lru" }, 5963 {1UL << PG_lru, "lru" },
5959 {1UL << PG_active, "active" }, 5964 {1UL << PG_active, "active" },
5960 {1UL << PG_slab, "slab" }, 5965 {1UL << PG_slab, "slab" },
5961 {1UL << PG_owner_priv_1, "owner_priv_1" }, 5966 {1UL << PG_owner_priv_1, "owner_priv_1" },
5962 {1UL << PG_arch_1, "arch_1" }, 5967 {1UL << PG_arch_1, "arch_1" },
5963 {1UL << PG_reserved, "reserved" }, 5968 {1UL << PG_reserved, "reserved" },
5964 {1UL << PG_private, "private" }, 5969 {1UL << PG_private, "private" },
5965 {1UL << PG_private_2, "private_2" }, 5970 {1UL << PG_private_2, "private_2" },
5966 {1UL << PG_writeback, "writeback" }, 5971 {1UL << PG_writeback, "writeback" },
5967 #ifdef CONFIG_PAGEFLAGS_EXTENDED 5972 #ifdef CONFIG_PAGEFLAGS_EXTENDED
5968 {1UL << PG_head, "head" }, 5973 {1UL << PG_head, "head" },
5969 {1UL << PG_tail, "tail" }, 5974 {1UL << PG_tail, "tail" },
5970 #else 5975 #else
5971 {1UL << PG_compound, "compound" }, 5976 {1UL << PG_compound, "compound" },
5972 #endif 5977 #endif
5973 {1UL << PG_swapcache, "swapcache" }, 5978 {1UL << PG_swapcache, "swapcache" },
5974 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5979 {1UL << PG_mappedtodisk, "mappedtodisk" },
5975 {1UL << PG_reclaim, "reclaim" }, 5980 {1UL << PG_reclaim, "reclaim" },
5976 {1UL << PG_swapbacked, "swapbacked" }, 5981 {1UL << PG_swapbacked, "swapbacked" },
5977 {1UL << PG_unevictable, "unevictable" }, 5982 {1UL << PG_unevictable, "unevictable" },
5978 #ifdef CONFIG_MMU 5983 #ifdef CONFIG_MMU
5979 {1UL << PG_mlocked, "mlocked" }, 5984 {1UL << PG_mlocked, "mlocked" },
5980 #endif 5985 #endif
5981 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 5986 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
5982 {1UL << PG_uncached, "uncached" }, 5987 {1UL << PG_uncached, "uncached" },
5983 #endif 5988 #endif
5984 #ifdef CONFIG_MEMORY_FAILURE 5989 #ifdef CONFIG_MEMORY_FAILURE
5985 {1UL << PG_hwpoison, "hwpoison" }, 5990 {1UL << PG_hwpoison, "hwpoison" },
5986 #endif 5991 #endif
5987 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5992 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5988 {1UL << PG_compound_lock, "compound_lock" }, 5993 {1UL << PG_compound_lock, "compound_lock" },
5989 #endif 5994 #endif
5990 }; 5995 };
5991 5996
5992 static void dump_page_flags(unsigned long flags) 5997 static void dump_page_flags(unsigned long flags)
5993 { 5998 {
5994 const char *delim = ""; 5999 const char *delim = "";
5995 unsigned long mask; 6000 unsigned long mask;
5996 int i; 6001 int i;
5997 6002
5998 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6003 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
5999 6004
6000 printk(KERN_ALERT "page flags: %#lx(", flags); 6005 printk(KERN_ALERT "page flags: %#lx(", flags);
6001 6006
6002 /* remove zone id */ 6007 /* remove zone id */
6003 flags &= (1UL << NR_PAGEFLAGS) - 1; 6008 flags &= (1UL << NR_PAGEFLAGS) - 1;
6004 6009
6005 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6010 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6006 6011
6007 mask = pageflag_names[i].mask; 6012 mask = pageflag_names[i].mask;
6008 if ((flags & mask) != mask) 6013 if ((flags & mask) != mask)
6009 continue; 6014 continue;
6010 6015
6011 flags &= ~mask; 6016 flags &= ~mask;
6012 printk("%s%s", delim, pageflag_names[i].name); 6017 printk("%s%s", delim, pageflag_names[i].name);
6013 delim = "|"; 6018 delim = "|";
6014 } 6019 }
6015 6020
6016 /* check for left over flags */ 6021 /* check for left over flags */
6017 if (flags) 6022 if (flags)
6018 printk("%s%#lx", delim, flags); 6023 printk("%s%#lx", delim, flags);
6019 6024
6020 printk(")\n"); 6025 printk(")\n");
6021 } 6026 }
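Editor's note: the mask-plus-name-table pattern used by dump_page_flags() is easy to lift out of the kernel. The self-contained userspace sketch below applies the same technique to a made-up flags word; the flag names and bit positions are illustrative stand-ins, not the real PG_* layout.

#include <stdio.h>

struct flag_name {
        unsigned long mask;
        const char *name;
};

/* Illustrative table only; the kernel's comes from enum pageflags. */
static const struct flag_name names[] = {
        { 1UL << 0, "locked" },
        { 1UL << 1, "dirty"  },
        { 1UL << 2, "lru"    },
};

static void dump_flags(unsigned long flags)
{
        const char *delim = "";
        size_t i;

        printf("flags: %#lx(", flags);
        for (i = 0; i < sizeof(names) / sizeof(names[0]) && flags; i++) {
                if ((flags & names[i].mask) != names[i].mask)
                        continue;
                flags &= ~names[i].mask;
                printf("%s%s", delim, names[i].name);
                delim = "|";
        }
        if (flags)                      /* anything not in the table */
                printf("%s%#lx", delim, flags);
        printf(")\n");
}

int main(void)
{
        dump_flags((1UL << 0) | (1UL << 2) | (1UL << 7));
        /* prints: flags: 0x85(locked|lru|0x80) */
        return 0;
}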
6022 6027
6023 void dump_page(struct page *page) 6028 void dump_page(struct page *page)
6024 { 6029 {
6025 printk(KERN_ALERT 6030 printk(KERN_ALERT
6026 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6031 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6027 page, atomic_read(&page->_count), page_mapcount(page), 6032 page, atomic_read(&page->_count), page_mapcount(page),
6028 page->mapping, page->index); 6033 page->mapping, page->index);
6029 dump_page_flags(page->flags); 6034 dump_page_flags(page->flags);
6030 mem_cgroup_print_bad_page(page); 6035 mem_cgroup_print_bad_page(page);
6031 } 6036 }
6032 6037