Commit 2a6f512412c7aecd04134721ea392cc496e6c017

Authored by Srinivas Pandruvada
Committed by Linus Torvalds
1 parent ffb22af5b7

CMA: make putback_lru_pages() call conditional

As per the documentation and the other callers of putback_lru_pages(),
putback_lru_pages() should be called on error only.  Make the CMA code
behave consistently.

[akpm@linux-foundation.org: remove a test-n-branch in the wrapup code]
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
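
The changed hunk itself is not part of the portion of the diff shown below. As a rough, hedged illustration of the calling convention the commit message describes, here is a minimal self-contained C sketch: do_migrate() and putback_to_lru() are hypothetical stand-ins, not the kernel's migrate_pages()/putback_lru_pages() signatures; only the control flow (put back on error only) is the point.

/*
 * Sketch only: models "call the putback helper on error only".
 * Names and failure condition are illustrative assumptions.
 */
#include <stdio.h>

struct isolated_list {
	int nr_pages;		/* pages isolated from the LRU, awaiting migration */
};

/* Pretend migration: 0 on success, negative error code on failure. */
static int do_migrate(struct isolated_list *list)
{
	if (list->nr_pages > 8)		/* arbitrary stand-in failure condition */
		return -1;
	list->nr_pages = 0;		/* on success every page was moved away */
	return 0;
}

/* Pretend putback: return still-isolated pages to the LRU lists. */
static void putback_to_lru(struct isolated_list *list)
{
	printf("putting back %d pages\n", list->nr_pages);
	list->nr_pages = 0;
}

/* The pattern the patch moves the CMA path to: putback only on error. */
static int migrate_range(struct isolated_list *list)
{
	int ret = do_migrate(list);

	if (ret < 0) {
		putback_to_lru(list);
		return ret;
	}
	return 0;
}

int main(void)
{
	struct isolated_list small = { .nr_pages = 4 };
	struct isolated_list large = { .nr_pages = 16 };

	printf("small range -> %d\n", migrate_range(&small));	/* succeeds, no putback */
	printf("large range -> %d\n", migrate_range(&large));	/* fails, putback runs */
	return 0;
}

On success the migration path has already drained the list, so as the commit message reads the contract, the putback step is only meaningful in the error branch; the patch makes the CMA path follow that convention like the other callers.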

Showing 1 changed file, mm/page_alloc.c, with 5 additions and 3 deletions:

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/migrate.h> 59 #include <linux/migrate.h>
60 #include <linux/page-debug-flags.h> 60 #include <linux/page-debug-flags.h>
61 #include <linux/sched/rt.h> 61 #include <linux/sched/rt.h>
62 62
63 #include <asm/tlbflush.h> 63 #include <asm/tlbflush.h>
64 #include <asm/div64.h> 64 #include <asm/div64.h>
65 #include "internal.h" 65 #include "internal.h"
66 66
67 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 67 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
68 DEFINE_PER_CPU(int, numa_node); 68 DEFINE_PER_CPU(int, numa_node);
69 EXPORT_PER_CPU_SYMBOL(numa_node); 69 EXPORT_PER_CPU_SYMBOL(numa_node);
70 #endif 70 #endif
71 71
72 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 72 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
73 /* 73 /*
74 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 74 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
75 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 75 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
76 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 76 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
77 * defined in <linux/topology.h>. 77 * defined in <linux/topology.h>.
78 */ 78 */
79 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 79 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
80 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 80 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
81 #endif 81 #endif
82 82
83 /* 83 /*
84 * Array of node states. 84 * Array of node states.
85 */ 85 */
86 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 86 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
87 [N_POSSIBLE] = NODE_MASK_ALL, 87 [N_POSSIBLE] = NODE_MASK_ALL,
88 [N_ONLINE] = { { [0] = 1UL } }, 88 [N_ONLINE] = { { [0] = 1UL } },
89 #ifndef CONFIG_NUMA 89 #ifndef CONFIG_NUMA
90 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 90 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
91 #ifdef CONFIG_HIGHMEM 91 #ifdef CONFIG_HIGHMEM
92 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 92 [N_HIGH_MEMORY] = { { [0] = 1UL } },
93 #endif 93 #endif
94 #ifdef CONFIG_MOVABLE_NODE 94 #ifdef CONFIG_MOVABLE_NODE
95 [N_MEMORY] = { { [0] = 1UL } }, 95 [N_MEMORY] = { { [0] = 1UL } },
96 #endif 96 #endif
97 [N_CPU] = { { [0] = 1UL } }, 97 [N_CPU] = { { [0] = 1UL } },
98 #endif /* NUMA */ 98 #endif /* NUMA */
99 }; 99 };
100 EXPORT_SYMBOL(node_states); 100 EXPORT_SYMBOL(node_states);
101 101
102 unsigned long totalram_pages __read_mostly; 102 unsigned long totalram_pages __read_mostly;
103 unsigned long totalreserve_pages __read_mostly; 103 unsigned long totalreserve_pages __read_mostly;
104 /* 104 /*
105 * When calculating the number of globally allowed dirty pages, there 105 * When calculating the number of globally allowed dirty pages, there
106 * is a certain number of per-zone reserves that should not be 106 * is a certain number of per-zone reserves that should not be
107 * considered dirtyable memory. This is the sum of those reserves 107 * considered dirtyable memory. This is the sum of those reserves
108 * over all existing zones that contribute dirtyable memory. 108 * over all existing zones that contribute dirtyable memory.
109 */ 109 */
110 unsigned long dirty_balance_reserve __read_mostly; 110 unsigned long dirty_balance_reserve __read_mostly;
111 111
112 int percpu_pagelist_fraction; 112 int percpu_pagelist_fraction;
113 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 113 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
114 114
115 #ifdef CONFIG_PM_SLEEP 115 #ifdef CONFIG_PM_SLEEP
116 /* 116 /*
117 * The following functions are used by the suspend/hibernate code to temporarily 117 * The following functions are used by the suspend/hibernate code to temporarily
118 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 118 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
119 * while devices are suspended. To avoid races with the suspend/hibernate code, 119 * while devices are suspended. To avoid races with the suspend/hibernate code,
120 * they should always be called with pm_mutex held (gfp_allowed_mask also should 120 * they should always be called with pm_mutex held (gfp_allowed_mask also should
121 * only be modified with pm_mutex held, unless the suspend/hibernate code is 121 * only be modified with pm_mutex held, unless the suspend/hibernate code is
122 * guaranteed not to run in parallel with that modification). 122 * guaranteed not to run in parallel with that modification).
123 */ 123 */
124 124
125 static gfp_t saved_gfp_mask; 125 static gfp_t saved_gfp_mask;
126 126
127 void pm_restore_gfp_mask(void) 127 void pm_restore_gfp_mask(void)
128 { 128 {
129 WARN_ON(!mutex_is_locked(&pm_mutex)); 129 WARN_ON(!mutex_is_locked(&pm_mutex));
130 if (saved_gfp_mask) { 130 if (saved_gfp_mask) {
131 gfp_allowed_mask = saved_gfp_mask; 131 gfp_allowed_mask = saved_gfp_mask;
132 saved_gfp_mask = 0; 132 saved_gfp_mask = 0;
133 } 133 }
134 } 134 }
135 135
136 void pm_restrict_gfp_mask(void) 136 void pm_restrict_gfp_mask(void)
137 { 137 {
138 WARN_ON(!mutex_is_locked(&pm_mutex)); 138 WARN_ON(!mutex_is_locked(&pm_mutex));
139 WARN_ON(saved_gfp_mask); 139 WARN_ON(saved_gfp_mask);
140 saved_gfp_mask = gfp_allowed_mask; 140 saved_gfp_mask = gfp_allowed_mask;
141 gfp_allowed_mask &= ~GFP_IOFS; 141 gfp_allowed_mask &= ~GFP_IOFS;
142 } 142 }
143 143
144 bool pm_suspended_storage(void) 144 bool pm_suspended_storage(void)
145 { 145 {
146 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 146 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
147 return false; 147 return false;
148 return true; 148 return true;
149 } 149 }
150 #endif /* CONFIG_PM_SLEEP */ 150 #endif /* CONFIG_PM_SLEEP */
151 151
152 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 152 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
153 int pageblock_order __read_mostly; 153 int pageblock_order __read_mostly;
154 #endif 154 #endif
155 155
156 static void __free_pages_ok(struct page *page, unsigned int order); 156 static void __free_pages_ok(struct page *page, unsigned int order);
157 157
158 /* 158 /*
159 * results with 256, 32 in the lowmem_reserve sysctl: 159 * results with 256, 32 in the lowmem_reserve sysctl:
160 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 160 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
161 * 1G machine -> (16M dma, 784M normal, 224M high) 161 * 1G machine -> (16M dma, 784M normal, 224M high)
162 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 162 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
163 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 163 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
164 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 164 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
165 * 165 *
166 * TBD: should special case ZONE_DMA32 machines here - in those we normally 166 * TBD: should special case ZONE_DMA32 machines here - in those we normally
167 * don't need any ZONE_NORMAL reservation 167 * don't need any ZONE_NORMAL reservation
168 */ 168 */
169 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 169 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
170 #ifdef CONFIG_ZONE_DMA 170 #ifdef CONFIG_ZONE_DMA
171 256, 171 256,
172 #endif 172 #endif
173 #ifdef CONFIG_ZONE_DMA32 173 #ifdef CONFIG_ZONE_DMA32
174 256, 174 256,
175 #endif 175 #endif
176 #ifdef CONFIG_HIGHMEM 176 #ifdef CONFIG_HIGHMEM
177 32, 177 32,
178 #endif 178 #endif
179 32, 179 32,
180 }; 180 };
181 181
182 EXPORT_SYMBOL(totalram_pages); 182 EXPORT_SYMBOL(totalram_pages);
183 183
184 static char * const zone_names[MAX_NR_ZONES] = { 184 static char * const zone_names[MAX_NR_ZONES] = {
185 #ifdef CONFIG_ZONE_DMA 185 #ifdef CONFIG_ZONE_DMA
186 "DMA", 186 "DMA",
187 #endif 187 #endif
188 #ifdef CONFIG_ZONE_DMA32 188 #ifdef CONFIG_ZONE_DMA32
189 "DMA32", 189 "DMA32",
190 #endif 190 #endif
191 "Normal", 191 "Normal",
192 #ifdef CONFIG_HIGHMEM 192 #ifdef CONFIG_HIGHMEM
193 "HighMem", 193 "HighMem",
194 #endif 194 #endif
195 "Movable", 195 "Movable",
196 }; 196 };
197 197
198 int min_free_kbytes = 1024; 198 int min_free_kbytes = 1024;
199 199
200 static unsigned long __meminitdata nr_kernel_pages; 200 static unsigned long __meminitdata nr_kernel_pages;
201 static unsigned long __meminitdata nr_all_pages; 201 static unsigned long __meminitdata nr_all_pages;
202 static unsigned long __meminitdata dma_reserve; 202 static unsigned long __meminitdata dma_reserve;
203 203
204 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 204 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
205 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 205 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
206 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 206 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
207 static unsigned long __initdata required_kernelcore; 207 static unsigned long __initdata required_kernelcore;
208 static unsigned long __initdata required_movablecore; 208 static unsigned long __initdata required_movablecore;
209 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 209 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
210 210
211 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 211 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
212 int movable_zone; 212 int movable_zone;
213 EXPORT_SYMBOL(movable_zone); 213 EXPORT_SYMBOL(movable_zone);
214 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 214 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
215 215
216 #if MAX_NUMNODES > 1 216 #if MAX_NUMNODES > 1
217 int nr_node_ids __read_mostly = MAX_NUMNODES; 217 int nr_node_ids __read_mostly = MAX_NUMNODES;
218 int nr_online_nodes __read_mostly = 1; 218 int nr_online_nodes __read_mostly = 1;
219 EXPORT_SYMBOL(nr_node_ids); 219 EXPORT_SYMBOL(nr_node_ids);
220 EXPORT_SYMBOL(nr_online_nodes); 220 EXPORT_SYMBOL(nr_online_nodes);
221 #endif 221 #endif
222 222
223 int page_group_by_mobility_disabled __read_mostly; 223 int page_group_by_mobility_disabled __read_mostly;
224 224
225 void set_pageblock_migratetype(struct page *page, int migratetype) 225 void set_pageblock_migratetype(struct page *page, int migratetype)
226 { 226 {
227 227
228 if (unlikely(page_group_by_mobility_disabled)) 228 if (unlikely(page_group_by_mobility_disabled))
229 migratetype = MIGRATE_UNMOVABLE; 229 migratetype = MIGRATE_UNMOVABLE;
230 230
231 set_pageblock_flags_group(page, (unsigned long)migratetype, 231 set_pageblock_flags_group(page, (unsigned long)migratetype,
232 PB_migrate, PB_migrate_end); 232 PB_migrate, PB_migrate_end);
233 } 233 }
234 234
235 bool oom_killer_disabled __read_mostly; 235 bool oom_killer_disabled __read_mostly;
236 236
237 #ifdef CONFIG_DEBUG_VM 237 #ifdef CONFIG_DEBUG_VM
238 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 238 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
239 { 239 {
240 int ret = 0; 240 int ret = 0;
241 unsigned seq; 241 unsigned seq;
242 unsigned long pfn = page_to_pfn(page); 242 unsigned long pfn = page_to_pfn(page);
243 243
244 do { 244 do {
245 seq = zone_span_seqbegin(zone); 245 seq = zone_span_seqbegin(zone);
246 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 246 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
247 ret = 1; 247 ret = 1;
248 else if (pfn < zone->zone_start_pfn) 248 else if (pfn < zone->zone_start_pfn)
249 ret = 1; 249 ret = 1;
250 } while (zone_span_seqretry(zone, seq)); 250 } while (zone_span_seqretry(zone, seq));
251 251
252 return ret; 252 return ret;
253 } 253 }
254 254
255 static int page_is_consistent(struct zone *zone, struct page *page) 255 static int page_is_consistent(struct zone *zone, struct page *page)
256 { 256 {
257 if (!pfn_valid_within(page_to_pfn(page))) 257 if (!pfn_valid_within(page_to_pfn(page)))
258 return 0; 258 return 0;
259 if (zone != page_zone(page)) 259 if (zone != page_zone(page))
260 return 0; 260 return 0;
261 261
262 return 1; 262 return 1;
263 } 263 }
264 /* 264 /*
265 * Temporary debugging check for pages not lying within a given zone. 265 * Temporary debugging check for pages not lying within a given zone.
266 */ 266 */
267 static int bad_range(struct zone *zone, struct page *page) 267 static int bad_range(struct zone *zone, struct page *page)
268 { 268 {
269 if (page_outside_zone_boundaries(zone, page)) 269 if (page_outside_zone_boundaries(zone, page))
270 return 1; 270 return 1;
271 if (!page_is_consistent(zone, page)) 271 if (!page_is_consistent(zone, page))
272 return 1; 272 return 1;
273 273
274 return 0; 274 return 0;
275 } 275 }
276 #else 276 #else
277 static inline int bad_range(struct zone *zone, struct page *page) 277 static inline int bad_range(struct zone *zone, struct page *page)
278 { 278 {
279 return 0; 279 return 0;
280 } 280 }
281 #endif 281 #endif
282 282
283 static void bad_page(struct page *page) 283 static void bad_page(struct page *page)
284 { 284 {
285 static unsigned long resume; 285 static unsigned long resume;
286 static unsigned long nr_shown; 286 static unsigned long nr_shown;
287 static unsigned long nr_unshown; 287 static unsigned long nr_unshown;
288 288
289 /* Don't complain about poisoned pages */ 289 /* Don't complain about poisoned pages */
290 if (PageHWPoison(page)) { 290 if (PageHWPoison(page)) {
291 reset_page_mapcount(page); /* remove PageBuddy */ 291 reset_page_mapcount(page); /* remove PageBuddy */
292 return; 292 return;
293 } 293 }
294 294
295 /* 295 /*
296 * Allow a burst of 60 reports, then keep quiet for that minute; 296 * Allow a burst of 60 reports, then keep quiet for that minute;
297 * or allow a steady drip of one report per second. 297 * or allow a steady drip of one report per second.
298 */ 298 */
299 if (nr_shown == 60) { 299 if (nr_shown == 60) {
300 if (time_before(jiffies, resume)) { 300 if (time_before(jiffies, resume)) {
301 nr_unshown++; 301 nr_unshown++;
302 goto out; 302 goto out;
303 } 303 }
304 if (nr_unshown) { 304 if (nr_unshown) {
305 printk(KERN_ALERT 305 printk(KERN_ALERT
306 "BUG: Bad page state: %lu messages suppressed\n", 306 "BUG: Bad page state: %lu messages suppressed\n",
307 nr_unshown); 307 nr_unshown);
308 nr_unshown = 0; 308 nr_unshown = 0;
309 } 309 }
310 nr_shown = 0; 310 nr_shown = 0;
311 } 311 }
312 if (nr_shown++ == 0) 312 if (nr_shown++ == 0)
313 resume = jiffies + 60 * HZ; 313 resume = jiffies + 60 * HZ;
314 314
315 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 315 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
316 current->comm, page_to_pfn(page)); 316 current->comm, page_to_pfn(page));
317 dump_page(page); 317 dump_page(page);
318 318
319 print_modules(); 319 print_modules();
320 dump_stack(); 320 dump_stack();
321 out: 321 out:
322 /* Leave bad fields for debug, except PageBuddy could make trouble */ 322 /* Leave bad fields for debug, except PageBuddy could make trouble */
323 reset_page_mapcount(page); /* remove PageBuddy */ 323 reset_page_mapcount(page); /* remove PageBuddy */
324 add_taint(TAINT_BAD_PAGE); 324 add_taint(TAINT_BAD_PAGE);
325 } 325 }
326 326
327 /* 327 /*
328 * Higher-order pages are called "compound pages". They are structured thusly: 328 * Higher-order pages are called "compound pages". They are structured thusly:
329 * 329 *
330 * The first PAGE_SIZE page is called the "head page". 330 * The first PAGE_SIZE page is called the "head page".
331 * 331 *
332 * The remaining PAGE_SIZE pages are called "tail pages". 332 * The remaining PAGE_SIZE pages are called "tail pages".
333 * 333 *
334 * All pages have PG_compound set. All tail pages have their ->first_page 334 * All pages have PG_compound set. All tail pages have their ->first_page
335 * pointing at the head page. 335 * pointing at the head page.
336 * 336 *
337 * The first tail page's ->lru.next holds the address of the compound page's 337 * The first tail page's ->lru.next holds the address of the compound page's
338 * put_page() function. Its ->lru.prev holds the order of allocation. 338 * put_page() function. Its ->lru.prev holds the order of allocation.
339 * This usage means that zero-order pages may not be compound. 339 * This usage means that zero-order pages may not be compound.
340 */ 340 */
341 341
342 static void free_compound_page(struct page *page) 342 static void free_compound_page(struct page *page)
343 { 343 {
344 __free_pages_ok(page, compound_order(page)); 344 __free_pages_ok(page, compound_order(page));
345 } 345 }
346 346
347 void prep_compound_page(struct page *page, unsigned long order) 347 void prep_compound_page(struct page *page, unsigned long order)
348 { 348 {
349 int i; 349 int i;
350 int nr_pages = 1 << order; 350 int nr_pages = 1 << order;
351 351
352 set_compound_page_dtor(page, free_compound_page); 352 set_compound_page_dtor(page, free_compound_page);
353 set_compound_order(page, order); 353 set_compound_order(page, order);
354 __SetPageHead(page); 354 __SetPageHead(page);
355 for (i = 1; i < nr_pages; i++) { 355 for (i = 1; i < nr_pages; i++) {
356 struct page *p = page + i; 356 struct page *p = page + i;
357 __SetPageTail(p); 357 __SetPageTail(p);
358 set_page_count(p, 0); 358 set_page_count(p, 0);
359 p->first_page = page; 359 p->first_page = page;
360 } 360 }
361 } 361 }
362 362
363 /* update __split_huge_page_refcount if you change this function */ 363 /* update __split_huge_page_refcount if you change this function */
364 static int destroy_compound_page(struct page *page, unsigned long order) 364 static int destroy_compound_page(struct page *page, unsigned long order)
365 { 365 {
366 int i; 366 int i;
367 int nr_pages = 1 << order; 367 int nr_pages = 1 << order;
368 int bad = 0; 368 int bad = 0;
369 369
370 if (unlikely(compound_order(page) != order)) { 370 if (unlikely(compound_order(page) != order)) {
371 bad_page(page); 371 bad_page(page);
372 bad++; 372 bad++;
373 } 373 }
374 374
375 __ClearPageHead(page); 375 __ClearPageHead(page);
376 376
377 for (i = 1; i < nr_pages; i++) { 377 for (i = 1; i < nr_pages; i++) {
378 struct page *p = page + i; 378 struct page *p = page + i;
379 379
380 if (unlikely(!PageTail(p) || (p->first_page != page))) { 380 if (unlikely(!PageTail(p) || (p->first_page != page))) {
381 bad_page(page); 381 bad_page(page);
382 bad++; 382 bad++;
383 } 383 }
384 __ClearPageTail(p); 384 __ClearPageTail(p);
385 } 385 }
386 386
387 return bad; 387 return bad;
388 } 388 }
389 389
390 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 390 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
391 { 391 {
392 int i; 392 int i;
393 393
394 /* 394 /*
395 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 395 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
396 * and __GFP_HIGHMEM from hard or soft interrupt context. 396 * and __GFP_HIGHMEM from hard or soft interrupt context.
397 */ 397 */
398 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 398 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
399 for (i = 0; i < (1 << order); i++) 399 for (i = 0; i < (1 << order); i++)
400 clear_highpage(page + i); 400 clear_highpage(page + i);
401 } 401 }
402 402
403 #ifdef CONFIG_DEBUG_PAGEALLOC 403 #ifdef CONFIG_DEBUG_PAGEALLOC
404 unsigned int _debug_guardpage_minorder; 404 unsigned int _debug_guardpage_minorder;
405 405
406 static int __init debug_guardpage_minorder_setup(char *buf) 406 static int __init debug_guardpage_minorder_setup(char *buf)
407 { 407 {
408 unsigned long res; 408 unsigned long res;
409 409
410 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 410 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
411 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 411 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
412 return 0; 412 return 0;
413 } 413 }
414 _debug_guardpage_minorder = res; 414 _debug_guardpage_minorder = res;
415 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 415 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
416 return 0; 416 return 0;
417 } 417 }
418 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 418 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
419 419
420 static inline void set_page_guard_flag(struct page *page) 420 static inline void set_page_guard_flag(struct page *page)
421 { 421 {
422 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 422 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
423 } 423 }
424 424
425 static inline void clear_page_guard_flag(struct page *page) 425 static inline void clear_page_guard_flag(struct page *page)
426 { 426 {
427 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 427 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
428 } 428 }
429 #else 429 #else
430 static inline void set_page_guard_flag(struct page *page) { } 430 static inline void set_page_guard_flag(struct page *page) { }
431 static inline void clear_page_guard_flag(struct page *page) { } 431 static inline void clear_page_guard_flag(struct page *page) { }
432 #endif 432 #endif
433 433
434 static inline void set_page_order(struct page *page, int order) 434 static inline void set_page_order(struct page *page, int order)
435 { 435 {
436 set_page_private(page, order); 436 set_page_private(page, order);
437 __SetPageBuddy(page); 437 __SetPageBuddy(page);
438 } 438 }
439 439
440 static inline void rmv_page_order(struct page *page) 440 static inline void rmv_page_order(struct page *page)
441 { 441 {
442 __ClearPageBuddy(page); 442 __ClearPageBuddy(page);
443 set_page_private(page, 0); 443 set_page_private(page, 0);
444 } 444 }
445 445
446 /* 446 /*
447 * Locate the struct page for both the matching buddy in our 447 * Locate the struct page for both the matching buddy in our
448 * pair (buddy1) and the combined O(n+1) page they form (page). 448 * pair (buddy1) and the combined O(n+1) page they form (page).
449 * 449 *
450 * 1) Any buddy B1 will have an order O twin B2 which satisfies 450 * 1) Any buddy B1 will have an order O twin B2 which satisfies
451 * the following equation: 451 * the following equation:
452 * B2 = B1 ^ (1 << O) 452 * B2 = B1 ^ (1 << O)
453 * For example, if the starting buddy (buddy2) is #8 its order 453 * For example, if the starting buddy (buddy2) is #8 its order
454 * 1 buddy is #10: 454 * 1 buddy is #10:
455 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 455 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
456 * 456 *
457 * 2) Any buddy B will have an order O+1 parent P which 457 * 2) Any buddy B will have an order O+1 parent P which
458 * satisfies the following equation: 458 * satisfies the following equation:
459 * P = B & ~(1 << O) 459 * P = B & ~(1 << O)
460 * 460 *
461 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 461 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
462 */ 462 */
463 static inline unsigned long 463 static inline unsigned long
464 __find_buddy_index(unsigned long page_idx, unsigned int order) 464 __find_buddy_index(unsigned long page_idx, unsigned int order)
465 { 465 {
466 return page_idx ^ (1 << order); 466 return page_idx ^ (1 << order);
467 } 467 }
468 468
469 /* 469 /*
470 * This function checks whether a page is free && is the buddy 470 * This function checks whether a page is free && is the buddy
471 * we can do coalesce a page and its buddy if 471 * we can do coalesce a page and its buddy if
472 * (a) the buddy is not in a hole && 472 * (a) the buddy is not in a hole &&
473 * (b) the buddy is in the buddy system && 473 * (b) the buddy is in the buddy system &&
474 * (c) a page and its buddy have the same order && 474 * (c) a page and its buddy have the same order &&
475 * (d) a page and its buddy are in the same zone. 475 * (d) a page and its buddy are in the same zone.
476 * 476 *
477 * For recording whether a page is in the buddy system, we set ->_mapcount -2. 477 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
478 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. 478 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
479 * 479 *
480 * For recording page's order, we use page_private(page). 480 * For recording page's order, we use page_private(page).
481 */ 481 */
482 static inline int page_is_buddy(struct page *page, struct page *buddy, 482 static inline int page_is_buddy(struct page *page, struct page *buddy,
483 int order) 483 int order)
484 { 484 {
485 if (!pfn_valid_within(page_to_pfn(buddy))) 485 if (!pfn_valid_within(page_to_pfn(buddy)))
486 return 0; 486 return 0;
487 487
488 if (page_zone_id(page) != page_zone_id(buddy)) 488 if (page_zone_id(page) != page_zone_id(buddy))
489 return 0; 489 return 0;
490 490
491 if (page_is_guard(buddy) && page_order(buddy) == order) { 491 if (page_is_guard(buddy) && page_order(buddy) == order) {
492 VM_BUG_ON(page_count(buddy) != 0); 492 VM_BUG_ON(page_count(buddy) != 0);
493 return 1; 493 return 1;
494 } 494 }
495 495
496 if (PageBuddy(buddy) && page_order(buddy) == order) { 496 if (PageBuddy(buddy) && page_order(buddy) == order) {
497 VM_BUG_ON(page_count(buddy) != 0); 497 VM_BUG_ON(page_count(buddy) != 0);
498 return 1; 498 return 1;
499 } 499 }
500 return 0; 500 return 0;
501 } 501 }
502 502
503 /* 503 /*
504 * Freeing function for a buddy system allocator. 504 * Freeing function for a buddy system allocator.
505 * 505 *
506 * The concept of a buddy system is to maintain direct-mapped table 506 * The concept of a buddy system is to maintain direct-mapped table
507 * (containing bit values) for memory blocks of various "orders". 507 * (containing bit values) for memory blocks of various "orders".
508 * The bottom level table contains the map for the smallest allocatable 508 * The bottom level table contains the map for the smallest allocatable
509 * units of memory (here, pages), and each level above it describes 509 * units of memory (here, pages), and each level above it describes
510 * pairs of units from the levels below, hence, "buddies". 510 * pairs of units from the levels below, hence, "buddies".
511 * At a high level, all that happens here is marking the table entry 511 * At a high level, all that happens here is marking the table entry
512 * at the bottom level available, and propagating the changes upward 512 * at the bottom level available, and propagating the changes upward
513 * as necessary, plus some accounting needed to play nicely with other 513 * as necessary, plus some accounting needed to play nicely with other
514 * parts of the VM system. 514 * parts of the VM system.
515 * At each level, we keep a list of pages, which are heads of continuous 515 * At each level, we keep a list of pages, which are heads of continuous
516 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 516 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
517 * order is recorded in page_private(page) field. 517 * order is recorded in page_private(page) field.
518 * So when we are allocating or freeing one, we can derive the state of the 518 * So when we are allocating or freeing one, we can derive the state of the
519 * other. That is, if we allocate a small block, and both were 519 * other. That is, if we allocate a small block, and both were
520 * free, the remainder of the region must be split into blocks. 520 * free, the remainder of the region must be split into blocks.
521 * If a block is freed, and its buddy is also free, then this 521 * If a block is freed, and its buddy is also free, then this
522 * triggers coalescing into a block of larger size. 522 * triggers coalescing into a block of larger size.
523 * 523 *
524 * -- nyc 524 * -- nyc
525 */ 525 */
526 526
527 static inline void __free_one_page(struct page *page, 527 static inline void __free_one_page(struct page *page,
528 struct zone *zone, unsigned int order, 528 struct zone *zone, unsigned int order,
529 int migratetype) 529 int migratetype)
530 { 530 {
531 unsigned long page_idx; 531 unsigned long page_idx;
532 unsigned long combined_idx; 532 unsigned long combined_idx;
533 unsigned long uninitialized_var(buddy_idx); 533 unsigned long uninitialized_var(buddy_idx);
534 struct page *buddy; 534 struct page *buddy;
535 535
536 if (unlikely(PageCompound(page))) 536 if (unlikely(PageCompound(page)))
537 if (unlikely(destroy_compound_page(page, order))) 537 if (unlikely(destroy_compound_page(page, order)))
538 return; 538 return;
539 539
540 VM_BUG_ON(migratetype == -1); 540 VM_BUG_ON(migratetype == -1);
541 541
542 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 542 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
543 543
544 VM_BUG_ON(page_idx & ((1 << order) - 1)); 544 VM_BUG_ON(page_idx & ((1 << order) - 1));
545 VM_BUG_ON(bad_range(zone, page)); 545 VM_BUG_ON(bad_range(zone, page));
546 546
547 while (order < MAX_ORDER-1) { 547 while (order < MAX_ORDER-1) {
548 buddy_idx = __find_buddy_index(page_idx, order); 548 buddy_idx = __find_buddy_index(page_idx, order);
549 buddy = page + (buddy_idx - page_idx); 549 buddy = page + (buddy_idx - page_idx);
550 if (!page_is_buddy(page, buddy, order)) 550 if (!page_is_buddy(page, buddy, order))
551 break; 551 break;
552 /* 552 /*
553 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 553 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
554 * merge with it and move up one order. 554 * merge with it and move up one order.
555 */ 555 */
556 if (page_is_guard(buddy)) { 556 if (page_is_guard(buddy)) {
557 clear_page_guard_flag(buddy); 557 clear_page_guard_flag(buddy);
558 set_page_private(page, 0); 558 set_page_private(page, 0);
559 __mod_zone_freepage_state(zone, 1 << order, 559 __mod_zone_freepage_state(zone, 1 << order,
560 migratetype); 560 migratetype);
561 } else { 561 } else {
562 list_del(&buddy->lru); 562 list_del(&buddy->lru);
563 zone->free_area[order].nr_free--; 563 zone->free_area[order].nr_free--;
564 rmv_page_order(buddy); 564 rmv_page_order(buddy);
565 } 565 }
566 combined_idx = buddy_idx & page_idx; 566 combined_idx = buddy_idx & page_idx;
567 page = page + (combined_idx - page_idx); 567 page = page + (combined_idx - page_idx);
568 page_idx = combined_idx; 568 page_idx = combined_idx;
569 order++; 569 order++;
570 } 570 }
571 set_page_order(page, order); 571 set_page_order(page, order);
572 572
573 /* 573 /*
574 * If this is not the largest possible page, check if the buddy 574 * If this is not the largest possible page, check if the buddy
575 * of the next-highest order is free. If it is, it's possible 575 * of the next-highest order is free. If it is, it's possible
576 * that pages are being freed that will coalesce soon. In case, 576 * that pages are being freed that will coalesce soon. In case,
577 * that is happening, add the free page to the tail of the list 577 * that is happening, add the free page to the tail of the list
578 * so it's less likely to be used soon and more likely to be merged 578 * so it's less likely to be used soon and more likely to be merged
579 * as a higher order page 579 * as a higher order page
580 */ 580 */
581 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 581 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
582 struct page *higher_page, *higher_buddy; 582 struct page *higher_page, *higher_buddy;
583 combined_idx = buddy_idx & page_idx; 583 combined_idx = buddy_idx & page_idx;
584 higher_page = page + (combined_idx - page_idx); 584 higher_page = page + (combined_idx - page_idx);
585 buddy_idx = __find_buddy_index(combined_idx, order + 1); 585 buddy_idx = __find_buddy_index(combined_idx, order + 1);
586 higher_buddy = higher_page + (buddy_idx - combined_idx); 586 higher_buddy = higher_page + (buddy_idx - combined_idx);
587 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 587 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
588 list_add_tail(&page->lru, 588 list_add_tail(&page->lru,
589 &zone->free_area[order].free_list[migratetype]); 589 &zone->free_area[order].free_list[migratetype]);
590 goto out; 590 goto out;
591 } 591 }
592 } 592 }
593 593
594 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 594 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
595 out: 595 out:
596 zone->free_area[order].nr_free++; 596 zone->free_area[order].nr_free++;
597 } 597 }
598 598
599 static inline int free_pages_check(struct page *page) 599 static inline int free_pages_check(struct page *page)
600 { 600 {
601 if (unlikely(page_mapcount(page) | 601 if (unlikely(page_mapcount(page) |
602 (page->mapping != NULL) | 602 (page->mapping != NULL) |
603 (atomic_read(&page->_count) != 0) | 603 (atomic_read(&page->_count) != 0) |
604 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 604 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
605 (mem_cgroup_bad_page_check(page)))) { 605 (mem_cgroup_bad_page_check(page)))) {
606 bad_page(page); 606 bad_page(page);
607 return 1; 607 return 1;
608 } 608 }
609 reset_page_last_nid(page); 609 reset_page_last_nid(page);
610 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 610 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
611 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 611 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
612 return 0; 612 return 0;
613 } 613 }
614 614
615 /* 615 /*
616 * Frees a number of pages from the PCP lists 616 * Frees a number of pages from the PCP lists
617 * Assumes all pages on list are in same zone, and of same order. 617 * Assumes all pages on list are in same zone, and of same order.
618 * count is the number of pages to free. 618 * count is the number of pages to free.
619 * 619 *
620 * If the zone was previously in an "all pages pinned" state then look to 620 * If the zone was previously in an "all pages pinned" state then look to
621 * see if this freeing clears that state. 621 * see if this freeing clears that state.
622 * 622 *
623 * And clear the zone's pages_scanned counter, to hold off the "all pages are 623 * And clear the zone's pages_scanned counter, to hold off the "all pages are
624 * pinned" detection logic. 624 * pinned" detection logic.
625 */ 625 */
626 static void free_pcppages_bulk(struct zone *zone, int count, 626 static void free_pcppages_bulk(struct zone *zone, int count,
627 struct per_cpu_pages *pcp) 627 struct per_cpu_pages *pcp)
628 { 628 {
629 int migratetype = 0; 629 int migratetype = 0;
630 int batch_free = 0; 630 int batch_free = 0;
631 int to_free = count; 631 int to_free = count;
632 632
633 spin_lock(&zone->lock); 633 spin_lock(&zone->lock);
634 zone->all_unreclaimable = 0; 634 zone->all_unreclaimable = 0;
635 zone->pages_scanned = 0; 635 zone->pages_scanned = 0;
636 636
637 while (to_free) { 637 while (to_free) {
638 struct page *page; 638 struct page *page;
639 struct list_head *list; 639 struct list_head *list;
640 640
641 /* 641 /*
642 * Remove pages from lists in a round-robin fashion. A 642 * Remove pages from lists in a round-robin fashion. A
643 * batch_free count is maintained that is incremented when an 643 * batch_free count is maintained that is incremented when an
644 * empty list is encountered. This is so more pages are freed 644 * empty list is encountered. This is so more pages are freed
645 * off fuller lists instead of spinning excessively around empty 645 * off fuller lists instead of spinning excessively around empty
646 * lists 646 * lists
647 */ 647 */
648 do { 648 do {
649 batch_free++; 649 batch_free++;
650 if (++migratetype == MIGRATE_PCPTYPES) 650 if (++migratetype == MIGRATE_PCPTYPES)
651 migratetype = 0; 651 migratetype = 0;
652 list = &pcp->lists[migratetype]; 652 list = &pcp->lists[migratetype];
653 } while (list_empty(list)); 653 } while (list_empty(list));
654 654
655 /* This is the only non-empty list. Free them all. */ 655 /* This is the only non-empty list. Free them all. */
656 if (batch_free == MIGRATE_PCPTYPES) 656 if (batch_free == MIGRATE_PCPTYPES)
657 batch_free = to_free; 657 batch_free = to_free;
658 658
659 do { 659 do {
660 int mt; /* migratetype of the to-be-freed page */ 660 int mt; /* migratetype of the to-be-freed page */
661 661
662 page = list_entry(list->prev, struct page, lru); 662 page = list_entry(list->prev, struct page, lru);
663 /* must delete as __free_one_page list manipulates */ 663 /* must delete as __free_one_page list manipulates */
664 list_del(&page->lru); 664 list_del(&page->lru);
665 mt = get_freepage_migratetype(page); 665 mt = get_freepage_migratetype(page);
666 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 666 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
667 __free_one_page(page, zone, 0, mt); 667 __free_one_page(page, zone, 0, mt);
668 trace_mm_page_pcpu_drain(page, 0, mt); 668 trace_mm_page_pcpu_drain(page, 0, mt);
669 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { 669 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
670 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 670 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
671 if (is_migrate_cma(mt)) 671 if (is_migrate_cma(mt))
672 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 672 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
673 } 673 }
674 } while (--to_free && --batch_free && !list_empty(list)); 674 } while (--to_free && --batch_free && !list_empty(list));
675 } 675 }
676 spin_unlock(&zone->lock); 676 spin_unlock(&zone->lock);
677 } 677 }
678 678
679 static void free_one_page(struct zone *zone, struct page *page, int order, 679 static void free_one_page(struct zone *zone, struct page *page, int order,
680 int migratetype) 680 int migratetype)
681 { 681 {
682 spin_lock(&zone->lock); 682 spin_lock(&zone->lock);
683 zone->all_unreclaimable = 0; 683 zone->all_unreclaimable = 0;
684 zone->pages_scanned = 0; 684 zone->pages_scanned = 0;
685 685
686 __free_one_page(page, zone, order, migratetype); 686 __free_one_page(page, zone, order, migratetype);
687 if (unlikely(migratetype != MIGRATE_ISOLATE)) 687 if (unlikely(migratetype != MIGRATE_ISOLATE))
688 __mod_zone_freepage_state(zone, 1 << order, migratetype); 688 __mod_zone_freepage_state(zone, 1 << order, migratetype);
689 spin_unlock(&zone->lock); 689 spin_unlock(&zone->lock);
690 } 690 }
691 691
692 static bool free_pages_prepare(struct page *page, unsigned int order) 692 static bool free_pages_prepare(struct page *page, unsigned int order)
693 { 693 {
694 int i; 694 int i;
695 int bad = 0; 695 int bad = 0;
696 696
697 trace_mm_page_free(page, order); 697 trace_mm_page_free(page, order);
698 kmemcheck_free_shadow(page, order); 698 kmemcheck_free_shadow(page, order);
699 699
700 if (PageAnon(page)) 700 if (PageAnon(page))
701 page->mapping = NULL; 701 page->mapping = NULL;
702 for (i = 0; i < (1 << order); i++) 702 for (i = 0; i < (1 << order); i++)
703 bad += free_pages_check(page + i); 703 bad += free_pages_check(page + i);
704 if (bad) 704 if (bad)
705 return false; 705 return false;
706 706
707 if (!PageHighMem(page)) { 707 if (!PageHighMem(page)) {
708 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 708 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
709 debug_check_no_obj_freed(page_address(page), 709 debug_check_no_obj_freed(page_address(page),
710 PAGE_SIZE << order); 710 PAGE_SIZE << order);
711 } 711 }
712 arch_free_page(page, order); 712 arch_free_page(page, order);
713 kernel_map_pages(page, 1 << order, 0); 713 kernel_map_pages(page, 1 << order, 0);
714 714
715 return true; 715 return true;
716 } 716 }
717 717
718 static void __free_pages_ok(struct page *page, unsigned int order) 718 static void __free_pages_ok(struct page *page, unsigned int order)
719 { 719 {
720 unsigned long flags; 720 unsigned long flags;
721 int migratetype; 721 int migratetype;
722 722
723 if (!free_pages_prepare(page, order)) 723 if (!free_pages_prepare(page, order))
724 return; 724 return;
725 725
726 local_irq_save(flags); 726 local_irq_save(flags);
727 __count_vm_events(PGFREE, 1 << order); 727 __count_vm_events(PGFREE, 1 << order);
728 migratetype = get_pageblock_migratetype(page); 728 migratetype = get_pageblock_migratetype(page);
729 set_freepage_migratetype(page, migratetype); 729 set_freepage_migratetype(page, migratetype);
730 free_one_page(page_zone(page), page, order, migratetype); 730 free_one_page(page_zone(page), page, order, migratetype);
731 local_irq_restore(flags); 731 local_irq_restore(flags);
732 } 732 }
733 733
734 /* 734 /*
735 * Read access to zone->managed_pages is safe because it's unsigned long, 735 * Read access to zone->managed_pages is safe because it's unsigned long,
736 * but we still need to serialize writers. Currently all callers of 736 * but we still need to serialize writers. Currently all callers of
737 * __free_pages_bootmem() except put_page_bootmem() should only be used 737 * __free_pages_bootmem() except put_page_bootmem() should only be used
738 * at boot time. So for shorter boot time, we shift the burden to 738 * at boot time. So for shorter boot time, we shift the burden to
739 * put_page_bootmem() to serialize writers. 739 * put_page_bootmem() to serialize writers.
740 */ 740 */
741 void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 741 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
742 { 742 {
743 unsigned int nr_pages = 1 << order; 743 unsigned int nr_pages = 1 << order;
744 unsigned int loop; 744 unsigned int loop;
745 745
746 prefetchw(page); 746 prefetchw(page);
747 for (loop = 0; loop < nr_pages; loop++) { 747 for (loop = 0; loop < nr_pages; loop++) {
748 struct page *p = &page[loop]; 748 struct page *p = &page[loop];
749 749
750 if (loop + 1 < nr_pages) 750 if (loop + 1 < nr_pages)
751 prefetchw(p + 1); 751 prefetchw(p + 1);
752 __ClearPageReserved(p); 752 __ClearPageReserved(p);
753 set_page_count(p, 0); 753 set_page_count(p, 0);
754 } 754 }
755 755
756 page_zone(page)->managed_pages += 1 << order; 756 page_zone(page)->managed_pages += 1 << order;
757 set_page_refcounted(page); 757 set_page_refcounted(page);
758 __free_pages(page, order); 758 __free_pages(page, order);
759 } 759 }
760 760
761 #ifdef CONFIG_CMA 761 #ifdef CONFIG_CMA
762 /* Free whole pageblock and set it's migration type to MIGRATE_CMA. */ 762 /* Free whole pageblock and set it's migration type to MIGRATE_CMA. */
763 void __init init_cma_reserved_pageblock(struct page *page) 763 void __init init_cma_reserved_pageblock(struct page *page)
764 { 764 {
765 unsigned i = pageblock_nr_pages; 765 unsigned i = pageblock_nr_pages;
766 struct page *p = page; 766 struct page *p = page;
767 767
768 do { 768 do {
769 __ClearPageReserved(p); 769 __ClearPageReserved(p);
770 set_page_count(p, 0); 770 set_page_count(p, 0);
771 } while (++p, --i); 771 } while (++p, --i);
772 772
773 set_page_refcounted(page); 773 set_page_refcounted(page);
774 set_pageblock_migratetype(page, MIGRATE_CMA); 774 set_pageblock_migratetype(page, MIGRATE_CMA);
775 __free_pages(page, pageblock_order); 775 __free_pages(page, pageblock_order);
776 totalram_pages += pageblock_nr_pages; 776 totalram_pages += pageblock_nr_pages;
777 #ifdef CONFIG_HIGHMEM 777 #ifdef CONFIG_HIGHMEM
778 if (PageHighMem(page)) 778 if (PageHighMem(page))
779 totalhigh_pages += pageblock_nr_pages; 779 totalhigh_pages += pageblock_nr_pages;
780 #endif 780 #endif
781 } 781 }
782 #endif 782 #endif
783 783
784 /* 784 /*
785 * The order of subdivision here is critical for the IO subsystem. 785 * The order of subdivision here is critical for the IO subsystem.
786 * Please do not alter this order without good reasons and regression 786 * Please do not alter this order without good reasons and regression
787 * testing. Specifically, as large blocks of memory are subdivided, 787 * testing. Specifically, as large blocks of memory are subdivided,
788 * the order in which smaller blocks are delivered depends on the order 788 * the order in which smaller blocks are delivered depends on the order
789 * they're subdivided in this function. This is the primary factor 789 * they're subdivided in this function. This is the primary factor
790 * influencing the order in which pages are delivered to the IO 790 * influencing the order in which pages are delivered to the IO
791 * subsystem according to empirical testing, and this is also justified 791 * subsystem according to empirical testing, and this is also justified
792 * by considering the behavior of a buddy system containing a single 792 * by considering the behavior of a buddy system containing a single
793 * large block of memory acted on by a series of small allocations. 793 * large block of memory acted on by a series of small allocations.
794 * This behavior is a critical factor in sglist merging's success. 794 * This behavior is a critical factor in sglist merging's success.
795 * 795 *
796 * -- nyc 796 * -- nyc
797 */ 797 */
798 static inline void expand(struct zone *zone, struct page *page, 798 static inline void expand(struct zone *zone, struct page *page,
799 int low, int high, struct free_area *area, 799 int low, int high, struct free_area *area,
800 int migratetype) 800 int migratetype)
801 { 801 {
802 unsigned long size = 1 << high; 802 unsigned long size = 1 << high;
803 803
804 while (high > low) { 804 while (high > low) {
805 area--; 805 area--;
806 high--; 806 high--;
807 size >>= 1; 807 size >>= 1;
808 VM_BUG_ON(bad_range(zone, &page[size])); 808 VM_BUG_ON(bad_range(zone, &page[size]));
809 809
810 #ifdef CONFIG_DEBUG_PAGEALLOC 810 #ifdef CONFIG_DEBUG_PAGEALLOC
811 if (high < debug_guardpage_minorder()) { 811 if (high < debug_guardpage_minorder()) {
812 /* 812 /*
813 * Mark as guard pages (or page), that will allow to 813 * Mark as guard pages (or page), that will allow to
814 * merge back to allocator when buddy will be freed. 814 * merge back to allocator when buddy will be freed.
815 * Corresponding page table entries will not be touched, 815 * Corresponding page table entries will not be touched,
816 * pages will stay not present in virtual address space 816 * pages will stay not present in virtual address space
817 */ 817 */
818 INIT_LIST_HEAD(&page[size].lru); 818 INIT_LIST_HEAD(&page[size].lru);
819 set_page_guard_flag(&page[size]); 819 set_page_guard_flag(&page[size]);
820 set_page_private(&page[size], high); 820 set_page_private(&page[size], high);
821 /* Guard pages are not available for any usage */ 821 /* Guard pages are not available for any usage */
822 __mod_zone_freepage_state(zone, -(1 << high), 822 __mod_zone_freepage_state(zone, -(1 << high),
823 migratetype); 823 migratetype);
824 continue; 824 continue;
825 } 825 }
826 #endif 826 #endif
827 list_add(&page[size].lru, &area->free_list[migratetype]); 827 list_add(&page[size].lru, &area->free_list[migratetype]);
828 area->nr_free++; 828 area->nr_free++;
829 set_page_order(&page[size], high); 829 set_page_order(&page[size], high);
830 } 830 }
831 } 831 }
832 832
833 /* 833 /*
834 * This page is about to be returned from the page allocator 834 * This page is about to be returned from the page allocator
835 */ 835 */
836 static inline int check_new_page(struct page *page) 836 static inline int check_new_page(struct page *page)
837 { 837 {
838 if (unlikely(page_mapcount(page) | 838 if (unlikely(page_mapcount(page) |
839 (page->mapping != NULL) | 839 (page->mapping != NULL) |
840 (atomic_read(&page->_count) != 0) | 840 (atomic_read(&page->_count) != 0) |
841 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 841 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
842 (mem_cgroup_bad_page_check(page)))) { 842 (mem_cgroup_bad_page_check(page)))) {
843 bad_page(page); 843 bad_page(page);
844 return 1; 844 return 1;
845 } 845 }
846 return 0; 846 return 0;
847 } 847 }
848 848
849 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 849 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
850 { 850 {
851 int i; 851 int i;
852 852
853 for (i = 0; i < (1 << order); i++) { 853 for (i = 0; i < (1 << order); i++) {
854 struct page *p = page + i; 854 struct page *p = page + i;
855 if (unlikely(check_new_page(p))) 855 if (unlikely(check_new_page(p)))
856 return 1; 856 return 1;
857 } 857 }
858 858
859 set_page_private(page, 0); 859 set_page_private(page, 0);
860 set_page_refcounted(page); 860 set_page_refcounted(page);
861 861
862 arch_alloc_page(page, order); 862 arch_alloc_page(page, order);
863 kernel_map_pages(page, 1 << order, 1); 863 kernel_map_pages(page, 1 << order, 1);
864 864
865 if (gfp_flags & __GFP_ZERO) 865 if (gfp_flags & __GFP_ZERO)
866 prep_zero_page(page, order, gfp_flags); 866 prep_zero_page(page, order, gfp_flags);
867 867
868 if (order && (gfp_flags & __GFP_COMP)) 868 if (order && (gfp_flags & __GFP_COMP))
869 prep_compound_page(page, order); 869 prep_compound_page(page, order);
870 870
871 return 0; 871 return 0;
872 } 872 }
873 873
874 /* 874 /*
875 * Go through the free lists for the given migratetype and remove 875 * Go through the free lists for the given migratetype and remove
876 * the smallest available page from the freelists 876 * the smallest available page from the freelists
877 */ 877 */
878 static inline 878 static inline
879 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 879 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
880 int migratetype) 880 int migratetype)
881 { 881 {
882 unsigned int current_order; 882 unsigned int current_order;
883 struct free_area * area; 883 struct free_area * area;
884 struct page *page; 884 struct page *page;
885 885
886 /* Find a page of the appropriate size in the preferred list */ 886 /* Find a page of the appropriate size in the preferred list */
887 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 887 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
888 area = &(zone->free_area[current_order]); 888 area = &(zone->free_area[current_order]);
889 if (list_empty(&area->free_list[migratetype])) 889 if (list_empty(&area->free_list[migratetype]))
890 continue; 890 continue;
891 891
892 page = list_entry(area->free_list[migratetype].next, 892 page = list_entry(area->free_list[migratetype].next,
893 struct page, lru); 893 struct page, lru);
894 list_del(&page->lru); 894 list_del(&page->lru);
895 rmv_page_order(page); 895 rmv_page_order(page);
896 area->nr_free--; 896 area->nr_free--;
897 expand(zone, page, order, current_order, area, migratetype); 897 expand(zone, page, order, current_order, area, migratetype);
898 return page; 898 return page;
899 } 899 }
900 900
901 return NULL; 901 return NULL;
902 } 902 }
903 903
904 904
905 /* 905 /*
906 * This array describes the order lists are fallen back to when 906 * This array describes the order lists are fallen back to when
907 * the free lists for the desirable migrate type are depleted 907 * the free lists for the desirable migrate type are depleted
908 */ 908 */
909 static int fallbacks[MIGRATE_TYPES][4] = { 909 static int fallbacks[MIGRATE_TYPES][4] = {
910 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 910 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
911 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 911 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
912 #ifdef CONFIG_CMA 912 #ifdef CONFIG_CMA
913 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 913 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
914 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 914 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
915 #else 915 #else
916 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 916 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
917 #endif 917 #endif
918 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 918 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
919 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 919 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
920 }; 920 };
921 921
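/*
 * A hedged sketch of how a fallback table like the one above is consulted:
 * walk the row for the starting type until a type with free pages is found,
 * stopping at the reserve sentinel. The enum, fallback_table[] and has_free()
 * below are illustrative only, not the kernel's definitions.
 */

#include <stdio.h>

enum mt { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE, MT_TYPES };

static const int fallback_table[MT_TYPES][3] = {
        [MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE },
        [MT_RECLAIMABLE] = { MT_UNMOVABLE, MT_MOVABLE, MT_RESERVE },
        [MT_MOVABLE]     = { MT_RECLAIMABLE, MT_UNMOVABLE, MT_RESERVE },
        [MT_RESERVE]     = { MT_RESERVE },
};

/* stub: pretend only the movable lists have pages left */
static int has_free(int mt)
{
        return mt == MT_MOVABLE;
}

static int find_fallback(int start_mt)
{
        for (int i = 0; ; i++) {
                int mt = fallback_table[start_mt][i];

                if (mt == MT_RESERVE)   /* reserve is handled separately */
                        return -1;
                if (has_free(mt))
                        return mt;
        }
}

int main(void)
{
        printf("unmovable request falls back to type %d (movable)\n",
               find_fallback(MT_UNMOVABLE));
        return 0;
}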
922 /* 922 /*
923 * Move the free pages in a range to the free lists of the requested type. 923 * Move the free pages in a range to the free lists of the requested type.
924 * Note that start_page and end_page need not be aligned on a pageblock 924 * Note that start_page and end_page need not be aligned on a pageblock
925 * boundary. If alignment is required, use move_freepages_block() 925 * boundary. If alignment is required, use move_freepages_block()
926 */ 926 */
927 int move_freepages(struct zone *zone, 927 int move_freepages(struct zone *zone,
928 struct page *start_page, struct page *end_page, 928 struct page *start_page, struct page *end_page,
929 int migratetype) 929 int migratetype)
930 { 930 {
931 struct page *page; 931 struct page *page;
932 unsigned long order; 932 unsigned long order;
933 int pages_moved = 0; 933 int pages_moved = 0;
934 934
935 #ifndef CONFIG_HOLES_IN_ZONE 935 #ifndef CONFIG_HOLES_IN_ZONE
936 /* 936 /*
937 * page_zone is not safe to call in this context when 937 * page_zone is not safe to call in this context when
938 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 938 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
939 * anyway as we check zone boundaries in move_freepages_block(). 939 * anyway as we check zone boundaries in move_freepages_block().
940 * Remove at a later date when no bug reports exist related to 940 * Remove at a later date when no bug reports exist related to
941 * grouping pages by mobility 941 * grouping pages by mobility
942 */ 942 */
943 BUG_ON(page_zone(start_page) != page_zone(end_page)); 943 BUG_ON(page_zone(start_page) != page_zone(end_page));
944 #endif 944 #endif
945 945
946 for (page = start_page; page <= end_page;) { 946 for (page = start_page; page <= end_page;) {
947 /* Make sure we are not inadvertently changing nodes */ 947 /* Make sure we are not inadvertently changing nodes */
948 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 948 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
949 949
950 if (!pfn_valid_within(page_to_pfn(page))) { 950 if (!pfn_valid_within(page_to_pfn(page))) {
951 page++; 951 page++;
952 continue; 952 continue;
953 } 953 }
954 954
955 if (!PageBuddy(page)) { 955 if (!PageBuddy(page)) {
956 page++; 956 page++;
957 continue; 957 continue;
958 } 958 }
959 959
960 order = page_order(page); 960 order = page_order(page);
961 list_move(&page->lru, 961 list_move(&page->lru,
962 &zone->free_area[order].free_list[migratetype]); 962 &zone->free_area[order].free_list[migratetype]);
963 set_freepage_migratetype(page, migratetype); 963 set_freepage_migratetype(page, migratetype);
964 page += 1 << order; 964 page += 1 << order;
965 pages_moved += 1 << order; 965 pages_moved += 1 << order;
966 } 966 }
967 967
968 return pages_moved; 968 return pages_moved;
969 } 969 }
970 970
971 int move_freepages_block(struct zone *zone, struct page *page, 971 int move_freepages_block(struct zone *zone, struct page *page,
972 int migratetype) 972 int migratetype)
973 { 973 {
974 unsigned long start_pfn, end_pfn; 974 unsigned long start_pfn, end_pfn;
975 struct page *start_page, *end_page; 975 struct page *start_page, *end_page;
976 976
977 start_pfn = page_to_pfn(page); 977 start_pfn = page_to_pfn(page);
978 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 978 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
979 start_page = pfn_to_page(start_pfn); 979 start_page = pfn_to_page(start_pfn);
980 end_page = start_page + pageblock_nr_pages - 1; 980 end_page = start_page + pageblock_nr_pages - 1;
981 end_pfn = start_pfn + pageblock_nr_pages - 1; 981 end_pfn = start_pfn + pageblock_nr_pages - 1;
982 982
983 /* Do not cross zone boundaries */ 983 /* Do not cross zone boundaries */
984 if (start_pfn < zone->zone_start_pfn) 984 if (start_pfn < zone->zone_start_pfn)
985 start_page = page; 985 start_page = page;
986 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 986 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
987 return 0; 987 return 0;
988 988
989 return move_freepages(zone, start_page, end_page, migratetype); 989 return move_freepages(zone, start_page, end_page, migratetype);
990 } 990 }
991 991
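/*
 * The only subtle step above is the power-of-two round-down of the pfn to
 * its pageblock. A standalone illustration of that mask arithmetic; the
 * block size below is an assumption (any power of two behaves the same way).
 */

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL        /* e.g. 2MB pageblocks of 4KB pages */

int main(void)
{
        unsigned long pfn = 123456;
        unsigned long start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
        unsigned long end_pfn = start_pfn + PAGEBLOCK_NR_PAGES - 1;

        /* 123456 rounds down to 123392; the block spans 512 consecutive pfns */
        printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
        return 0;
}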
992 static void change_pageblock_range(struct page *pageblock_page, 992 static void change_pageblock_range(struct page *pageblock_page,
993 int start_order, int migratetype) 993 int start_order, int migratetype)
994 { 994 {
995 int nr_pageblocks = 1 << (start_order - pageblock_order); 995 int nr_pageblocks = 1 << (start_order - pageblock_order);
996 996
997 while (nr_pageblocks--) { 997 while (nr_pageblocks--) {
998 set_pageblock_migratetype(pageblock_page, migratetype); 998 set_pageblock_migratetype(pageblock_page, migratetype);
999 pageblock_page += pageblock_nr_pages; 999 pageblock_page += pageblock_nr_pages;
1000 } 1000 }
1001 } 1001 }
1002 1002
1003 /* Remove an element from the buddy allocator from the fallback list */ 1003 /* Remove an element from the buddy allocator from the fallback list */
1004 static inline struct page * 1004 static inline struct page *
1005 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1005 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1006 { 1006 {
1007 struct free_area * area; 1007 struct free_area * area;
1008 int current_order; 1008 int current_order;
1009 struct page *page; 1009 struct page *page;
1010 int migratetype, i; 1010 int migratetype, i;
1011 1011
1012 /* Find the largest possible block of pages in the other list */ 1012 /* Find the largest possible block of pages in the other list */
1013 for (current_order = MAX_ORDER-1; current_order >= order; 1013 for (current_order = MAX_ORDER-1; current_order >= order;
1014 --current_order) { 1014 --current_order) {
1015 for (i = 0;; i++) { 1015 for (i = 0;; i++) {
1016 migratetype = fallbacks[start_migratetype][i]; 1016 migratetype = fallbacks[start_migratetype][i];
1017 1017
1018 /* MIGRATE_RESERVE handled later if necessary */ 1018 /* MIGRATE_RESERVE handled later if necessary */
1019 if (migratetype == MIGRATE_RESERVE) 1019 if (migratetype == MIGRATE_RESERVE)
1020 break; 1020 break;
1021 1021
1022 area = &(zone->free_area[current_order]); 1022 area = &(zone->free_area[current_order]);
1023 if (list_empty(&area->free_list[migratetype])) 1023 if (list_empty(&area->free_list[migratetype]))
1024 continue; 1024 continue;
1025 1025
1026 page = list_entry(area->free_list[migratetype].next, 1026 page = list_entry(area->free_list[migratetype].next,
1027 struct page, lru); 1027 struct page, lru);
1028 area->nr_free--; 1028 area->nr_free--;
1029 1029
1030 /* 1030 /*
1031 * If breaking a large block of pages, move all free 1031 * If breaking a large block of pages, move all free
1032 * pages to the preferred allocation list. If falling 1032 * pages to the preferred allocation list. If falling
1033 * back for a reclaimable kernel allocation, be more 1033 * back for a reclaimable kernel allocation, be more
1034 * aggressive about taking ownership of free pages 1034 * aggressive about taking ownership of free pages
1035 * 1035 *
1036 * On the other hand, never change migration 1036 * On the other hand, never change migration
1037 * type of MIGRATE_CMA pageblocks nor move CMA 1037 * type of MIGRATE_CMA pageblocks nor move CMA
1038 * pages on different free lists. We don't 1038 * pages on different free lists. We don't
1039 * want unmovable pages to be allocated from 1039 * want unmovable pages to be allocated from
1040 * MIGRATE_CMA areas. 1040 * MIGRATE_CMA areas.
1041 */ 1041 */
1042 if (!is_migrate_cma(migratetype) && 1042 if (!is_migrate_cma(migratetype) &&
1043 (unlikely(current_order >= pageblock_order / 2) || 1043 (unlikely(current_order >= pageblock_order / 2) ||
1044 start_migratetype == MIGRATE_RECLAIMABLE || 1044 start_migratetype == MIGRATE_RECLAIMABLE ||
1045 page_group_by_mobility_disabled)) { 1045 page_group_by_mobility_disabled)) {
1046 int pages; 1046 int pages;
1047 pages = move_freepages_block(zone, page, 1047 pages = move_freepages_block(zone, page,
1048 start_migratetype); 1048 start_migratetype);
1049 1049
1050 /* Claim the whole block if over half of it is free */ 1050 /* Claim the whole block if over half of it is free */
1051 if (pages >= (1 << (pageblock_order-1)) || 1051 if (pages >= (1 << (pageblock_order-1)) ||
1052 page_group_by_mobility_disabled) 1052 page_group_by_mobility_disabled)
1053 set_pageblock_migratetype(page, 1053 set_pageblock_migratetype(page,
1054 start_migratetype); 1054 start_migratetype);
1055 1055
1056 migratetype = start_migratetype; 1056 migratetype = start_migratetype;
1057 } 1057 }
1058 1058
1059 /* Remove the page from the freelists */ 1059 /* Remove the page from the freelists */
1060 list_del(&page->lru); 1060 list_del(&page->lru);
1061 rmv_page_order(page); 1061 rmv_page_order(page);
1062 1062
1063 /* Take ownership for orders >= pageblock_order */ 1063 /* Take ownership for orders >= pageblock_order */
1064 if (current_order >= pageblock_order && 1064 if (current_order >= pageblock_order &&
1065 !is_migrate_cma(migratetype)) 1065 !is_migrate_cma(migratetype))
1066 change_pageblock_range(page, current_order, 1066 change_pageblock_range(page, current_order,
1067 start_migratetype); 1067 start_migratetype);
1068 1068
1069 expand(zone, page, order, current_order, area, 1069 expand(zone, page, order, current_order, area,
1070 is_migrate_cma(migratetype) 1070 is_migrate_cma(migratetype)
1071 ? migratetype : start_migratetype); 1071 ? migratetype : start_migratetype);
1072 1072
1073 trace_mm_page_alloc_extfrag(page, order, current_order, 1073 trace_mm_page_alloc_extfrag(page, order, current_order,
1074 start_migratetype, migratetype); 1074 start_migratetype, migratetype);
1075 1075
1076 return page; 1076 return page;
1077 } 1077 }
1078 } 1078 }
1079 1079
1080 return NULL; 1080 return NULL;
1081 } 1081 }
1082 1082
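/*
 * The pageblock-stealing policy above reduces to a threshold decision:
 * claim the whole block for the preferred type only if more than half of it
 * is already free (or grouping by mobility is disabled). A distilled version;
 * should_claim_block() and PAGEBLOCK_ORDER here are illustrative names.
 */

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_ORDER 9               /* 512-page blocks, as an example */

static bool should_claim_block(int pages_moved, bool grouping_disabled)
{
        return pages_moved >= (1 << (PAGEBLOCK_ORDER - 1)) || grouping_disabled;
}

int main(void)
{
        /* 256 of the 512 pages were free, so ownership of the block changes */
        printf("claim block? %d\n", should_claim_block(256, false));
        return 0;
}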
1083 /* 1083 /*
1084 * Do the hard work of removing an element from the buddy allocator. 1084 * Do the hard work of removing an element from the buddy allocator.
1085 * Call me with the zone->lock already held. 1085 * Call me with the zone->lock already held.
1086 */ 1086 */
1087 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1087 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1088 int migratetype) 1088 int migratetype)
1089 { 1089 {
1090 struct page *page; 1090 struct page *page;
1091 1091
1092 retry_reserve: 1092 retry_reserve:
1093 page = __rmqueue_smallest(zone, order, migratetype); 1093 page = __rmqueue_smallest(zone, order, migratetype);
1094 1094
1095 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1095 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1096 page = __rmqueue_fallback(zone, order, migratetype); 1096 page = __rmqueue_fallback(zone, order, migratetype);
1097 1097
1098 /* 1098 /*
1099 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1099 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1100 * is used because __rmqueue_smallest is an inline function 1100 * is used because __rmqueue_smallest is an inline function
1101 * and we want just one call site 1101 * and we want just one call site
1102 */ 1102 */
1103 if (!page) { 1103 if (!page) {
1104 migratetype = MIGRATE_RESERVE; 1104 migratetype = MIGRATE_RESERVE;
1105 goto retry_reserve; 1105 goto retry_reserve;
1106 } 1106 }
1107 } 1107 }
1108 1108
1109 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1109 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1110 return page; 1110 return page;
1111 } 1111 }
1112 1112
1113 /* 1113 /*
1114 * Obtain a specified number of elements from the buddy allocator, all under 1114 * Obtain a specified number of elements from the buddy allocator, all under
1115 * a single hold of the lock, for efficiency. Add them to the supplied list. 1115 * a single hold of the lock, for efficiency. Add them to the supplied list.
1116 * Returns the number of new pages which were placed at *list. 1116 * Returns the number of new pages which were placed at *list.
1117 */ 1117 */
1118 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1118 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1119 unsigned long count, struct list_head *list, 1119 unsigned long count, struct list_head *list,
1120 int migratetype, int cold) 1120 int migratetype, int cold)
1121 { 1121 {
1122 int mt = migratetype, i; 1122 int mt = migratetype, i;
1123 1123
1124 spin_lock(&zone->lock); 1124 spin_lock(&zone->lock);
1125 for (i = 0; i < count; ++i) { 1125 for (i = 0; i < count; ++i) {
1126 struct page *page = __rmqueue(zone, order, migratetype); 1126 struct page *page = __rmqueue(zone, order, migratetype);
1127 if (unlikely(page == NULL)) 1127 if (unlikely(page == NULL))
1128 break; 1128 break;
1129 1129
1130 /* 1130 /*
1131 * Split buddy pages returned by expand() are received here 1131 * Split buddy pages returned by expand() are received here
1132 * in physical page order. The page is added to the caller's 1132 * in physical page order. The page is added to the caller's
1133 * list and the list head then moves forward. From the caller's 1133 * list and the list head then moves forward. From the caller's
1134 * perspective, the linked list is ordered by page number in 1134 * perspective, the linked list is ordered by page number in
1135 * some conditions. This is useful for IO devices that can 1135 * some conditions. This is useful for IO devices that can
1136 * merge IO requests if the physical pages are ordered 1136 * merge IO requests if the physical pages are ordered
1137 * properly. 1137 * properly.
1138 */ 1138 */
1139 if (likely(cold == 0)) 1139 if (likely(cold == 0))
1140 list_add(&page->lru, list); 1140 list_add(&page->lru, list);
1141 else 1141 else
1142 list_add_tail(&page->lru, list); 1142 list_add_tail(&page->lru, list);
1143 if (IS_ENABLED(CONFIG_CMA)) { 1143 if (IS_ENABLED(CONFIG_CMA)) {
1144 mt = get_pageblock_migratetype(page); 1144 mt = get_pageblock_migratetype(page);
1145 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1145 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1146 mt = migratetype; 1146 mt = migratetype;
1147 } 1147 }
1148 set_freepage_migratetype(page, mt); 1148 set_freepage_migratetype(page, mt);
1149 list = &page->lru; 1149 list = &page->lru;
1150 if (is_migrate_cma(mt)) 1150 if (is_migrate_cma(mt))
1151 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1151 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1152 -(1 << order)); 1152 -(1 << order));
1153 } 1153 }
1154 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1154 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1155 spin_unlock(&zone->lock); 1155 spin_unlock(&zone->lock);
1156 return i; 1156 return i;
1157 } 1157 }
1158 1158
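/*
 * The hot/cold placement above (head for hot pages, tail for cold ones) is
 * the standard intrusive doubly-linked list trick. A compact userspace
 * rendition of list_add()/list_add_tail(), purely for illustration:
 */

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h)
{
        h->next = h->prev = h;
}

static void __list_insert(struct list_head *n, struct list_head *prev,
                          struct list_head *next)
{
        next->prev = n;
        n->next = next;
        n->prev = prev;
        prev->next = n;
}

/* hot pages go to the head, so they are handed out again first */
static void list_add(struct list_head *n, struct list_head *h)
{
        __list_insert(n, h, h->next);
}

/* cold pages go to the tail, so they are handed out last */
static void list_add_tail(struct list_head *n, struct list_head *h)
{
        __list_insert(n, h->prev, h);
}

int main(void)
{
        struct list_head pcp, hot, cold;

        list_init(&pcp);
        list_add(&hot, &pcp);
        list_add_tail(&cold, &pcp);

        printf("hot entry sits at the head: %d\n", pcp.next == &hot);
        return 0;
}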
1159 #ifdef CONFIG_NUMA 1159 #ifdef CONFIG_NUMA
1160 /* 1160 /*
1161 * Called from the vmstat counter updater to drain pagesets of this 1161 * Called from the vmstat counter updater to drain pagesets of this
1162 * currently executing processor on remote nodes after they have 1162 * currently executing processor on remote nodes after they have
1163 * expired. 1163 * expired.
1164 * 1164 *
1165 * Note that this function must be called with the thread pinned to 1165 * Note that this function must be called with the thread pinned to
1166 * a single processor. 1166 * a single processor.
1167 */ 1167 */
1168 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1168 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1169 { 1169 {
1170 unsigned long flags; 1170 unsigned long flags;
1171 int to_drain; 1171 int to_drain;
1172 1172
1173 local_irq_save(flags); 1173 local_irq_save(flags);
1174 if (pcp->count >= pcp->batch) 1174 if (pcp->count >= pcp->batch)
1175 to_drain = pcp->batch; 1175 to_drain = pcp->batch;
1176 else 1176 else
1177 to_drain = pcp->count; 1177 to_drain = pcp->count;
1178 if (to_drain > 0) { 1178 if (to_drain > 0) {
1179 free_pcppages_bulk(zone, to_drain, pcp); 1179 free_pcppages_bulk(zone, to_drain, pcp);
1180 pcp->count -= to_drain; 1180 pcp->count -= to_drain;
1181 } 1181 }
1182 local_irq_restore(flags); 1182 local_irq_restore(flags);
1183 } 1183 }
1184 #endif 1184 #endif
1185 1185
1186 /* 1186 /*
1187 * Drain pages of the indicated processor. 1187 * Drain pages of the indicated processor.
1188 * 1188 *
1189 * The processor must either be the current processor and the 1189 * The processor must either be the current processor and the
1190 * thread pinned to the current processor or a processor that 1190 * thread pinned to the current processor or a processor that
1191 * is not online. 1191 * is not online.
1192 */ 1192 */
1193 static void drain_pages(unsigned int cpu) 1193 static void drain_pages(unsigned int cpu)
1194 { 1194 {
1195 unsigned long flags; 1195 unsigned long flags;
1196 struct zone *zone; 1196 struct zone *zone;
1197 1197
1198 for_each_populated_zone(zone) { 1198 for_each_populated_zone(zone) {
1199 struct per_cpu_pageset *pset; 1199 struct per_cpu_pageset *pset;
1200 struct per_cpu_pages *pcp; 1200 struct per_cpu_pages *pcp;
1201 1201
1202 local_irq_save(flags); 1202 local_irq_save(flags);
1203 pset = per_cpu_ptr(zone->pageset, cpu); 1203 pset = per_cpu_ptr(zone->pageset, cpu);
1204 1204
1205 pcp = &pset->pcp; 1205 pcp = &pset->pcp;
1206 if (pcp->count) { 1206 if (pcp->count) {
1207 free_pcppages_bulk(zone, pcp->count, pcp); 1207 free_pcppages_bulk(zone, pcp->count, pcp);
1208 pcp->count = 0; 1208 pcp->count = 0;
1209 } 1209 }
1210 local_irq_restore(flags); 1210 local_irq_restore(flags);
1211 } 1211 }
1212 } 1212 }
1213 1213
1214 /* 1214 /*
1215 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1215 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1216 */ 1216 */
1217 void drain_local_pages(void *arg) 1217 void drain_local_pages(void *arg)
1218 { 1218 {
1219 drain_pages(smp_processor_id()); 1219 drain_pages(smp_processor_id());
1220 } 1220 }
1221 1221
1222 /* 1222 /*
1223 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1223 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1224 * 1224 *
1225 * Note that this code is protected against sending an IPI to an offline 1225 * Note that this code is protected against sending an IPI to an offline
1226 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1226 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1227 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1227 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1228 * nothing keeps CPUs from showing up after we populated the cpumask and 1228 * nothing keeps CPUs from showing up after we populated the cpumask and
1229 * before the call to on_each_cpu_mask(). 1229 * before the call to on_each_cpu_mask().
1230 */ 1230 */
1231 void drain_all_pages(void) 1231 void drain_all_pages(void)
1232 { 1232 {
1233 int cpu; 1233 int cpu;
1234 struct per_cpu_pageset *pcp; 1234 struct per_cpu_pageset *pcp;
1235 struct zone *zone; 1235 struct zone *zone;
1236 1236
1237 /* 1237 /*
1238 * Allocate in the BSS so we won't require allocation in 1238 * Allocate in the BSS so we won't require allocation in
1239 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1239 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1240 */ 1240 */
1241 static cpumask_t cpus_with_pcps; 1241 static cpumask_t cpus_with_pcps;
1242 1242
1243 /* 1243 /*
1244 * We don't care about racing with CPU hotplug events, 1244 * We don't care about racing with CPU hotplug events,
1245 * as the offline notification will cause the notified 1245 * as the offline notification will cause the notified
1246 * cpu to drain its pcp lists, and on_each_cpu_mask() 1246 * cpu to drain its pcp lists, and on_each_cpu_mask()
1247 * disables preemption as part of its processing 1247 * disables preemption as part of its processing
1248 */ 1248 */
1249 for_each_online_cpu(cpu) { 1249 for_each_online_cpu(cpu) {
1250 bool has_pcps = false; 1250 bool has_pcps = false;
1251 for_each_populated_zone(zone) { 1251 for_each_populated_zone(zone) {
1252 pcp = per_cpu_ptr(zone->pageset, cpu); 1252 pcp = per_cpu_ptr(zone->pageset, cpu);
1253 if (pcp->pcp.count) { 1253 if (pcp->pcp.count) {
1254 has_pcps = true; 1254 has_pcps = true;
1255 break; 1255 break;
1256 } 1256 }
1257 } 1257 }
1258 if (has_pcps) 1258 if (has_pcps)
1259 cpumask_set_cpu(cpu, &cpus_with_pcps); 1259 cpumask_set_cpu(cpu, &cpus_with_pcps);
1260 else 1260 else
1261 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1261 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1262 } 1262 }
1263 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1263 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1264 } 1264 }
1265 1265
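/*
 * drain_all_pages() narrows the IPI to CPUs that actually hold per-cpu
 * pages. The same "scan, build a mask, then act only on the set bits" shape
 * in plain C; NR_CPUS and the per-cpu counts below are invented.
 */

#include <stdio.h>

#define NR_CPUS 8

static const int pcp_count[NR_CPUS] = { 0, 3, 0, 0, 7, 0, 0, 1 };

static void drain_cpu(int cpu)
{
        printf("draining cpu %d\n", cpu);       /* an IPI in the kernel */
}

int main(void)
{
        unsigned int cpus_with_pcps = 0;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (pcp_count[cpu])
                        cpus_with_pcps |= 1u << cpu;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (cpus_with_pcps & (1u << cpu))
                        drain_cpu(cpu);         /* on_each_cpu_mask() above */
        return 0;
}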
1266 #ifdef CONFIG_HIBERNATION 1266 #ifdef CONFIG_HIBERNATION
1267 1267
1268 void mark_free_pages(struct zone *zone) 1268 void mark_free_pages(struct zone *zone)
1269 { 1269 {
1270 unsigned long pfn, max_zone_pfn; 1270 unsigned long pfn, max_zone_pfn;
1271 unsigned long flags; 1271 unsigned long flags;
1272 int order, t; 1272 int order, t;
1273 struct list_head *curr; 1273 struct list_head *curr;
1274 1274
1275 if (!zone->spanned_pages) 1275 if (!zone->spanned_pages)
1276 return; 1276 return;
1277 1277
1278 spin_lock_irqsave(&zone->lock, flags); 1278 spin_lock_irqsave(&zone->lock, flags);
1279 1279
1280 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1280 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1281 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1281 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1282 if (pfn_valid(pfn)) { 1282 if (pfn_valid(pfn)) {
1283 struct page *page = pfn_to_page(pfn); 1283 struct page *page = pfn_to_page(pfn);
1284 1284
1285 if (!swsusp_page_is_forbidden(page)) 1285 if (!swsusp_page_is_forbidden(page))
1286 swsusp_unset_page_free(page); 1286 swsusp_unset_page_free(page);
1287 } 1287 }
1288 1288
1289 for_each_migratetype_order(order, t) { 1289 for_each_migratetype_order(order, t) {
1290 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1290 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1291 unsigned long i; 1291 unsigned long i;
1292 1292
1293 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1293 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1294 for (i = 0; i < (1UL << order); i++) 1294 for (i = 0; i < (1UL << order); i++)
1295 swsusp_set_page_free(pfn_to_page(pfn + i)); 1295 swsusp_set_page_free(pfn_to_page(pfn + i));
1296 } 1296 }
1297 } 1297 }
1298 spin_unlock_irqrestore(&zone->lock, flags); 1298 spin_unlock_irqrestore(&zone->lock, flags);
1299 } 1299 }
1300 #endif /* CONFIG_PM */ 1300 #endif /* CONFIG_PM */
1301 1301
1302 /* 1302 /*
1303 * Free a 0-order page 1303 * Free a 0-order page
1304 * cold == 1 ? free a cold page : free a hot page 1304 * cold == 1 ? free a cold page : free a hot page
1305 */ 1305 */
1306 void free_hot_cold_page(struct page *page, int cold) 1306 void free_hot_cold_page(struct page *page, int cold)
1307 { 1307 {
1308 struct zone *zone = page_zone(page); 1308 struct zone *zone = page_zone(page);
1309 struct per_cpu_pages *pcp; 1309 struct per_cpu_pages *pcp;
1310 unsigned long flags; 1310 unsigned long flags;
1311 int migratetype; 1311 int migratetype;
1312 1312
1313 if (!free_pages_prepare(page, 0)) 1313 if (!free_pages_prepare(page, 0))
1314 return; 1314 return;
1315 1315
1316 migratetype = get_pageblock_migratetype(page); 1316 migratetype = get_pageblock_migratetype(page);
1317 set_freepage_migratetype(page, migratetype); 1317 set_freepage_migratetype(page, migratetype);
1318 local_irq_save(flags); 1318 local_irq_save(flags);
1319 __count_vm_event(PGFREE); 1319 __count_vm_event(PGFREE);
1320 1320
1321 /* 1321 /*
1322 * We only track unmovable, reclaimable and movable on pcp lists. 1322 * We only track unmovable, reclaimable and movable on pcp lists.
1323 * Free ISOLATE pages back to the allocator because they are being 1323 * Free ISOLATE pages back to the allocator because they are being
1324 * offlined, but treat RESERVE as movable pages so we can get those 1324 * offlined, but treat RESERVE as movable pages so we can get those
1325 * areas back if necessary. Otherwise, we may have to free 1325 * areas back if necessary. Otherwise, we may have to free
1326 * excessively into the page allocator 1326 * excessively into the page allocator
1327 */ 1327 */
1328 if (migratetype >= MIGRATE_PCPTYPES) { 1328 if (migratetype >= MIGRATE_PCPTYPES) {
1329 if (unlikely(migratetype == MIGRATE_ISOLATE)) { 1329 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1330 free_one_page(zone, page, 0, migratetype); 1330 free_one_page(zone, page, 0, migratetype);
1331 goto out; 1331 goto out;
1332 } 1332 }
1333 migratetype = MIGRATE_MOVABLE; 1333 migratetype = MIGRATE_MOVABLE;
1334 } 1334 }
1335 1335
1336 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1336 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1337 if (cold) 1337 if (cold)
1338 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1338 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1339 else 1339 else
1340 list_add(&page->lru, &pcp->lists[migratetype]); 1340 list_add(&page->lru, &pcp->lists[migratetype]);
1341 pcp->count++; 1341 pcp->count++;
1342 if (pcp->count >= pcp->high) { 1342 if (pcp->count >= pcp->high) {
1343 free_pcppages_bulk(zone, pcp->batch, pcp); 1343 free_pcppages_bulk(zone, pcp->batch, pcp);
1344 pcp->count -= pcp->batch; 1344 pcp->count -= pcp->batch;
1345 } 1345 }
1346 1346
1347 out: 1347 out:
1348 local_irq_restore(flags); 1348 local_irq_restore(flags);
1349 } 1349 }
1350 1350
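/*
 * The pcp->high / pcp->batch interplay above is a hysteresis cache: freed
 * pages pile up on a cheap per-cpu list, and only once the list exceeds
 * "high" is a "batch" of them returned to the shared buddy lists. A minimal
 * sketch; the thresholds and flush_to_buddy() are invented.
 */

#include <stdio.h>

#define PCP_HIGH  6
#define PCP_BATCH 2

static int pcp_count;

static void flush_to_buddy(int n)
{
        printf("returning %d pages to the buddy allocator\n", n);
}

static void free_to_pcp(void)
{
        pcp_count++;                            /* page goes on the local list */
        if (pcp_count >= PCP_HIGH) {
                flush_to_buddy(PCP_BATCH);      /* spill a batch, keep the rest */
                pcp_count -= PCP_BATCH;
        }
}

int main(void)
{
        for (int i = 0; i < 10; i++)
                free_to_pcp();
        printf("still cached per-cpu: %d\n", pcp_count);
        return 0;
}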
1351 /* 1351 /*
1352 * Free a list of 0-order pages 1352 * Free a list of 0-order pages
1353 */ 1353 */
1354 void free_hot_cold_page_list(struct list_head *list, int cold) 1354 void free_hot_cold_page_list(struct list_head *list, int cold)
1355 { 1355 {
1356 struct page *page, *next; 1356 struct page *page, *next;
1357 1357
1358 list_for_each_entry_safe(page, next, list, lru) { 1358 list_for_each_entry_safe(page, next, list, lru) {
1359 trace_mm_page_free_batched(page, cold); 1359 trace_mm_page_free_batched(page, cold);
1360 free_hot_cold_page(page, cold); 1360 free_hot_cold_page(page, cold);
1361 } 1361 }
1362 } 1362 }
1363 1363
1364 /* 1364 /*
1365 * split_page takes a non-compound higher-order page, and splits it into 1365 * split_page takes a non-compound higher-order page, and splits it into
1366 * n (1<<order) sub-pages: page[0..n-1] 1366 * n (1<<order) sub-pages: page[0..n-1]
1367 * Each sub-page must be freed individually. 1367 * Each sub-page must be freed individually.
1368 * 1368 *
1369 * Note: this is probably too low level an operation for use in drivers. 1369 * Note: this is probably too low level an operation for use in drivers.
1370 * Please consult with lkml before using this in your driver. 1370 * Please consult with lkml before using this in your driver.
1371 */ 1371 */
1372 void split_page(struct page *page, unsigned int order) 1372 void split_page(struct page *page, unsigned int order)
1373 { 1373 {
1374 int i; 1374 int i;
1375 1375
1376 VM_BUG_ON(PageCompound(page)); 1376 VM_BUG_ON(PageCompound(page));
1377 VM_BUG_ON(!page_count(page)); 1377 VM_BUG_ON(!page_count(page));
1378 1378
1379 #ifdef CONFIG_KMEMCHECK 1379 #ifdef CONFIG_KMEMCHECK
1380 /* 1380 /*
1381 * Split shadow pages too, because free(page[0]) would 1381 * Split shadow pages too, because free(page[0]) would
1382 * otherwise free the whole shadow. 1382 * otherwise free the whole shadow.
1383 */ 1383 */
1384 if (kmemcheck_page_is_tracked(page)) 1384 if (kmemcheck_page_is_tracked(page))
1385 split_page(virt_to_page(page[0].shadow), order); 1385 split_page(virt_to_page(page[0].shadow), order);
1386 #endif 1386 #endif
1387 1387
1388 for (i = 1; i < (1 << order); i++) 1388 for (i = 1; i < (1 << order); i++)
1389 set_page_refcounted(page + i); 1389 set_page_refcounted(page + i);
1390 } 1390 }
1391 1391
1392 static int __isolate_free_page(struct page *page, unsigned int order) 1392 static int __isolate_free_page(struct page *page, unsigned int order)
1393 { 1393 {
1394 unsigned long watermark; 1394 unsigned long watermark;
1395 struct zone *zone; 1395 struct zone *zone;
1396 int mt; 1396 int mt;
1397 1397
1398 BUG_ON(!PageBuddy(page)); 1398 BUG_ON(!PageBuddy(page));
1399 1399
1400 zone = page_zone(page); 1400 zone = page_zone(page);
1401 mt = get_pageblock_migratetype(page); 1401 mt = get_pageblock_migratetype(page);
1402 1402
1403 if (mt != MIGRATE_ISOLATE) { 1403 if (mt != MIGRATE_ISOLATE) {
1404 /* Obey watermarks as if the page was being allocated */ 1404 /* Obey watermarks as if the page was being allocated */
1405 watermark = low_wmark_pages(zone) + (1 << order); 1405 watermark = low_wmark_pages(zone) + (1 << order);
1406 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1406 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1407 return 0; 1407 return 0;
1408 1408
1409 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1409 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1410 } 1410 }
1411 1411
1412 /* Remove page from free list */ 1412 /* Remove page from free list */
1413 list_del(&page->lru); 1413 list_del(&page->lru);
1414 zone->free_area[order].nr_free--; 1414 zone->free_area[order].nr_free--;
1415 rmv_page_order(page); 1415 rmv_page_order(page);
1416 1416
1417 /* Set the pageblock if the isolated page is at least a pageblock */ 1417 /* Set the pageblock if the isolated page is at least a pageblock */
1418 if (order >= pageblock_order - 1) { 1418 if (order >= pageblock_order - 1) {
1419 struct page *endpage = page + (1 << order) - 1; 1419 struct page *endpage = page + (1 << order) - 1;
1420 for (; page < endpage; page += pageblock_nr_pages) { 1420 for (; page < endpage; page += pageblock_nr_pages) {
1421 int mt = get_pageblock_migratetype(page); 1421 int mt = get_pageblock_migratetype(page);
1422 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) 1422 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
1423 set_pageblock_migratetype(page, 1423 set_pageblock_migratetype(page,
1424 MIGRATE_MOVABLE); 1424 MIGRATE_MOVABLE);
1425 } 1425 }
1426 } 1426 }
1427 1427
1428 return 1UL << order; 1428 return 1UL << order;
1429 } 1429 }
1430 1430
1431 /* 1431 /*
1432 * Similar to split_page except the page is already free. As this is only 1432 * Similar to split_page except the page is already free. As this is only
1433 * being used for migration, the migratetype of the block also changes. 1433 * being used for migration, the migratetype of the block also changes.
1434 * As this is called with interrupts disabled, the caller is responsible 1434 * As this is called with interrupts disabled, the caller is responsible
1435 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1435 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1436 * are enabled. 1436 * are enabled.
1437 * 1437 *
1438 * Note: this is probably too low level an operation for use in drivers. 1438 * Note: this is probably too low level an operation for use in drivers.
1439 * Please consult with lkml before using this in your driver. 1439 * Please consult with lkml before using this in your driver.
1440 */ 1440 */
1441 int split_free_page(struct page *page) 1441 int split_free_page(struct page *page)
1442 { 1442 {
1443 unsigned int order; 1443 unsigned int order;
1444 int nr_pages; 1444 int nr_pages;
1445 1445
1446 order = page_order(page); 1446 order = page_order(page);
1447 1447
1448 nr_pages = __isolate_free_page(page, order); 1448 nr_pages = __isolate_free_page(page, order);
1449 if (!nr_pages) 1449 if (!nr_pages)
1450 return 0; 1450 return 0;
1451 1451
1452 /* Split into individual pages */ 1452 /* Split into individual pages */
1453 set_page_refcounted(page); 1453 set_page_refcounted(page);
1454 split_page(page, order); 1454 split_page(page, order);
1455 return nr_pages; 1455 return nr_pages;
1456 } 1456 }
1457 1457
1458 /* 1458 /*
1459 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1459 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1460 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1460 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1461 * or two. 1461 * or two.
1462 */ 1462 */
1463 static inline 1463 static inline
1464 struct page *buffered_rmqueue(struct zone *preferred_zone, 1464 struct page *buffered_rmqueue(struct zone *preferred_zone,
1465 struct zone *zone, int order, gfp_t gfp_flags, 1465 struct zone *zone, int order, gfp_t gfp_flags,
1466 int migratetype) 1466 int migratetype)
1467 { 1467 {
1468 unsigned long flags; 1468 unsigned long flags;
1469 struct page *page; 1469 struct page *page;
1470 int cold = !!(gfp_flags & __GFP_COLD); 1470 int cold = !!(gfp_flags & __GFP_COLD);
1471 1471
1472 again: 1472 again:
1473 if (likely(order == 0)) { 1473 if (likely(order == 0)) {
1474 struct per_cpu_pages *pcp; 1474 struct per_cpu_pages *pcp;
1475 struct list_head *list; 1475 struct list_head *list;
1476 1476
1477 local_irq_save(flags); 1477 local_irq_save(flags);
1478 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1478 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1479 list = &pcp->lists[migratetype]; 1479 list = &pcp->lists[migratetype];
1480 if (list_empty(list)) { 1480 if (list_empty(list)) {
1481 pcp->count += rmqueue_bulk(zone, 0, 1481 pcp->count += rmqueue_bulk(zone, 0,
1482 pcp->batch, list, 1482 pcp->batch, list,
1483 migratetype, cold); 1483 migratetype, cold);
1484 if (unlikely(list_empty(list))) 1484 if (unlikely(list_empty(list)))
1485 goto failed; 1485 goto failed;
1486 } 1486 }
1487 1487
1488 if (cold) 1488 if (cold)
1489 page = list_entry(list->prev, struct page, lru); 1489 page = list_entry(list->prev, struct page, lru);
1490 else 1490 else
1491 page = list_entry(list->next, struct page, lru); 1491 page = list_entry(list->next, struct page, lru);
1492 1492
1493 list_del(&page->lru); 1493 list_del(&page->lru);
1494 pcp->count--; 1494 pcp->count--;
1495 } else { 1495 } else {
1496 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1496 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1497 /* 1497 /*
1498 * __GFP_NOFAIL is not to be used in new code. 1498 * __GFP_NOFAIL is not to be used in new code.
1499 * 1499 *
1500 * All __GFP_NOFAIL callers should be fixed so that they 1500 * All __GFP_NOFAIL callers should be fixed so that they
1501 * properly detect and handle allocation failures. 1501 * properly detect and handle allocation failures.
1502 * 1502 *
1503 * We most definitely don't want callers attempting to 1503 * We most definitely don't want callers attempting to
1504 * allocate greater than order-1 page units with 1504 * allocate greater than order-1 page units with
1505 * __GFP_NOFAIL. 1505 * __GFP_NOFAIL.
1506 */ 1506 */
1507 WARN_ON_ONCE(order > 1); 1507 WARN_ON_ONCE(order > 1);
1508 } 1508 }
1509 spin_lock_irqsave(&zone->lock, flags); 1509 spin_lock_irqsave(&zone->lock, flags);
1510 page = __rmqueue(zone, order, migratetype); 1510 page = __rmqueue(zone, order, migratetype);
1511 spin_unlock(&zone->lock); 1511 spin_unlock(&zone->lock);
1512 if (!page) 1512 if (!page)
1513 goto failed; 1513 goto failed;
1514 __mod_zone_freepage_state(zone, -(1 << order), 1514 __mod_zone_freepage_state(zone, -(1 << order),
1515 get_pageblock_migratetype(page)); 1515 get_pageblock_migratetype(page));
1516 } 1516 }
1517 1517
1518 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1518 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1519 zone_statistics(preferred_zone, zone, gfp_flags); 1519 zone_statistics(preferred_zone, zone, gfp_flags);
1520 local_irq_restore(flags); 1520 local_irq_restore(flags);
1521 1521
1522 VM_BUG_ON(bad_range(zone, page)); 1522 VM_BUG_ON(bad_range(zone, page));
1523 if (prep_new_page(page, order, gfp_flags)) 1523 if (prep_new_page(page, order, gfp_flags))
1524 goto again; 1524 goto again;
1525 return page; 1525 return page;
1526 1526
1527 failed: 1527 failed:
1528 local_irq_restore(flags); 1528 local_irq_restore(flags);
1529 return NULL; 1529 return NULL;
1530 } 1530 }
1531 1531
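/*
 * buffered_rmqueue() is a two-level allocator: order-0 requests come from a
 * per-cpu stash that is refilled in bulk, under the zone lock, only when it
 * runs dry; larger orders go straight to the locked buddy lists. A toy
 * version of the refill-on-empty fast path; BATCH, the stash and
 * refill_from_buddy() are made up for the sketch.
 */

#include <stdio.h>

#define BATCH 4

static int stash[64];
static int stash_count;
static int next_page_id = 100;

static void refill_from_buddy(int batch)
{
        /* rmqueue_bulk() above does this under zone->lock */
        for (int i = 0; i < batch; i++)
                stash[stash_count++] = next_page_id++;
}

static int alloc_order0(void)
{
        if (stash_count == 0)
                refill_from_buddy(BATCH);
        return stash[--stash_count];            /* pop the most recently added */
}

int main(void)
{
        for (int i = 0; i < 6; i++)
                printf("got page %d\n", alloc_order0());
        return 0;
}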
1532 #ifdef CONFIG_FAIL_PAGE_ALLOC 1532 #ifdef CONFIG_FAIL_PAGE_ALLOC
1533 1533
1534 static struct { 1534 static struct {
1535 struct fault_attr attr; 1535 struct fault_attr attr;
1536 1536
1537 u32 ignore_gfp_highmem; 1537 u32 ignore_gfp_highmem;
1538 u32 ignore_gfp_wait; 1538 u32 ignore_gfp_wait;
1539 u32 min_order; 1539 u32 min_order;
1540 } fail_page_alloc = { 1540 } fail_page_alloc = {
1541 .attr = FAULT_ATTR_INITIALIZER, 1541 .attr = FAULT_ATTR_INITIALIZER,
1542 .ignore_gfp_wait = 1, 1542 .ignore_gfp_wait = 1,
1543 .ignore_gfp_highmem = 1, 1543 .ignore_gfp_highmem = 1,
1544 .min_order = 1, 1544 .min_order = 1,
1545 }; 1545 };
1546 1546
1547 static int __init setup_fail_page_alloc(char *str) 1547 static int __init setup_fail_page_alloc(char *str)
1548 { 1548 {
1549 return setup_fault_attr(&fail_page_alloc.attr, str); 1549 return setup_fault_attr(&fail_page_alloc.attr, str);
1550 } 1550 }
1551 __setup("fail_page_alloc=", setup_fail_page_alloc); 1551 __setup("fail_page_alloc=", setup_fail_page_alloc);
1552 1552
1553 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1553 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1554 { 1554 {
1555 if (order < fail_page_alloc.min_order) 1555 if (order < fail_page_alloc.min_order)
1556 return false; 1556 return false;
1557 if (gfp_mask & __GFP_NOFAIL) 1557 if (gfp_mask & __GFP_NOFAIL)
1558 return false; 1558 return false;
1559 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1559 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1560 return false; 1560 return false;
1561 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1561 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1562 return false; 1562 return false;
1563 1563
1564 return should_fail(&fail_page_alloc.attr, 1 << order); 1564 return should_fail(&fail_page_alloc.attr, 1 << order);
1565 } 1565 }
1566 1566
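/*
 * The hook above is a filtered probabilistic "should this call fail?" gate.
 * A self-contained approximation; the 1-in-N interval policy stands in for
 * the kernel's fault_attr machinery and is not its real behaviour.
 */

#include <stdbool.h>
#include <stdio.h>

struct fail_policy {
        unsigned int interval;          /* fail one call in every 'interval' */
        unsigned int min_order;         /* never fail small allocations */
        bool ignore_nofail;             /* mirrors the __GFP_NOFAIL exemption */
};

static bool should_fail(const struct fail_policy *p, unsigned int order,
                        bool nofail, unsigned long nth_call)
{
        if (order < p->min_order)
                return false;
        if (nofail && p->ignore_nofail)
                return false;
        return p->interval && (nth_call % p->interval) == 0;
}

int main(void)
{
        struct fail_policy p = { .interval = 3, .min_order = 1, .ignore_nofail = true };

        for (unsigned long n = 1; n <= 6; n++)
                printf("call %lu, order 2: %s\n", n,
                       should_fail(&p, 2, false, n) ? "inject failure" : "ok");
        return 0;
}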
1567 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1567 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1568 1568
1569 static int __init fail_page_alloc_debugfs(void) 1569 static int __init fail_page_alloc_debugfs(void)
1570 { 1570 {
1571 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1571 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1572 struct dentry *dir; 1572 struct dentry *dir;
1573 1573
1574 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1574 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1575 &fail_page_alloc.attr); 1575 &fail_page_alloc.attr);
1576 if (IS_ERR(dir)) 1576 if (IS_ERR(dir))
1577 return PTR_ERR(dir); 1577 return PTR_ERR(dir);
1578 1578
1579 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1579 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1580 &fail_page_alloc.ignore_gfp_wait)) 1580 &fail_page_alloc.ignore_gfp_wait))
1581 goto fail; 1581 goto fail;
1582 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1582 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1583 &fail_page_alloc.ignore_gfp_highmem)) 1583 &fail_page_alloc.ignore_gfp_highmem))
1584 goto fail; 1584 goto fail;
1585 if (!debugfs_create_u32("min-order", mode, dir, 1585 if (!debugfs_create_u32("min-order", mode, dir,
1586 &fail_page_alloc.min_order)) 1586 &fail_page_alloc.min_order))
1587 goto fail; 1587 goto fail;
1588 1588
1589 return 0; 1589 return 0;
1590 fail: 1590 fail:
1591 debugfs_remove_recursive(dir); 1591 debugfs_remove_recursive(dir);
1592 1592
1593 return -ENOMEM; 1593 return -ENOMEM;
1594 } 1594 }
1595 1595
1596 late_initcall(fail_page_alloc_debugfs); 1596 late_initcall(fail_page_alloc_debugfs);
1597 1597
1598 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1598 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1599 1599
1600 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1600 #else /* CONFIG_FAIL_PAGE_ALLOC */
1601 1601
1602 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1602 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1603 { 1603 {
1604 return false; 1604 return false;
1605 } 1605 }
1606 1606
1607 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1607 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1608 1608
1609 /* 1609 /*
1610 * Return true if free pages are above 'mark'. This takes into account the order 1610 * Return true if free pages are above 'mark'. This takes into account the order
1611 * of the allocation. 1611 * of the allocation.
1612 */ 1612 */
1613 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1613 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1614 int classzone_idx, int alloc_flags, long free_pages) 1614 int classzone_idx, int alloc_flags, long free_pages)
1615 { 1615 {
1616 /* free_pages may go negative - that's OK */ 1616 /* free_pages may go negative - that's OK */
1617 long min = mark; 1617 long min = mark;
1618 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1618 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1619 int o; 1619 int o;
1620 1620
1621 free_pages -= (1 << order) - 1; 1621 free_pages -= (1 << order) - 1;
1622 if (alloc_flags & ALLOC_HIGH) 1622 if (alloc_flags & ALLOC_HIGH)
1623 min -= min / 2; 1623 min -= min / 2;
1624 if (alloc_flags & ALLOC_HARDER) 1624 if (alloc_flags & ALLOC_HARDER)
1625 min -= min / 4; 1625 min -= min / 4;
1626 #ifdef CONFIG_CMA 1626 #ifdef CONFIG_CMA
1627 /* If allocation can't use CMA areas don't use free CMA pages */ 1627 /* If allocation can't use CMA areas don't use free CMA pages */
1628 if (!(alloc_flags & ALLOC_CMA)) 1628 if (!(alloc_flags & ALLOC_CMA))
1629 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); 1629 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
1630 #endif 1630 #endif
1631 if (free_pages <= min + lowmem_reserve) 1631 if (free_pages <= min + lowmem_reserve)
1632 return false; 1632 return false;
1633 for (o = 0; o < order; o++) { 1633 for (o = 0; o < order; o++) {
1634 /* At the next order, this order's pages become unavailable */ 1634 /* At the next order, this order's pages become unavailable */
1635 free_pages -= z->free_area[o].nr_free << o; 1635 free_pages -= z->free_area[o].nr_free << o;
1636 1636
1637 /* Require fewer higher order pages to be free */ 1637 /* Require fewer higher order pages to be free */
1638 min >>= 1; 1638 min >>= 1;
1639 1639
1640 if (free_pages <= min) 1640 if (free_pages <= min)
1641 return false; 1641 return false;
1642 } 1642 }
1643 return true; 1643 return true;
1644 } 1644 }
1645 1645
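/*
 * The per-order loop above is the subtle part of the watermark test: pages
 * at orders below the request cannot satisfy it, so they are subtracted
 * from the free count while the required margin is halved at each step. A
 * worked, standalone version with invented numbers:
 */

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

static bool watermark_ok(long free_pages, long mark, long lowmem_reserve,
                         unsigned int order, const long nr_free[MAX_ORDER])
{
        long min = mark;

        free_pages -= (1L << order) - 1;
        if (free_pages <= min + lowmem_reserve)
                return false;

        for (unsigned int o = 0; o < order; o++) {
                free_pages -= nr_free[o] << o;  /* too small for this request */
                min >>= 1;                      /* but demand fewer large pages */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        long nr_free[MAX_ORDER] = { 800, 100, 20, 4 };  /* free blocks per order */
        long total = 800 * 1 + 100 * 2 + 20 * 4 + 4 * 8;        /* 1112 pages */

        printf("order-3 request above the mark? %d\n",
               watermark_ok(total, 128, 0, 3, nr_free));
        return 0;
}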
1646 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1646 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1647 int classzone_idx, int alloc_flags) 1647 int classzone_idx, int alloc_flags)
1648 { 1648 {
1649 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1649 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1650 zone_page_state(z, NR_FREE_PAGES)); 1650 zone_page_state(z, NR_FREE_PAGES));
1651 } 1651 }
1652 1652
1653 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1653 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1654 int classzone_idx, int alloc_flags) 1654 int classzone_idx, int alloc_flags)
1655 { 1655 {
1656 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1656 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1657 1657
1658 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1658 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1659 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1659 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1660 1660
1661 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1661 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1662 free_pages); 1662 free_pages);
1663 } 1663 }
1664 1664
1665 #ifdef CONFIG_NUMA 1665 #ifdef CONFIG_NUMA
1666 /* 1666 /*
1667 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1667 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1668 * skip over zones that are not allowed by the cpuset, or that have 1668 * skip over zones that are not allowed by the cpuset, or that have
1669 * been recently (in last second) found to be nearly full. See further 1669 * been recently (in last second) found to be nearly full. See further
1670 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1670 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1671 * that have to skip over a lot of full or unallowed zones. 1671 * that have to skip over a lot of full or unallowed zones.
1672 * 1672 *
1673 * If the zonelist cache is present in the passed in zonelist, then 1673 * If the zonelist cache is present in the passed in zonelist, then
1674 * returns a pointer to the allowed node mask (either the current 1674 * returns a pointer to the allowed node mask (either the current
1675 * task's mems_allowed, or node_states[N_MEMORY].) 1675 * task's mems_allowed, or node_states[N_MEMORY].)
1676 * 1676 *
1677 * If the zonelist cache is not available for this zonelist, does 1677 * If the zonelist cache is not available for this zonelist, does
1678 * nothing and returns NULL. 1678 * nothing and returns NULL.
1679 * 1679 *
1680 * If the fullzones BITMAP in the zonelist cache is stale (more than 1680 * If the fullzones BITMAP in the zonelist cache is stale (more than
1681 * a second since last zap'd) then we zap it out (clear its bits.) 1681 * a second since last zap'd) then we zap it out (clear its bits.)
1682 * 1682 *
1683 * We hold off even calling zlc_setup, until after we've checked the 1683 * We hold off even calling zlc_setup, until after we've checked the
1684 * first zone in the zonelist, on the theory that most allocations will 1684 * first zone in the zonelist, on the theory that most allocations will
1685 * be satisfied from that first zone, so best to examine that zone as 1685 * be satisfied from that first zone, so best to examine that zone as
1686 * quickly as we can. 1686 * quickly as we can.
1687 */ 1687 */
1688 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1688 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1689 { 1689 {
1690 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1690 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1691 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1691 nodemask_t *allowednodes; /* zonelist_cache approximation */
1692 1692
1693 zlc = zonelist->zlcache_ptr; 1693 zlc = zonelist->zlcache_ptr;
1694 if (!zlc) 1694 if (!zlc)
1695 return NULL; 1695 return NULL;
1696 1696
1697 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1697 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1698 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1698 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1699 zlc->last_full_zap = jiffies; 1699 zlc->last_full_zap = jiffies;
1700 } 1700 }
1701 1701
1702 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1702 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1703 &cpuset_current_mems_allowed : 1703 &cpuset_current_mems_allowed :
1704 &node_states[N_MEMORY]; 1704 &node_states[N_MEMORY];
1705 return allowednodes; 1705 return allowednodes;
1706 } 1706 }
1707 1707
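/*
 * The zonelist cache is a small negative cache: a per-zone "looked full
 * recently" bit that is cleared wholesale about once a second. The sketch
 * below reproduces that expire-and-skip pattern with an explicit clock
 * value instead of jiffies; all names and numbers are illustrative.
 */

#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 4

static bool fullzones[NR_ZONES];
static long last_full_zap;

static void maybe_zap(long now)
{
        if (now - last_full_zap >= 1) {         /* stale after ~one tick */
                for (int i = 0; i < NR_ZONES; i++)
                        fullzones[i] = false;
                last_full_zap = now;
        }
}

static void mark_zone_full(int z)
{
        fullzones[z] = true;
}

static bool zone_worth_trying(int z, long now)
{
        maybe_zap(now);
        return !fullzones[z];
}

int main(void)
{
        last_full_zap = 100;
        mark_zone_full(1);
        printf("t=100, zone 1 worth trying: %d\n", zone_worth_trying(1, 100)); /* 0 */
        printf("t=102, zone 1 worth trying: %d\n", zone_worth_trying(1, 102)); /* 1 */
        return 0;
}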
1708 /* 1708 /*
1709 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1709 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1710 * if it is worth looking at further for free memory: 1710 * if it is worth looking at further for free memory:
1711 * 1) Check that the zone isn't thought to be full (doesn't have its 1711 * 1) Check that the zone isn't thought to be full (doesn't have its
1712 * bit set in the zonelist_cache fullzones BITMAP). 1712 * bit set in the zonelist_cache fullzones BITMAP).
1713 * 2) Check that the zone's node (obtained from the zonelist_cache 1713 * 2) Check that the zone's node (obtained from the zonelist_cache
1714 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1714 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1715 * Return true (non-zero) if zone is worth looking at further, or 1715 * Return true (non-zero) if zone is worth looking at further, or
1716 * else return false (zero) if it is not. 1716 * else return false (zero) if it is not.
1717 * 1717 *
1718 * This check -ignores- the distinction between various watermarks, 1718 * This check -ignores- the distinction between various watermarks,
1719 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1719 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1720 * found to be full for any variation of these watermarks, it will 1720 * found to be full for any variation of these watermarks, it will
1721 * be considered full for up to one second by all requests, unless 1721 * be considered full for up to one second by all requests, unless
1722 * we are so low on memory on all allowed nodes that we are forced 1722 * we are so low on memory on all allowed nodes that we are forced
1723 * into the second scan of the zonelist. 1723 * into the second scan of the zonelist.
1724 * 1724 *
1725 * In the second scan we ignore this zonelist cache and exactly 1725 * In the second scan we ignore this zonelist cache and exactly
1726 * apply the watermarks to all zones, even if it is slower to do so. 1726 * apply the watermarks to all zones, even if it is slower to do so.
1727 * We are low on memory in the second scan, and should leave no stone 1727 * We are low on memory in the second scan, and should leave no stone
1728 * unturned looking for a free page. 1728 * unturned looking for a free page.
1729 */ 1729 */
1730 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1730 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1731 nodemask_t *allowednodes) 1731 nodemask_t *allowednodes)
1732 { 1732 {
1733 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1733 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1734 int i; /* index of *z in zonelist zones */ 1734 int i; /* index of *z in zonelist zones */
1735 int n; /* node that zone *z is on */ 1735 int n; /* node that zone *z is on */
1736 1736
1737 zlc = zonelist->zlcache_ptr; 1737 zlc = zonelist->zlcache_ptr;
1738 if (!zlc) 1738 if (!zlc)
1739 return 1; 1739 return 1;
1740 1740
1741 i = z - zonelist->_zonerefs; 1741 i = z - zonelist->_zonerefs;
1742 n = zlc->z_to_n[i]; 1742 n = zlc->z_to_n[i];
1743 1743
1744 /* This zone is worth trying if it is allowed but not full */ 1744 /* This zone is worth trying if it is allowed but not full */
1745 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1745 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1746 } 1746 }
1747 1747
1748 /* 1748 /*
1749 * Given 'z' scanning a zonelist, set the corresponding bit in 1749 * Given 'z' scanning a zonelist, set the corresponding bit in
1750 * zlc->fullzones, so that subsequent attempts to allocate a page 1750 * zlc->fullzones, so that subsequent attempts to allocate a page
1751 * from that zone don't waste time re-examining it. 1751 * from that zone don't waste time re-examining it.
1752 */ 1752 */
1753 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1753 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1754 { 1754 {
1755 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1755 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1756 int i; /* index of *z in zonelist zones */ 1756 int i; /* index of *z in zonelist zones */
1757 1757
1758 zlc = zonelist->zlcache_ptr; 1758 zlc = zonelist->zlcache_ptr;
1759 if (!zlc) 1759 if (!zlc)
1760 return; 1760 return;
1761 1761
1762 i = z - zonelist->_zonerefs; 1762 i = z - zonelist->_zonerefs;
1763 1763
1764 set_bit(i, zlc->fullzones); 1764 set_bit(i, zlc->fullzones);
1765 } 1765 }
1766 1766
1767 /* 1767 /*
1768 * clear all zones full, called after direct reclaim makes progress so that 1768 * clear all zones full, called after direct reclaim makes progress so that
1769 * a zone that was recently full is not skipped over for up to a second 1769 * a zone that was recently full is not skipped over for up to a second
1770 */ 1770 */
1771 static void zlc_clear_zones_full(struct zonelist *zonelist) 1771 static void zlc_clear_zones_full(struct zonelist *zonelist)
1772 { 1772 {
1773 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1773 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1774 1774
1775 zlc = zonelist->zlcache_ptr; 1775 zlc = zonelist->zlcache_ptr;
1776 if (!zlc) 1776 if (!zlc)
1777 return; 1777 return;
1778 1778
1779 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1779 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1780 } 1780 }
1781 1781
1782 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1782 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1783 { 1783 {
1784 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1784 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1785 } 1785 }
1786 1786
1787 static void __paginginit init_zone_allows_reclaim(int nid) 1787 static void __paginginit init_zone_allows_reclaim(int nid)
1788 { 1788 {
1789 int i; 1789 int i;
1790 1790
1791 for_each_online_node(i) 1791 for_each_online_node(i)
1792 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1792 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1793 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1793 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1794 else 1794 else
1795 zone_reclaim_mode = 1; 1795 zone_reclaim_mode = 1;
1796 } 1796 }
1797 1797
1798 #else /* CONFIG_NUMA */ 1798 #else /* CONFIG_NUMA */
1799 1799
1800 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1800 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1801 { 1801 {
1802 return NULL; 1802 return NULL;
1803 } 1803 }
1804 1804
1805 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1805 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1806 nodemask_t *allowednodes) 1806 nodemask_t *allowednodes)
1807 { 1807 {
1808 return 1; 1808 return 1;
1809 } 1809 }
1810 1810
1811 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1811 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1812 { 1812 {
1813 } 1813 }
1814 1814
1815 static void zlc_clear_zones_full(struct zonelist *zonelist) 1815 static void zlc_clear_zones_full(struct zonelist *zonelist)
1816 { 1816 {
1817 } 1817 }
1818 1818
1819 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1819 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1820 { 1820 {
1821 return true; 1821 return true;
1822 } 1822 }
1823 1823
1824 static inline void init_zone_allows_reclaim(int nid) 1824 static inline void init_zone_allows_reclaim(int nid)
1825 { 1825 {
1826 } 1826 }
1827 #endif /* CONFIG_NUMA */ 1827 #endif /* CONFIG_NUMA */
1828 1828
1829 /* 1829 /*
1830 * get_page_from_freelist goes through the zonelist trying to allocate 1830 * get_page_from_freelist goes through the zonelist trying to allocate
1831 * a page. 1831 * a page.
1832 */ 1832 */
1833 static struct page * 1833 static struct page *
1834 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1834 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1835 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1835 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1836 struct zone *preferred_zone, int migratetype) 1836 struct zone *preferred_zone, int migratetype)
1837 { 1837 {
1838 struct zoneref *z; 1838 struct zoneref *z;
1839 struct page *page = NULL; 1839 struct page *page = NULL;
1840 int classzone_idx; 1840 int classzone_idx;
1841 struct zone *zone; 1841 struct zone *zone;
1842 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1842 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1843 int zlc_active = 0; /* set if using zonelist_cache */ 1843 int zlc_active = 0; /* set if using zonelist_cache */
1844 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1844 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1845 1845
1846 classzone_idx = zone_idx(preferred_zone); 1846 classzone_idx = zone_idx(preferred_zone);
1847 zonelist_scan: 1847 zonelist_scan:
1848 /* 1848 /*
1849 * Scan zonelist, looking for a zone with enough free. 1849 * Scan zonelist, looking for a zone with enough free.
1850 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1850 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1851 */ 1851 */
1852 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1852 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1853 high_zoneidx, nodemask) { 1853 high_zoneidx, nodemask) {
1854 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1854 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1855 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1855 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1856 continue; 1856 continue;
1857 if ((alloc_flags & ALLOC_CPUSET) && 1857 if ((alloc_flags & ALLOC_CPUSET) &&
1858 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1858 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1859 continue; 1859 continue;
1860 /* 1860 /*
1861 * When allocating a page cache page for writing, we 1861 * When allocating a page cache page for writing, we
1862 * want to get it from a zone that is within its dirty 1862 * want to get it from a zone that is within its dirty
1863 * limit, such that no single zone holds more than its 1863 * limit, such that no single zone holds more than its
1864 * proportional share of globally allowed dirty pages. 1864 * proportional share of globally allowed dirty pages.
1865 * The dirty limits take into account the zone's 1865 * The dirty limits take into account the zone's
1866 * lowmem reserves and high watermark so that kswapd 1866 * lowmem reserves and high watermark so that kswapd
1867 * should be able to balance it without having to 1867 * should be able to balance it without having to
1868 * write pages from its LRU list. 1868 * write pages from its LRU list.
1869 * 1869 *
1870 * This may look like it could increase pressure on 1870 * This may look like it could increase pressure on
1871 * lower zones by failing allocations in higher zones 1871 * lower zones by failing allocations in higher zones
1872 * before they are full. But the pages that do spill 1872 * before they are full. But the pages that do spill
1873 * over are limited as the lower zones are protected 1873 * over are limited as the lower zones are protected
1874 * by this very same mechanism. It should not become 1874 * by this very same mechanism. It should not become
1875 * a practical burden to them. 1875 * a practical burden to them.
1876 * 1876 *
1877 * XXX: For now, allow allocations to potentially 1877 * XXX: For now, allow allocations to potentially
1878 * exceed the per-zone dirty limit in the slowpath 1878 * exceed the per-zone dirty limit in the slowpath
1879 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1879 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1880 * which is important when on a NUMA setup the allowed 1880 * which is important when on a NUMA setup the allowed
1881 * zones are together not big enough to reach the 1881 * zones are together not big enough to reach the
1882 * global limit. The proper fix for these situations 1882 * global limit. The proper fix for these situations
1883 * will require awareness of zones in the 1883 * will require awareness of zones in the
1884 * dirty-throttling and the flusher threads. 1884 * dirty-throttling and the flusher threads.
1885 */ 1885 */
1886 if ((alloc_flags & ALLOC_WMARK_LOW) && 1886 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1887 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1887 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1888 goto this_zone_full; 1888 goto this_zone_full;
1889 1889
1890 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1890 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1891 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1891 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1892 unsigned long mark; 1892 unsigned long mark;
1893 int ret; 1893 int ret;
1894 1894
1895 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1895 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1896 if (zone_watermark_ok(zone, order, mark, 1896 if (zone_watermark_ok(zone, order, mark,
1897 classzone_idx, alloc_flags)) 1897 classzone_idx, alloc_flags))
1898 goto try_this_zone; 1898 goto try_this_zone;
1899 1899
1900 if (IS_ENABLED(CONFIG_NUMA) && 1900 if (IS_ENABLED(CONFIG_NUMA) &&
1901 !did_zlc_setup && nr_online_nodes > 1) { 1901 !did_zlc_setup && nr_online_nodes > 1) {
1902 /* 1902 /*
1903 * we do zlc_setup if there are multiple nodes 1903 * we do zlc_setup if there are multiple nodes
1904 * and before considering the first zone allowed 1904 * and before considering the first zone allowed
1905 * by the cpuset. 1905 * by the cpuset.
1906 */ 1906 */
1907 allowednodes = zlc_setup(zonelist, alloc_flags); 1907 allowednodes = zlc_setup(zonelist, alloc_flags);
1908 zlc_active = 1; 1908 zlc_active = 1;
1909 did_zlc_setup = 1; 1909 did_zlc_setup = 1;
1910 } 1910 }
1911 1911
1912 if (zone_reclaim_mode == 0 || 1912 if (zone_reclaim_mode == 0 ||
1913 !zone_allows_reclaim(preferred_zone, zone)) 1913 !zone_allows_reclaim(preferred_zone, zone))
1914 goto this_zone_full; 1914 goto this_zone_full;
1915 1915
1916 /* 1916 /*
1917 * As we may have just activated ZLC, check if the first 1917 * As we may have just activated ZLC, check if the first
1918 * eligible zone has failed zone_reclaim recently. 1918 * eligible zone has failed zone_reclaim recently.
1919 */ 1919 */
1920 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1920 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1921 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1921 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1922 continue; 1922 continue;
1923 1923
1924 ret = zone_reclaim(zone, gfp_mask, order); 1924 ret = zone_reclaim(zone, gfp_mask, order);
1925 switch (ret) { 1925 switch (ret) {
1926 case ZONE_RECLAIM_NOSCAN: 1926 case ZONE_RECLAIM_NOSCAN:
1927 /* did not scan */ 1927 /* did not scan */
1928 continue; 1928 continue;
1929 case ZONE_RECLAIM_FULL: 1929 case ZONE_RECLAIM_FULL:
1930 /* scanned but unreclaimable */ 1930 /* scanned but unreclaimable */
1931 continue; 1931 continue;
1932 default: 1932 default:
1933 /* did we reclaim enough */ 1933 /* did we reclaim enough */
1934 if (!zone_watermark_ok(zone, order, mark, 1934 if (!zone_watermark_ok(zone, order, mark,
1935 classzone_idx, alloc_flags)) 1935 classzone_idx, alloc_flags))
1936 goto this_zone_full; 1936 goto this_zone_full;
1937 } 1937 }
1938 } 1938 }
1939 1939
1940 try_this_zone: 1940 try_this_zone:
1941 page = buffered_rmqueue(preferred_zone, zone, order, 1941 page = buffered_rmqueue(preferred_zone, zone, order,
1942 gfp_mask, migratetype); 1942 gfp_mask, migratetype);
1943 if (page) 1943 if (page)
1944 break; 1944 break;
1945 this_zone_full: 1945 this_zone_full:
1946 if (IS_ENABLED(CONFIG_NUMA)) 1946 if (IS_ENABLED(CONFIG_NUMA))
1947 zlc_mark_zone_full(zonelist, z); 1947 zlc_mark_zone_full(zonelist, z);
1948 } 1948 }
1949 1949
1950 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 1950 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
1951 /* Disable zlc cache for second zonelist scan */ 1951 /* Disable zlc cache for second zonelist scan */
1952 zlc_active = 0; 1952 zlc_active = 0;
1953 goto zonelist_scan; 1953 goto zonelist_scan;
1954 } 1954 }
1955 1955
1956 if (page) 1956 if (page)
1957 /* 1957 /*
1958 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 1958 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
1959 * necessary to allocate the page. The expectation is 1959 * necessary to allocate the page. The expectation is
1960 * that the caller is taking steps that will free more 1960 * that the caller is taking steps that will free more
1961 * memory. The caller should avoid the page being used 1961 * memory. The caller should avoid the page being used
1962 * for !PFMEMALLOC purposes. 1962 * for !PFMEMALLOC purposes.
1963 */ 1963 */
1964 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 1964 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
1965 1965
1966 return page; 1966 return page;
1967 } 1967 }
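The failure path at the bottom of get_page_from_freelist() is easy to miss on a first read: if the cache-assisted pass found nothing while zlc_active was set, the zonelist cache is dropped and the scan is repeated. The userspace sketch below, with an invented struct mock_zone and scan_zonelist() standing in for the real zone/zoneref machinery, shows only that two-pass shape and nothing else.

	#include <stdbool.h>
	#include <stddef.h>

	/* Stand-in for a zone plus its zonelist-cache ("zlc") hint. */
	struct mock_zone {
		bool cached_full;	/* zlc remembered this zone as full */
		long free_pages;
	};

	/* Return the first zone that can satisfy 'want' pages, or NULL. */
	static struct mock_zone *scan_zonelist(struct mock_zone *zones, int nr, long want)
	{
		bool zlc_active = true;

	retry:
		for (int i = 0; i < nr; i++) {
			if (zlc_active && zones[i].cached_full)
				continue;	/* first pass trusts the cache, skips "full" zones */
			if (zones[i].free_pages >= want)
				return &zones[i];
		}
		if (zlc_active) {
			zlc_active = false;	/* cache may be stale: rescan without it */
			goto retry;
		}
		return NULL;
	}

	int main(void)
	{
		struct mock_zone zones[2] = {
			{ .cached_full = true,  .free_pages = 512 },	/* stale "full" hint */
			{ .cached_full = false, .free_pages = 0 },
		};

		/* The first pass skips zones[0] and finds nothing; the retry
		 * without the cache succeeds on zones[0]. */
		return scan_zonelist(zones, 2, 128) ? 0 : 1;
	}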
1968 1968
1969 /* 1969 /*
1970 * Large machines with many possible nodes should not always dump per-node 1970 * Large machines with many possible nodes should not always dump per-node
1971 * meminfo in irq context. 1971 * meminfo in irq context.
1972 */ 1972 */
1973 static inline bool should_suppress_show_mem(void) 1973 static inline bool should_suppress_show_mem(void)
1974 { 1974 {
1975 bool ret = false; 1975 bool ret = false;
1976 1976
1977 #if NODES_SHIFT > 8 1977 #if NODES_SHIFT > 8
1978 ret = in_interrupt(); 1978 ret = in_interrupt();
1979 #endif 1979 #endif
1980 return ret; 1980 return ret;
1981 } 1981 }
1982 1982
1983 static DEFINE_RATELIMIT_STATE(nopage_rs, 1983 static DEFINE_RATELIMIT_STATE(nopage_rs,
1984 DEFAULT_RATELIMIT_INTERVAL, 1984 DEFAULT_RATELIMIT_INTERVAL,
1985 DEFAULT_RATELIMIT_BURST); 1985 DEFAULT_RATELIMIT_BURST);
1986 1986
1987 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 1987 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1988 { 1988 {
1989 unsigned int filter = SHOW_MEM_FILTER_NODES; 1989 unsigned int filter = SHOW_MEM_FILTER_NODES;
1990 1990
1991 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 1991 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
1992 debug_guardpage_minorder() > 0) 1992 debug_guardpage_minorder() > 0)
1993 return; 1993 return;
1994 1994
1995 /* 1995 /*
1996 * This documents exceptions given to allocations in certain 1996 * This documents exceptions given to allocations in certain
1997 * contexts that are allowed to allocate outside current's set 1997 * contexts that are allowed to allocate outside current's set
1998 * of allowed nodes. 1998 * of allowed nodes.
1999 */ 1999 */
2000 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2000 if (!(gfp_mask & __GFP_NOMEMALLOC))
2001 if (test_thread_flag(TIF_MEMDIE) || 2001 if (test_thread_flag(TIF_MEMDIE) ||
2002 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2002 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2003 filter &= ~SHOW_MEM_FILTER_NODES; 2003 filter &= ~SHOW_MEM_FILTER_NODES;
2004 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2004 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2005 filter &= ~SHOW_MEM_FILTER_NODES; 2005 filter &= ~SHOW_MEM_FILTER_NODES;
2006 2006
2007 if (fmt) { 2007 if (fmt) {
2008 struct va_format vaf; 2008 struct va_format vaf;
2009 va_list args; 2009 va_list args;
2010 2010
2011 va_start(args, fmt); 2011 va_start(args, fmt);
2012 2012
2013 vaf.fmt = fmt; 2013 vaf.fmt = fmt;
2014 vaf.va = &args; 2014 vaf.va = &args;
2015 2015
2016 pr_warn("%pV", &vaf); 2016 pr_warn("%pV", &vaf);
2017 2017
2018 va_end(args); 2018 va_end(args);
2019 } 2019 }
2020 2020
2021 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2021 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2022 current->comm, order, gfp_mask); 2022 current->comm, order, gfp_mask);
2023 2023
2024 dump_stack(); 2024 dump_stack();
2025 if (!should_suppress_show_mem()) 2025 if (!should_suppress_show_mem())
2026 show_mem(filter); 2026 show_mem(filter);
2027 } 2027 }
2028 2028
2029 static inline int 2029 static inline int
2030 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2030 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2031 unsigned long did_some_progress, 2031 unsigned long did_some_progress,
2032 unsigned long pages_reclaimed) 2032 unsigned long pages_reclaimed)
2033 { 2033 {
2034 /* Do not loop if specifically requested */ 2034 /* Do not loop if specifically requested */
2035 if (gfp_mask & __GFP_NORETRY) 2035 if (gfp_mask & __GFP_NORETRY)
2036 return 0; 2036 return 0;
2037 2037
2038 /* Always retry if specifically requested */ 2038 /* Always retry if specifically requested */
2039 if (gfp_mask & __GFP_NOFAIL) 2039 if (gfp_mask & __GFP_NOFAIL)
2040 return 1; 2040 return 1;
2041 2041
2042 /* 2042 /*
2043 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2043 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2044 * making forward progress without invoking OOM. Suspend also disables 2044 * making forward progress without invoking OOM. Suspend also disables
2045 * storage devices so kswapd will not help. Bail if we are suspending. 2045 * storage devices so kswapd will not help. Bail if we are suspending.
2046 */ 2046 */
2047 if (!did_some_progress && pm_suspended_storage()) 2047 if (!did_some_progress && pm_suspended_storage())
2048 return 0; 2048 return 0;
2049 2049
2050 /* 2050 /*
2051 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2051 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2052 * means __GFP_NOFAIL, but that may not be true in other 2052 * means __GFP_NOFAIL, but that may not be true in other
2053 * implementations. 2053 * implementations.
2054 */ 2054 */
2055 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2055 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2056 return 1; 2056 return 1;
2057 2057
2058 /* 2058 /*
2059 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2059 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2060 * specified, then we retry until we no longer reclaim any pages 2060 * specified, then we retry until we no longer reclaim any pages
2061 * (above), or we've reclaimed an order of pages at least as 2061 * (above), or we've reclaimed an order of pages at least as
2062 * large as the allocation's order. In both cases, if the 2062 * large as the allocation's order. In both cases, if the
2063 * allocation still fails, we stop retrying. 2063 * allocation still fails, we stop retrying.
2064 */ 2064 */
2065 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2065 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2066 return 1; 2066 return 1;
2067 2067
2068 return 0; 2068 return 0;
2069 } 2069 }
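As a worked example of the final check in should_alloc_retry(): a costly request (order above PAGE_ALLOC_COSTLY_ORDER, typically 3) that carries __GFP_REPEAT keeps retrying until the cumulative pages_reclaimed reaches 1 << order, or reclaim stops making progress. A trivial standalone program makes those thresholds concrete:

	#include <stdio.h>

	/* Print the cumulative-reclaim threshold that ends __GFP_REPEAT retries. */
	int main(void)
	{
		for (unsigned int order = 4; order <= 6; order++)
			printf("order %u: keep retrying until >= %lu pages reclaimed\n",
			       order, 1UL << order);
		return 0;
	}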
2070 2070
2071 static inline struct page * 2071 static inline struct page *
2072 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2072 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2073 struct zonelist *zonelist, enum zone_type high_zoneidx, 2073 struct zonelist *zonelist, enum zone_type high_zoneidx,
2074 nodemask_t *nodemask, struct zone *preferred_zone, 2074 nodemask_t *nodemask, struct zone *preferred_zone,
2075 int migratetype) 2075 int migratetype)
2076 { 2076 {
2077 struct page *page; 2077 struct page *page;
2078 2078
2079 /* Acquire the OOM killer lock for the zones in zonelist */ 2079 /* Acquire the OOM killer lock for the zones in zonelist */
2080 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2080 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2081 schedule_timeout_uninterruptible(1); 2081 schedule_timeout_uninterruptible(1);
2082 return NULL; 2082 return NULL;
2083 } 2083 }
2084 2084
2085 /* 2085 /*
2086 * Go through the zonelist yet one more time, keeping a very high watermark 2086 * Go through the zonelist yet one more time, keeping a very high watermark
2087 * here; this is only to catch a parallel OOM kill, and we must fail if 2087 * here; this is only to catch a parallel OOM kill, and we must fail if

2088 * we're still under heavy pressure. 2088 * we're still under heavy pressure.
2089 */ 2089 */
2090 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2090 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2091 order, zonelist, high_zoneidx, 2091 order, zonelist, high_zoneidx,
2092 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2092 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2093 preferred_zone, migratetype); 2093 preferred_zone, migratetype);
2094 if (page) 2094 if (page)
2095 goto out; 2095 goto out;
2096 2096
2097 if (!(gfp_mask & __GFP_NOFAIL)) { 2097 if (!(gfp_mask & __GFP_NOFAIL)) {
2098 /* The OOM killer will not help higher order allocs */ 2098 /* The OOM killer will not help higher order allocs */
2099 if (order > PAGE_ALLOC_COSTLY_ORDER) 2099 if (order > PAGE_ALLOC_COSTLY_ORDER)
2100 goto out; 2100 goto out;
2101 /* The OOM killer does not needlessly kill tasks for lowmem */ 2101 /* The OOM killer does not needlessly kill tasks for lowmem */
2102 if (high_zoneidx < ZONE_NORMAL) 2102 if (high_zoneidx < ZONE_NORMAL)
2103 goto out; 2103 goto out;
2104 /* 2104 /*
2105 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2105 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2106 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2106 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2107 * The caller should handle page allocation failure by itself if 2107 * The caller should handle page allocation failure by itself if
2108 * it specifies __GFP_THISNODE. 2108 * it specifies __GFP_THISNODE.
2109 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2109 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2110 */ 2110 */
2111 if (gfp_mask & __GFP_THISNODE) 2111 if (gfp_mask & __GFP_THISNODE)
2112 goto out; 2112 goto out;
2113 } 2113 }
2114 /* Exhausted what can be done so it's blamo time */ 2114 /* Exhausted what can be done so it's blamo time */
2115 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2115 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2116 2116
2117 out: 2117 out:
2118 clear_zonelist_oom(zonelist, gfp_mask); 2118 clear_zonelist_oom(zonelist, gfp_mask);
2119 return page; 2119 return page;
2120 } 2120 }
2121 2121
2122 #ifdef CONFIG_COMPACTION 2122 #ifdef CONFIG_COMPACTION
2123 /* Try memory compaction for high-order allocations before reclaim */ 2123 /* Try memory compaction for high-order allocations before reclaim */
2124 static struct page * 2124 static struct page *
2125 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2125 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2126 struct zonelist *zonelist, enum zone_type high_zoneidx, 2126 struct zonelist *zonelist, enum zone_type high_zoneidx,
2127 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2127 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2128 int migratetype, bool sync_migration, 2128 int migratetype, bool sync_migration,
2129 bool *contended_compaction, bool *deferred_compaction, 2129 bool *contended_compaction, bool *deferred_compaction,
2130 unsigned long *did_some_progress) 2130 unsigned long *did_some_progress)
2131 { 2131 {
2132 if (!order) 2132 if (!order)
2133 return NULL; 2133 return NULL;
2134 2134
2135 if (compaction_deferred(preferred_zone, order)) { 2135 if (compaction_deferred(preferred_zone, order)) {
2136 *deferred_compaction = true; 2136 *deferred_compaction = true;
2137 return NULL; 2137 return NULL;
2138 } 2138 }
2139 2139
2140 current->flags |= PF_MEMALLOC; 2140 current->flags |= PF_MEMALLOC;
2141 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2141 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2142 nodemask, sync_migration, 2142 nodemask, sync_migration,
2143 contended_compaction); 2143 contended_compaction);
2144 current->flags &= ~PF_MEMALLOC; 2144 current->flags &= ~PF_MEMALLOC;
2145 2145
2146 if (*did_some_progress != COMPACT_SKIPPED) { 2146 if (*did_some_progress != COMPACT_SKIPPED) {
2147 struct page *page; 2147 struct page *page;
2148 2148
2149 /* Page migration frees to the PCP lists but we want merging */ 2149 /* Page migration frees to the PCP lists but we want merging */
2150 drain_pages(get_cpu()); 2150 drain_pages(get_cpu());
2151 put_cpu(); 2151 put_cpu();
2152 2152
2153 page = get_page_from_freelist(gfp_mask, nodemask, 2153 page = get_page_from_freelist(gfp_mask, nodemask,
2154 order, zonelist, high_zoneidx, 2154 order, zonelist, high_zoneidx,
2155 alloc_flags & ~ALLOC_NO_WATERMARKS, 2155 alloc_flags & ~ALLOC_NO_WATERMARKS,
2156 preferred_zone, migratetype); 2156 preferred_zone, migratetype);
2157 if (page) { 2157 if (page) {
2158 preferred_zone->compact_blockskip_flush = false; 2158 preferred_zone->compact_blockskip_flush = false;
2159 preferred_zone->compact_considered = 0; 2159 preferred_zone->compact_considered = 0;
2160 preferred_zone->compact_defer_shift = 0; 2160 preferred_zone->compact_defer_shift = 0;
2161 if (order >= preferred_zone->compact_order_failed) 2161 if (order >= preferred_zone->compact_order_failed)
2162 preferred_zone->compact_order_failed = order + 1; 2162 preferred_zone->compact_order_failed = order + 1;
2163 count_vm_event(COMPACTSUCCESS); 2163 count_vm_event(COMPACTSUCCESS);
2164 return page; 2164 return page;
2165 } 2165 }
2166 2166
2167 /* 2167 /*
2168 * It's bad if a compaction run occurs and fails. 2168 * It's bad if a compaction run occurs and fails.
2169 * The most likely reason is that pages exist, 2169 * The most likely reason is that pages exist,
2170 * but not enough to satisfy watermarks. 2170 * but not enough to satisfy watermarks.
2171 */ 2171 */
2172 count_vm_event(COMPACTFAIL); 2172 count_vm_event(COMPACTFAIL);
2173 2173
2174 /* 2174 /*
2175 * As async compaction considers a subset of pageblocks, only 2175 * As async compaction considers a subset of pageblocks, only
2176 * defer if the failure was a sync compaction failure. 2176 * defer if the failure was a sync compaction failure.
2177 */ 2177 */
2178 if (sync_migration) 2178 if (sync_migration)
2179 defer_compaction(preferred_zone, order); 2179 defer_compaction(preferred_zone, order);
2180 2180
2181 cond_resched(); 2181 cond_resched();
2182 } 2182 }
2183 2183
2184 return NULL; 2184 return NULL;
2185 } 2185 }
2186 #else 2186 #else
2187 static inline struct page * 2187 static inline struct page *
2188 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2188 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2189 struct zonelist *zonelist, enum zone_type high_zoneidx, 2189 struct zonelist *zonelist, enum zone_type high_zoneidx,
2190 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2190 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2191 int migratetype, bool sync_migration, 2191 int migratetype, bool sync_migration,
2192 bool *contended_compaction, bool *deferred_compaction, 2192 bool *contended_compaction, bool *deferred_compaction,
2193 unsigned long *did_some_progress) 2193 unsigned long *did_some_progress)
2194 { 2194 {
2195 return NULL; 2195 return NULL;
2196 } 2196 }
2197 #endif /* CONFIG_COMPACTION */ 2197 #endif /* CONFIG_COMPACTION */
2198 2198
2199 /* Perform direct synchronous page reclaim */ 2199 /* Perform direct synchronous page reclaim */
2200 static int 2200 static int
2201 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2201 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2202 nodemask_t *nodemask) 2202 nodemask_t *nodemask)
2203 { 2203 {
2204 struct reclaim_state reclaim_state; 2204 struct reclaim_state reclaim_state;
2205 int progress; 2205 int progress;
2206 2206
2207 cond_resched(); 2207 cond_resched();
2208 2208
2209 /* We now go into synchronous reclaim */ 2209 /* We now go into synchronous reclaim */
2210 cpuset_memory_pressure_bump(); 2210 cpuset_memory_pressure_bump();
2211 current->flags |= PF_MEMALLOC; 2211 current->flags |= PF_MEMALLOC;
2212 lockdep_set_current_reclaim_state(gfp_mask); 2212 lockdep_set_current_reclaim_state(gfp_mask);
2213 reclaim_state.reclaimed_slab = 0; 2213 reclaim_state.reclaimed_slab = 0;
2214 current->reclaim_state = &reclaim_state; 2214 current->reclaim_state = &reclaim_state;
2215 2215
2216 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2216 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2217 2217
2218 current->reclaim_state = NULL; 2218 current->reclaim_state = NULL;
2219 lockdep_clear_current_reclaim_state(); 2219 lockdep_clear_current_reclaim_state();
2220 current->flags &= ~PF_MEMALLOC; 2220 current->flags &= ~PF_MEMALLOC;
2221 2221
2222 cond_resched(); 2222 cond_resched();
2223 2223
2224 return progress; 2224 return progress;
2225 } 2225 }
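__perform_reclaim() brackets try_to_free_pages() with PF_MEMALLOC, and the slowpath further down bails out if it finds that flag already set ("Avoid recursion of direct reclaim"). A minimal userspace sketch of that re-entrancy guard, using a mock flags word and a purely illustrative bit value, looks like this:

	#include <stddef.h>

	static unsigned int task_flags;		/* stands in for current->flags */
	#define MOCK_PF_MEMALLOC 0x1u		/* illustrative bit value only */

	static long mock_reclaim(void)
	{
		long progress;

		task_flags |= MOCK_PF_MEMALLOC;	/* mark: we are inside reclaim */
		progress = 0;			/* ... shrinking caches may itself allocate ... */
		task_flags &= ~MOCK_PF_MEMALLOC;
		return progress;
	}

	static void *mock_slowpath_alloc(void)
	{
		if (task_flags & MOCK_PF_MEMALLOC)
			return NULL;		/* refuse to recurse into reclaim */
		mock_reclaim();
		return NULL;			/* retry of the freelists elided */
	}

	int main(void)
	{
		mock_slowpath_alloc();
		return 0;
	}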
2226 2226
2227 /* The really slow allocator path where we enter direct reclaim */ 2227 /* The really slow allocator path where we enter direct reclaim */
2228 static inline struct page * 2228 static inline struct page *
2229 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2229 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2230 struct zonelist *zonelist, enum zone_type high_zoneidx, 2230 struct zonelist *zonelist, enum zone_type high_zoneidx,
2231 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2231 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2232 int migratetype, unsigned long *did_some_progress) 2232 int migratetype, unsigned long *did_some_progress)
2233 { 2233 {
2234 struct page *page = NULL; 2234 struct page *page = NULL;
2235 bool drained = false; 2235 bool drained = false;
2236 2236
2237 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2237 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2238 nodemask); 2238 nodemask);
2239 if (unlikely(!(*did_some_progress))) 2239 if (unlikely(!(*did_some_progress)))
2240 return NULL; 2240 return NULL;
2241 2241
2242 /* After successful reclaim, reconsider all zones for allocation */ 2242 /* After successful reclaim, reconsider all zones for allocation */
2243 if (IS_ENABLED(CONFIG_NUMA)) 2243 if (IS_ENABLED(CONFIG_NUMA))
2244 zlc_clear_zones_full(zonelist); 2244 zlc_clear_zones_full(zonelist);
2245 2245
2246 retry: 2246 retry:
2247 page = get_page_from_freelist(gfp_mask, nodemask, order, 2247 page = get_page_from_freelist(gfp_mask, nodemask, order,
2248 zonelist, high_zoneidx, 2248 zonelist, high_zoneidx,
2249 alloc_flags & ~ALLOC_NO_WATERMARKS, 2249 alloc_flags & ~ALLOC_NO_WATERMARKS,
2250 preferred_zone, migratetype); 2250 preferred_zone, migratetype);
2251 2251
2252 /* 2252 /*
2253 * If an allocation failed after direct reclaim, it could be because 2253 * If an allocation failed after direct reclaim, it could be because
2254 * pages are pinned on the per-cpu lists. Drain them and try again 2254 * pages are pinned on the per-cpu lists. Drain them and try again
2255 */ 2255 */
2256 if (!page && !drained) { 2256 if (!page && !drained) {
2257 drain_all_pages(); 2257 drain_all_pages();
2258 drained = true; 2258 drained = true;
2259 goto retry; 2259 goto retry;
2260 } 2260 }
2261 2261
2262 return page; 2262 return page;
2263 } 2263 }
2264 2264
2265 /* 2265 /*
2266 * This is called in the allocator slow-path if the allocation request is of 2266 * This is called in the allocator slow-path if the allocation request is of
2267 * sufficient urgency to ignore watermarks and take other desperate measures 2267 * sufficient urgency to ignore watermarks and take other desperate measures
2268 */ 2268 */
2269 static inline struct page * 2269 static inline struct page *
2270 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2270 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2271 struct zonelist *zonelist, enum zone_type high_zoneidx, 2271 struct zonelist *zonelist, enum zone_type high_zoneidx,
2272 nodemask_t *nodemask, struct zone *preferred_zone, 2272 nodemask_t *nodemask, struct zone *preferred_zone,
2273 int migratetype) 2273 int migratetype)
2274 { 2274 {
2275 struct page *page; 2275 struct page *page;
2276 2276
2277 do { 2277 do {
2278 page = get_page_from_freelist(gfp_mask, nodemask, order, 2278 page = get_page_from_freelist(gfp_mask, nodemask, order,
2279 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2279 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2280 preferred_zone, migratetype); 2280 preferred_zone, migratetype);
2281 2281
2282 if (!page && gfp_mask & __GFP_NOFAIL) 2282 if (!page && gfp_mask & __GFP_NOFAIL)
2283 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2283 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2284 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2284 } while (!page && (gfp_mask & __GFP_NOFAIL));
2285 2285
2286 return page; 2286 return page;
2287 } 2287 }
2288 2288
2289 static inline 2289 static inline
2290 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2290 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
2291 enum zone_type high_zoneidx, 2291 enum zone_type high_zoneidx,
2292 enum zone_type classzone_idx) 2292 enum zone_type classzone_idx)
2293 { 2293 {
2294 struct zoneref *z; 2294 struct zoneref *z;
2295 struct zone *zone; 2295 struct zone *zone;
2296 2296
2297 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2297 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2298 wakeup_kswapd(zone, order, classzone_idx); 2298 wakeup_kswapd(zone, order, classzone_idx);
2299 } 2299 }
2300 2300
2301 static inline int 2301 static inline int
2302 gfp_to_alloc_flags(gfp_t gfp_mask) 2302 gfp_to_alloc_flags(gfp_t gfp_mask)
2303 { 2303 {
2304 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2304 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2305 const gfp_t wait = gfp_mask & __GFP_WAIT; 2305 const gfp_t wait = gfp_mask & __GFP_WAIT;
2306 2306
2307 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2307 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2308 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2308 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2309 2309
2310 /* 2310 /*
2311 * The caller may dip into page reserves a bit more if the caller 2311 * The caller may dip into page reserves a bit more if the caller
2312 * cannot run direct reclaim, or if the caller has realtime scheduling 2312 * cannot run direct reclaim, or if the caller has realtime scheduling
2313 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2313 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2314 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2314 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
2315 */ 2315 */
2316 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2316 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2317 2317
2318 if (!wait) { 2318 if (!wait) {
2319 /* 2319 /*
2320 * Not worth trying to allocate harder for 2320 * Not worth trying to allocate harder for
2321 * __GFP_NOMEMALLOC even if it can't schedule. 2321 * __GFP_NOMEMALLOC even if it can't schedule.
2322 */ 2322 */
2323 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2323 if (!(gfp_mask & __GFP_NOMEMALLOC))
2324 alloc_flags |= ALLOC_HARDER; 2324 alloc_flags |= ALLOC_HARDER;
2325 /* 2325 /*
2326 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2326 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
2327 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2327 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
2328 */ 2328 */
2329 alloc_flags &= ~ALLOC_CPUSET; 2329 alloc_flags &= ~ALLOC_CPUSET;
2330 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2330 } else if (unlikely(rt_task(current)) && !in_interrupt())
2331 alloc_flags |= ALLOC_HARDER; 2331 alloc_flags |= ALLOC_HARDER;
2332 2332
2333 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2333 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2334 if (gfp_mask & __GFP_MEMALLOC) 2334 if (gfp_mask & __GFP_MEMALLOC)
2335 alloc_flags |= ALLOC_NO_WATERMARKS; 2335 alloc_flags |= ALLOC_NO_WATERMARKS;
2336 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2336 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2337 alloc_flags |= ALLOC_NO_WATERMARKS; 2337 alloc_flags |= ALLOC_NO_WATERMARKS;
2338 else if (!in_interrupt() && 2338 else if (!in_interrupt() &&
2339 ((current->flags & PF_MEMALLOC) || 2339 ((current->flags & PF_MEMALLOC) ||
2340 unlikely(test_thread_flag(TIF_MEMDIE)))) 2340 unlikely(test_thread_flag(TIF_MEMDIE))))
2341 alloc_flags |= ALLOC_NO_WATERMARKS; 2341 alloc_flags |= ALLOC_NO_WATERMARKS;
2342 } 2342 }
2343 #ifdef CONFIG_CMA 2343 #ifdef CONFIG_CMA
2344 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2344 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2345 alloc_flags |= ALLOC_CMA; 2345 alloc_flags |= ALLOC_CMA;
2346 #endif 2346 #endif
2347 return alloc_flags; 2347 return alloc_flags;
2348 } 2348 }
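The BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH) in gfp_to_alloc_flags() exists so the __GFP_HIGH bit can be copied into alloc_flags with a single mask-and-OR instead of a conditional. A userspace sketch with mocked constants (the 0x20 value is a stand-in chosen for the illustration, not necessarily the kernel's) shows the idea:

	#include <assert.h>

	/* The only property the BUILD_BUG_ON enforces is that the two constants
	 * are equal, so the OR below needs no branch. */
	#define MOCK_GFP_HIGH	0x20u
	#define MOCK_ALLOC_HIGH	0x20u

	static unsigned int mock_gfp_to_alloc_flags(unsigned int gfp_mask)
	{
		unsigned int alloc_flags = 0;

		/* same effect as: if (gfp_mask & GFP_HIGH) alloc_flags |= ALLOC_HIGH; */
		alloc_flags |= (gfp_mask & MOCK_GFP_HIGH);
		return alloc_flags;
	}

	int main(void)
	{
		assert(mock_gfp_to_alloc_flags(MOCK_GFP_HIGH) & MOCK_ALLOC_HIGH);
		assert(mock_gfp_to_alloc_flags(0) == 0);
		return 0;
	}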
2349 2349
2350 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2350 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2351 { 2351 {
2352 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2352 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2353 } 2353 }
2354 2354
2355 static inline struct page * 2355 static inline struct page *
2356 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2356 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2357 struct zonelist *zonelist, enum zone_type high_zoneidx, 2357 struct zonelist *zonelist, enum zone_type high_zoneidx,
2358 nodemask_t *nodemask, struct zone *preferred_zone, 2358 nodemask_t *nodemask, struct zone *preferred_zone,
2359 int migratetype) 2359 int migratetype)
2360 { 2360 {
2361 const gfp_t wait = gfp_mask & __GFP_WAIT; 2361 const gfp_t wait = gfp_mask & __GFP_WAIT;
2362 struct page *page = NULL; 2362 struct page *page = NULL;
2363 int alloc_flags; 2363 int alloc_flags;
2364 unsigned long pages_reclaimed = 0; 2364 unsigned long pages_reclaimed = 0;
2365 unsigned long did_some_progress; 2365 unsigned long did_some_progress;
2366 bool sync_migration = false; 2366 bool sync_migration = false;
2367 bool deferred_compaction = false; 2367 bool deferred_compaction = false;
2368 bool contended_compaction = false; 2368 bool contended_compaction = false;
2369 2369
2370 /* 2370 /*
2371 * In the slowpath, we sanity check order to avoid ever trying to 2371 * In the slowpath, we sanity check order to avoid ever trying to
2372 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2372 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2373 * be using allocators in order of preference for an area that is 2373 * be using allocators in order of preference for an area that is
2374 * too large. 2374 * too large.
2375 */ 2375 */
2376 if (order >= MAX_ORDER) { 2376 if (order >= MAX_ORDER) {
2377 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2377 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2378 return NULL; 2378 return NULL;
2379 } 2379 }
2380 2380
2381 /* 2381 /*
2382 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2382 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2383 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2383 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2384 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2384 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2385 * using a larger set of nodes after it has established that the 2385 * using a larger set of nodes after it has established that the
2386 * allowed per node queues are empty and that nodes are 2386 * allowed per node queues are empty and that nodes are
2387 * over allocated. 2387 * over allocated.
2388 */ 2388 */
2389 if (IS_ENABLED(CONFIG_NUMA) && 2389 if (IS_ENABLED(CONFIG_NUMA) &&
2390 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2390 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2391 goto nopage; 2391 goto nopage;
2392 2392
2393 restart: 2393 restart:
2394 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2394 if (!(gfp_mask & __GFP_NO_KSWAPD))
2395 wake_all_kswapd(order, zonelist, high_zoneidx, 2395 wake_all_kswapd(order, zonelist, high_zoneidx,
2396 zone_idx(preferred_zone)); 2396 zone_idx(preferred_zone));
2397 2397
2398 /* 2398 /*
2399 * OK, we're below the kswapd watermark and have kicked background 2399 * OK, we're below the kswapd watermark and have kicked background
2400 * reclaim. Now things get more complex, so set up alloc_flags according 2400 * reclaim. Now things get more complex, so set up alloc_flags according
2401 * to how we want to proceed. 2401 * to how we want to proceed.
2402 */ 2402 */
2403 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2403 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2404 2404
2405 /* 2405 /*
2406 * Find the true preferred zone if the allocation is unconstrained by 2406 * Find the true preferred zone if the allocation is unconstrained by
2407 * cpusets. 2407 * cpusets.
2408 */ 2408 */
2409 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2409 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2410 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2410 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2411 &preferred_zone); 2411 &preferred_zone);
2412 2412
2413 rebalance: 2413 rebalance:
2414 /* This is the last chance, in general, before the goto nopage. */ 2414 /* This is the last chance, in general, before the goto nopage. */
2415 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2415 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2416 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2416 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2417 preferred_zone, migratetype); 2417 preferred_zone, migratetype);
2418 if (page) 2418 if (page)
2419 goto got_pg; 2419 goto got_pg;
2420 2420
2421 /* Allocate without watermarks if the context allows */ 2421 /* Allocate without watermarks if the context allows */
2422 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2422 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2423 /* 2423 /*
2424 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2424 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2425 * the allocation is high priority and these types of 2425 * the allocation is high priority and these types of
2426 * allocations are system rather than user oriented 2426 * allocations are system rather than user oriented
2427 */ 2427 */
2428 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2428 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2429 2429
2430 page = __alloc_pages_high_priority(gfp_mask, order, 2430 page = __alloc_pages_high_priority(gfp_mask, order,
2431 zonelist, high_zoneidx, nodemask, 2431 zonelist, high_zoneidx, nodemask,
2432 preferred_zone, migratetype); 2432 preferred_zone, migratetype);
2433 if (page) { 2433 if (page) {
2434 goto got_pg; 2434 goto got_pg;
2435 } 2435 }
2436 } 2436 }
2437 2437
2438 /* Atomic allocations - we can't balance anything */ 2438 /* Atomic allocations - we can't balance anything */
2439 if (!wait) 2439 if (!wait)
2440 goto nopage; 2440 goto nopage;
2441 2441
2442 /* Avoid recursion of direct reclaim */ 2442 /* Avoid recursion of direct reclaim */
2443 if (current->flags & PF_MEMALLOC) 2443 if (current->flags & PF_MEMALLOC)
2444 goto nopage; 2444 goto nopage;
2445 2445
2446 /* Avoid allocations with no watermarks from looping endlessly */ 2446 /* Avoid allocations with no watermarks from looping endlessly */
2447 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2447 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2448 goto nopage; 2448 goto nopage;
2449 2449
2450 /* 2450 /*
2451 * Try direct compaction. The first pass is asynchronous. Subsequent 2451 * Try direct compaction. The first pass is asynchronous. Subsequent
2452 * attempts after direct reclaim are synchronous 2452 * attempts after direct reclaim are synchronous
2453 */ 2453 */
2454 page = __alloc_pages_direct_compact(gfp_mask, order, 2454 page = __alloc_pages_direct_compact(gfp_mask, order,
2455 zonelist, high_zoneidx, 2455 zonelist, high_zoneidx,
2456 nodemask, 2456 nodemask,
2457 alloc_flags, preferred_zone, 2457 alloc_flags, preferred_zone,
2458 migratetype, sync_migration, 2458 migratetype, sync_migration,
2459 &contended_compaction, 2459 &contended_compaction,
2460 &deferred_compaction, 2460 &deferred_compaction,
2461 &did_some_progress); 2461 &did_some_progress);
2462 if (page) 2462 if (page)
2463 goto got_pg; 2463 goto got_pg;
2464 sync_migration = true; 2464 sync_migration = true;
2465 2465
2466 /* 2466 /*
2467 * If compaction is deferred for high-order allocations, it is because 2467 * If compaction is deferred for high-order allocations, it is because
2468 * sync compaction recently failed. If this is the case and the caller 2468 * sync compaction recently failed. If this is the case and the caller
2469 * requested a movable allocation that does not heavily disrupt the 2469 * requested a movable allocation that does not heavily disrupt the
2470 * system then fail the allocation instead of entering direct reclaim. 2470 * system then fail the allocation instead of entering direct reclaim.
2471 */ 2471 */
2472 if ((deferred_compaction || contended_compaction) && 2472 if ((deferred_compaction || contended_compaction) &&
2473 (gfp_mask & __GFP_NO_KSWAPD)) 2473 (gfp_mask & __GFP_NO_KSWAPD))
2474 goto nopage; 2474 goto nopage;
2475 2475
2476 /* Try direct reclaim and then allocating */ 2476 /* Try direct reclaim and then allocating */
2477 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2477 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2478 zonelist, high_zoneidx, 2478 zonelist, high_zoneidx,
2479 nodemask, 2479 nodemask,
2480 alloc_flags, preferred_zone, 2480 alloc_flags, preferred_zone,
2481 migratetype, &did_some_progress); 2481 migratetype, &did_some_progress);
2482 if (page) 2482 if (page)
2483 goto got_pg; 2483 goto got_pg;
2484 2484
2485 /* 2485 /*
2486 * If we failed to make any progress reclaiming, then we are 2486 * If we failed to make any progress reclaiming, then we are
2487 * running out of options and have to consider going OOM 2487 * running out of options and have to consider going OOM
2488 */ 2488 */
2489 if (!did_some_progress) { 2489 if (!did_some_progress) {
2490 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2490 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2491 if (oom_killer_disabled) 2491 if (oom_killer_disabled)
2492 goto nopage; 2492 goto nopage;
2493 /* Coredumps can quickly deplete all memory reserves */ 2493 /* Coredumps can quickly deplete all memory reserves */
2494 if ((current->flags & PF_DUMPCORE) && 2494 if ((current->flags & PF_DUMPCORE) &&
2495 !(gfp_mask & __GFP_NOFAIL)) 2495 !(gfp_mask & __GFP_NOFAIL))
2496 goto nopage; 2496 goto nopage;
2497 page = __alloc_pages_may_oom(gfp_mask, order, 2497 page = __alloc_pages_may_oom(gfp_mask, order,
2498 zonelist, high_zoneidx, 2498 zonelist, high_zoneidx,
2499 nodemask, preferred_zone, 2499 nodemask, preferred_zone,
2500 migratetype); 2500 migratetype);
2501 if (page) 2501 if (page)
2502 goto got_pg; 2502 goto got_pg;
2503 2503
2504 if (!(gfp_mask & __GFP_NOFAIL)) { 2504 if (!(gfp_mask & __GFP_NOFAIL)) {
2505 /* 2505 /*
2506 * The oom killer is not called for high-order 2506 * The oom killer is not called for high-order
2507 * allocations that may fail, so if no progress 2507 * allocations that may fail, so if no progress
2508 * is being made, there are no other options and 2508 * is being made, there are no other options and
2509 * retrying is unlikely to help. 2509 * retrying is unlikely to help.
2510 */ 2510 */
2511 if (order > PAGE_ALLOC_COSTLY_ORDER) 2511 if (order > PAGE_ALLOC_COSTLY_ORDER)
2512 goto nopage; 2512 goto nopage;
2513 /* 2513 /*
2514 * The oom killer is not called for lowmem 2514 * The oom killer is not called for lowmem
2515 * allocations to prevent needlessly killing 2515 * allocations to prevent needlessly killing
2516 * innocent tasks. 2516 * innocent tasks.
2517 */ 2517 */
2518 if (high_zoneidx < ZONE_NORMAL) 2518 if (high_zoneidx < ZONE_NORMAL)
2519 goto nopage; 2519 goto nopage;
2520 } 2520 }
2521 2521
2522 goto restart; 2522 goto restart;
2523 } 2523 }
2524 } 2524 }
2525 2525
2526 /* Check if we should retry the allocation */ 2526 /* Check if we should retry the allocation */
2527 pages_reclaimed += did_some_progress; 2527 pages_reclaimed += did_some_progress;
2528 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2528 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2529 pages_reclaimed)) { 2529 pages_reclaimed)) {
2530 /* Wait for some write requests to complete then retry */ 2530 /* Wait for some write requests to complete then retry */
2531 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2531 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2532 goto rebalance; 2532 goto rebalance;
2533 } else { 2533 } else {
2534 /* 2534 /*
2535 * High-order allocations do not necessarily loop after 2535 * High-order allocations do not necessarily loop after
2536 * direct reclaim, and reclaim/compaction depends on compaction 2536 * direct reclaim, and reclaim/compaction depends on compaction
2537 * being called after reclaim, so call it directly if necessary 2537 * being called after reclaim, so call it directly if necessary
2538 */ 2538 */
2539 page = __alloc_pages_direct_compact(gfp_mask, order, 2539 page = __alloc_pages_direct_compact(gfp_mask, order,
2540 zonelist, high_zoneidx, 2540 zonelist, high_zoneidx,
2541 nodemask, 2541 nodemask,
2542 alloc_flags, preferred_zone, 2542 alloc_flags, preferred_zone,
2543 migratetype, sync_migration, 2543 migratetype, sync_migration,
2544 &contended_compaction, 2544 &contended_compaction,
2545 &deferred_compaction, 2545 &deferred_compaction,
2546 &did_some_progress); 2546 &did_some_progress);
2547 if (page) 2547 if (page)
2548 goto got_pg; 2548 goto got_pg;
2549 } 2549 }
2550 2550
2551 nopage: 2551 nopage:
2552 warn_alloc_failed(gfp_mask, order, NULL); 2552 warn_alloc_failed(gfp_mask, order, NULL);
2553 return page; 2553 return page;
2554 got_pg: 2554 got_pg:
2555 if (kmemcheck_enabled) 2555 if (kmemcheck_enabled)
2556 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2556 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2557 2557
2558 return page; 2558 return page;
2559 } 2559 }
2560 2560
2561 /* 2561 /*
2562 * This is the 'heart' of the zoned buddy allocator. 2562 * This is the 'heart' of the zoned buddy allocator.
2563 */ 2563 */
2564 struct page * 2564 struct page *
2565 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2565 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2566 struct zonelist *zonelist, nodemask_t *nodemask) 2566 struct zonelist *zonelist, nodemask_t *nodemask)
2567 { 2567 {
2568 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2568 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2569 struct zone *preferred_zone; 2569 struct zone *preferred_zone;
2570 struct page *page = NULL; 2570 struct page *page = NULL;
2571 int migratetype = allocflags_to_migratetype(gfp_mask); 2571 int migratetype = allocflags_to_migratetype(gfp_mask);
2572 unsigned int cpuset_mems_cookie; 2572 unsigned int cpuset_mems_cookie;
2573 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; 2573 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2574 struct mem_cgroup *memcg = NULL; 2574 struct mem_cgroup *memcg = NULL;
2575 2575
2576 gfp_mask &= gfp_allowed_mask; 2576 gfp_mask &= gfp_allowed_mask;
2577 2577
2578 lockdep_trace_alloc(gfp_mask); 2578 lockdep_trace_alloc(gfp_mask);
2579 2579
2580 might_sleep_if(gfp_mask & __GFP_WAIT); 2580 might_sleep_if(gfp_mask & __GFP_WAIT);
2581 2581
2582 if (should_fail_alloc_page(gfp_mask, order)) 2582 if (should_fail_alloc_page(gfp_mask, order))
2583 return NULL; 2583 return NULL;
2584 2584
2585 /* 2585 /*
2586 * Check the zones suitable for the gfp_mask contain at least one 2586 * Check the zones suitable for the gfp_mask contain at least one
2587 * valid zone. It's possible to have an empty zonelist as a result 2587 * valid zone. It's possible to have an empty zonelist as a result
2588 * of GFP_THISNODE and a memoryless node 2588 * of GFP_THISNODE and a memoryless node
2589 */ 2589 */
2590 if (unlikely(!zonelist->_zonerefs->zone)) 2590 if (unlikely(!zonelist->_zonerefs->zone))
2591 return NULL; 2591 return NULL;
2592 2592
2593 /* 2593 /*
2594 * Will only have any effect when __GFP_KMEMCG is set. This is 2594 * Will only have any effect when __GFP_KMEMCG is set. This is
2595 * verified in the (always inline) callee 2595 * verified in the (always inline) callee
2596 */ 2596 */
2597 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2597 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2598 return NULL; 2598 return NULL;
2599 2599
2600 retry_cpuset: 2600 retry_cpuset:
2601 cpuset_mems_cookie = get_mems_allowed(); 2601 cpuset_mems_cookie = get_mems_allowed();
2602 2602
2603 /* The preferred zone is used for statistics later */ 2603 /* The preferred zone is used for statistics later */
2604 first_zones_zonelist(zonelist, high_zoneidx, 2604 first_zones_zonelist(zonelist, high_zoneidx,
2605 nodemask ? : &cpuset_current_mems_allowed, 2605 nodemask ? : &cpuset_current_mems_allowed,
2606 &preferred_zone); 2606 &preferred_zone);
2607 if (!preferred_zone) 2607 if (!preferred_zone)
2608 goto out; 2608 goto out;
2609 2609
2610 #ifdef CONFIG_CMA 2610 #ifdef CONFIG_CMA
2611 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2611 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2612 alloc_flags |= ALLOC_CMA; 2612 alloc_flags |= ALLOC_CMA;
2613 #endif 2613 #endif
2614 /* First allocation attempt */ 2614 /* First allocation attempt */
2615 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2615 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2616 zonelist, high_zoneidx, alloc_flags, 2616 zonelist, high_zoneidx, alloc_flags,
2617 preferred_zone, migratetype); 2617 preferred_zone, migratetype);
2618 if (unlikely(!page)) 2618 if (unlikely(!page))
2619 page = __alloc_pages_slowpath(gfp_mask, order, 2619 page = __alloc_pages_slowpath(gfp_mask, order,
2620 zonelist, high_zoneidx, nodemask, 2620 zonelist, high_zoneidx, nodemask,
2621 preferred_zone, migratetype); 2621 preferred_zone, migratetype);
2622 2622
2623 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2623 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2624 2624
2625 out: 2625 out:
2626 /* 2626 /*
2627 * When updating a task's mems_allowed, it is possible to race with 2627 * When updating a task's mems_allowed, it is possible to race with
2628 * parallel threads in such a way that an allocation can fail while 2628 * parallel threads in such a way that an allocation can fail while
2629 * the mask is being updated. If a page allocation is about to fail, 2629 * the mask is being updated. If a page allocation is about to fail,
2630 * check if the cpuset changed during allocation and if so, retry. 2630 * check if the cpuset changed during allocation and if so, retry.
2631 */ 2631 */
2632 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2632 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2633 goto retry_cpuset; 2633 goto retry_cpuset;
2634 2634
2635 memcg_kmem_commit_charge(page, memcg, order); 2635 memcg_kmem_commit_charge(page, memcg, order);
2636 2636
2637 return page; 2637 return page;
2638 } 2638 }
2639 EXPORT_SYMBOL(__alloc_pages_nodemask); 2639 EXPORT_SYMBOL(__alloc_pages_nodemask);
2640 2640
2641 /* 2641 /*
2642 * Common helper functions. 2642 * Common helper functions.
2643 */ 2643 */
2644 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2644 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2645 { 2645 {
2646 struct page *page; 2646 struct page *page;
2647 2647
2648 /* 2648 /*
2649 * __get_free_pages() returns a 32-bit address, which cannot represent 2649 * __get_free_pages() returns a 32-bit address, which cannot represent
2650 * a highmem page 2650 * a highmem page
2651 */ 2651 */
2652 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2652 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2653 2653
2654 page = alloc_pages(gfp_mask, order); 2654 page = alloc_pages(gfp_mask, order);
2655 if (!page) 2655 if (!page)
2656 return 0; 2656 return 0;
2657 return (unsigned long) page_address(page); 2657 return (unsigned long) page_address(page);
2658 } 2658 }
2659 EXPORT_SYMBOL(__get_free_pages); 2659 EXPORT_SYMBOL(__get_free_pages);
2660 2660
2661 unsigned long get_zeroed_page(gfp_t gfp_mask) 2661 unsigned long get_zeroed_page(gfp_t gfp_mask)
2662 { 2662 {
2663 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2663 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2664 } 2664 }
2665 EXPORT_SYMBOL(get_zeroed_page); 2665 EXPORT_SYMBOL(get_zeroed_page);
2666 2666
2667 void __free_pages(struct page *page, unsigned int order) 2667 void __free_pages(struct page *page, unsigned int order)
2668 { 2668 {
2669 if (put_page_testzero(page)) { 2669 if (put_page_testzero(page)) {
2670 if (order == 0) 2670 if (order == 0)
2671 free_hot_cold_page(page, 0); 2671 free_hot_cold_page(page, 0);
2672 else 2672 else
2673 __free_pages_ok(page, order); 2673 __free_pages_ok(page, order);
2674 } 2674 }
2675 } 2675 }
2676 2676
2677 EXPORT_SYMBOL(__free_pages); 2677 EXPORT_SYMBOL(__free_pages);
2678 2678
2679 void free_pages(unsigned long addr, unsigned int order) 2679 void free_pages(unsigned long addr, unsigned int order)
2680 { 2680 {
2681 if (addr != 0) { 2681 if (addr != 0) {
2682 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2682 VM_BUG_ON(!virt_addr_valid((void *)addr));
2683 __free_pages(virt_to_page((void *)addr), order); 2683 __free_pages(virt_to_page((void *)addr), order);
2684 } 2684 }
2685 } 2685 }
2686 2686
2687 EXPORT_SYMBOL(free_pages); 2687 EXPORT_SYMBOL(free_pages);
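For context, a hypothetical in-kernel caller (e.g. a driver built with the usual <linux/gfp.h> include) might pair the helpers exported above like this. This is a sketch only; example_init() and the buffer sizes are invented for illustration.

	static int example_init(void)
	{
		unsigned long buf, zpage;

		buf = __get_free_pages(GFP_KERNEL, 2);	/* 1 << 2 = 4 contiguous pages */
		if (!buf)
			return -ENOMEM;

		zpage = get_zeroed_page(GFP_KERNEL);	/* one zero-filled page */
		if (!zpage) {
			free_pages(buf, 2);
			return -ENOMEM;
		}

		/* ... use the buffers ... */

		free_page(zpage);
		free_pages(buf, 2);
		return 0;
	}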
2688 2688
2689 /* 2689 /*
2690 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2690 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2691 * pages allocated with __GFP_KMEMCG. 2691 * pages allocated with __GFP_KMEMCG.
2692 * 2692 *
2693 * Those pages are accounted to a particular memcg, embedded in the 2693 * Those pages are accounted to a particular memcg, embedded in the
2694 * corresponding page_cgroup. To avoid adding a hit in the allocator to search 2694 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2695 * for that information only to find out that it is NULL for users who have no 2695 * for that information only to find out that it is NULL for users who have no
2696 * interest in that whatsoever, we provide these functions. 2696 * interest in that whatsoever, we provide these functions.
2697 * 2697 *
2698 * The caller knows better which flags it relies on. 2698 * The caller knows better which flags it relies on.
2699 */ 2699 */
2700 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2700 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2701 { 2701 {
2702 memcg_kmem_uncharge_pages(page, order); 2702 memcg_kmem_uncharge_pages(page, order);
2703 __free_pages(page, order); 2703 __free_pages(page, order);
2704 } 2704 }
2705 2705
2706 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2706 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2707 { 2707 {
2708 if (addr != 0) { 2708 if (addr != 0) {
2709 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2709 VM_BUG_ON(!virt_addr_valid((void *)addr));
2710 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2710 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2711 } 2711 }
2712 } 2712 }
2713 2713
2714 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2714 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2715 { 2715 {
2716 if (addr) { 2716 if (addr) {
2717 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2717 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2718 unsigned long used = addr + PAGE_ALIGN(size); 2718 unsigned long used = addr + PAGE_ALIGN(size);
2719 2719
2720 split_page(virt_to_page((void *)addr), order); 2720 split_page(virt_to_page((void *)addr), order);
2721 while (used < alloc_end) { 2721 while (used < alloc_end) {
2722 free_page(used); 2722 free_page(used);
2723 used += PAGE_SIZE; 2723 used += PAGE_SIZE;
2724 } 2724 }
2725 } 2725 }
2726 return (void *)addr; 2726 return (void *)addr;
2727 } 2727 }
2728 2728
2729 /** 2729 /**
2730 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2730 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2731 * @size: the number of bytes to allocate 2731 * @size: the number of bytes to allocate
2732 * @gfp_mask: GFP flags for the allocation 2732 * @gfp_mask: GFP flags for the allocation
2733 * 2733 *
2734 * This function is similar to alloc_pages(), except that it allocates the 2734 * This function is similar to alloc_pages(), except that it allocates the
2735 * minimum number of pages to satisfy the request. alloc_pages() can only 2735 * minimum number of pages to satisfy the request. alloc_pages() can only
2736 * allocate memory in power-of-two pages. 2736 * allocate memory in power-of-two pages.
2737 * 2737 *
2738 * This function is also limited by MAX_ORDER. 2738 * This function is also limited by MAX_ORDER.
2739 * 2739 *
2740 * Memory allocated by this function must be released by free_pages_exact(). 2740 * Memory allocated by this function must be released by free_pages_exact().
2741 */ 2741 */
2742 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2742 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2743 { 2743 {
2744 unsigned int order = get_order(size); 2744 unsigned int order = get_order(size);
2745 unsigned long addr; 2745 unsigned long addr;
2746 2746
2747 addr = __get_free_pages(gfp_mask, order); 2747 addr = __get_free_pages(gfp_mask, order);
2748 return make_alloc_exact(addr, order, size); 2748 return make_alloc_exact(addr, order, size);
2749 } 2749 }
2750 EXPORT_SYMBOL(alloc_pages_exact); 2750 EXPORT_SYMBOL(alloc_pages_exact);
2751 2751
2752 /** 2752 /**
2753 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2753 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2754 * pages on a node. 2754 * pages on a node.
2755 * @nid: the preferred node ID where memory should be allocated 2755 * @nid: the preferred node ID where memory should be allocated
2756 * @size: the number of bytes to allocate 2756 * @size: the number of bytes to allocate
2757 * @gfp_mask: GFP flags for the allocation 2757 * @gfp_mask: GFP flags for the allocation
2758 * 2758 *
2759 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2759 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2760 * back. 2760 * back.
2761 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2761 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2762 * but is not exact. 2762 * but is not exact.
2763 */ 2763 */
2764 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2764 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2765 { 2765 {
2766 unsigned order = get_order(size); 2766 unsigned order = get_order(size);
2767 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2767 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2768 if (!p) 2768 if (!p)
2769 return NULL; 2769 return NULL;
2770 return make_alloc_exact((unsigned long)page_address(p), order, size); 2770 return make_alloc_exact((unsigned long)page_address(p), order, size);
2771 } 2771 }
2772 EXPORT_SYMBOL(alloc_pages_exact_nid); 2772 EXPORT_SYMBOL(alloc_pages_exact_nid);
2773 2773
2774 /** 2774 /**
2775 * free_pages_exact - release memory allocated via alloc_pages_exact() 2775 * free_pages_exact - release memory allocated via alloc_pages_exact()
2776 * @virt: the value returned by alloc_pages_exact. 2776 * @virt: the value returned by alloc_pages_exact.
2777 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2777 * @size: size of allocation, same value as passed to alloc_pages_exact().
2778 * 2778 *
2779 * Release the memory allocated by a previous call to alloc_pages_exact. 2779 * Release the memory allocated by a previous call to alloc_pages_exact.
2780 */ 2780 */
2781 void free_pages_exact(void *virt, size_t size) 2781 void free_pages_exact(void *virt, size_t size)
2782 { 2782 {
2783 unsigned long addr = (unsigned long)virt; 2783 unsigned long addr = (unsigned long)virt;
2784 unsigned long end = addr + PAGE_ALIGN(size); 2784 unsigned long end = addr + PAGE_ALIGN(size);
2785 2785
2786 while (addr < end) { 2786 while (addr < end) {
2787 free_page(addr); 2787 free_page(addr);
2788 addr += PAGE_SIZE; 2788 addr += PAGE_SIZE;
2789 } 2789 }
2790 } 2790 }
2791 EXPORT_SYMBOL(free_pages_exact); 2791 EXPORT_SYMBOL(free_pages_exact);
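A minimal usage sketch (editorial note, not part of the diff): alloc_pages_exact() and free_pages_exact() are meant to be used as a pair, with the same size passed to both; alloc_pages_exact_nid() takes the same arguments plus a preferred node id. The helpers and the 40 KB size below are hypothetical.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

#define EXAMPLE_BUF_SIZE (40 * 1024)	/* hypothetical: 40 KB, not a power-of-two number of pages */

static void *example_buf;		/* hypothetical buffer pointer */

static int example_alloc(void)
{
	/* with 4 KB pages this trims an order-4 (64 KB) block down to 40 KB */
	example_buf = alloc_pages_exact(EXAMPLE_BUF_SIZE, GFP_KERNEL);
	if (!example_buf)
		return -ENOMEM;
	return 0;
}

static void example_free(void)
{
	/* must pass the same size that was handed to alloc_pages_exact() */
	free_pages_exact(example_buf, EXAMPLE_BUF_SIZE);
}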
2792 2792
2793 static unsigned int nr_free_zone_pages(int offset) 2793 static unsigned int nr_free_zone_pages(int offset)
2794 { 2794 {
2795 struct zoneref *z; 2795 struct zoneref *z;
2796 struct zone *zone; 2796 struct zone *zone;
2797 2797
2798 /* Just pick one node, since fallback list is circular */ 2798 /* Just pick one node, since fallback list is circular */
2799 unsigned int sum = 0; 2799 unsigned int sum = 0;
2800 2800
2801 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2801 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2802 2802
2803 for_each_zone_zonelist(zone, z, zonelist, offset) { 2803 for_each_zone_zonelist(zone, z, zonelist, offset) {
2804 unsigned long size = zone->present_pages; 2804 unsigned long size = zone->present_pages;
2805 unsigned long high = high_wmark_pages(zone); 2805 unsigned long high = high_wmark_pages(zone);
2806 if (size > high) 2806 if (size > high)
2807 sum += size - high; 2807 sum += size - high;
2808 } 2808 }
2809 2809
2810 return sum; 2810 return sum;
2811 } 2811 }
2812 2812
2813 /* 2813 /*
2814 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 2814 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
2815 */ 2815 */
2816 unsigned int nr_free_buffer_pages(void) 2816 unsigned int nr_free_buffer_pages(void)
2817 { 2817 {
2818 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2818 return nr_free_zone_pages(gfp_zone(GFP_USER));
2819 } 2819 }
2820 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2820 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
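A short sketch of how nr_free_buffer_pages() is typically consumed (editorial): a subsystem reads it once at init time to scale its own memory limits. The helper name and the 1/16 ratio below are hypothetical.

#include <linux/swap.h>

static unsigned long example_limit_pages;	/* hypothetical per-subsystem limit */

static void example_tune_limits(void)
{
	/* allow the hypothetical subsystem at most 1/16 of allocatable lowmem */
	example_limit_pages = nr_free_buffer_pages() / 16;
}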
2821 2821
2822 /* 2822 /*
2823 * Amount of free RAM allocatable within all zones 2823 * Amount of free RAM allocatable within all zones
2824 */ 2824 */
2825 unsigned int nr_free_pagecache_pages(void) 2825 unsigned int nr_free_pagecache_pages(void)
2826 { 2826 {
2827 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2827 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2828 } 2828 }
2829 2829
2830 static inline void show_node(struct zone *zone) 2830 static inline void show_node(struct zone *zone)
2831 { 2831 {
2832 if (IS_ENABLED(CONFIG_NUMA)) 2832 if (IS_ENABLED(CONFIG_NUMA))
2833 printk("Node %d ", zone_to_nid(zone)); 2833 printk("Node %d ", zone_to_nid(zone));
2834 } 2834 }
2835 2835
2836 void si_meminfo(struct sysinfo *val) 2836 void si_meminfo(struct sysinfo *val)
2837 { 2837 {
2838 val->totalram = totalram_pages; 2838 val->totalram = totalram_pages;
2839 val->sharedram = 0; 2839 val->sharedram = 0;
2840 val->freeram = global_page_state(NR_FREE_PAGES); 2840 val->freeram = global_page_state(NR_FREE_PAGES);
2841 val->bufferram = nr_blockdev_pages(); 2841 val->bufferram = nr_blockdev_pages();
2842 val->totalhigh = totalhigh_pages; 2842 val->totalhigh = totalhigh_pages;
2843 val->freehigh = nr_free_highpages(); 2843 val->freehigh = nr_free_highpages();
2844 val->mem_unit = PAGE_SIZE; 2844 val->mem_unit = PAGE_SIZE;
2845 } 2845 }
2846 2846
2847 EXPORT_SYMBOL(si_meminfo); 2847 EXPORT_SYMBOL(si_meminfo);
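A minimal sketch of a si_meminfo() caller (editorial): the ram fields are counted in units of mem_unit (PAGE_SIZE here), so a consumer scales them before reporting. The helper below is hypothetical.

#include <linux/kernel.h>
#include <linux/mm.h>

static void example_report_memory(void)	/* hypothetical */
{
	struct sysinfo si;
	unsigned long total_kb, free_kb;

	si_meminfo(&si);
	/* mem_unit is PAGE_SIZE, so pages * (mem_unit / 1024) gives kB */
	total_kb = si.totalram * (si.mem_unit / 1024);
	free_kb  = si.freeram  * (si.mem_unit / 1024);
	pr_info("total %lu kB, free %lu kB\n", total_kb, free_kb);
}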
2848 2848
2849 #ifdef CONFIG_NUMA 2849 #ifdef CONFIG_NUMA
2850 void si_meminfo_node(struct sysinfo *val, int nid) 2850 void si_meminfo_node(struct sysinfo *val, int nid)
2851 { 2851 {
2852 pg_data_t *pgdat = NODE_DATA(nid); 2852 pg_data_t *pgdat = NODE_DATA(nid);
2853 2853
2854 val->totalram = pgdat->node_present_pages; 2854 val->totalram = pgdat->node_present_pages;
2855 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2855 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2856 #ifdef CONFIG_HIGHMEM 2856 #ifdef CONFIG_HIGHMEM
2857 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 2857 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
2858 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 2858 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2859 NR_FREE_PAGES); 2859 NR_FREE_PAGES);
2860 #else 2860 #else
2861 val->totalhigh = 0; 2861 val->totalhigh = 0;
2862 val->freehigh = 0; 2862 val->freehigh = 0;
2863 #endif 2863 #endif
2864 val->mem_unit = PAGE_SIZE; 2864 val->mem_unit = PAGE_SIZE;
2865 } 2865 }
2866 #endif 2866 #endif
2867 2867
2868 /* 2868 /*
2869 * Determine whether the node should be displayed or not, depending on whether 2869 * Determine whether the node should be displayed or not, depending on whether
2870 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 2870 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2871 */ 2871 */
2872 bool skip_free_areas_node(unsigned int flags, int nid) 2872 bool skip_free_areas_node(unsigned int flags, int nid)
2873 { 2873 {
2874 bool ret = false; 2874 bool ret = false;
2875 unsigned int cpuset_mems_cookie; 2875 unsigned int cpuset_mems_cookie;
2876 2876
2877 if (!(flags & SHOW_MEM_FILTER_NODES)) 2877 if (!(flags & SHOW_MEM_FILTER_NODES))
2878 goto out; 2878 goto out;
2879 2879
2880 do { 2880 do {
2881 cpuset_mems_cookie = get_mems_allowed(); 2881 cpuset_mems_cookie = get_mems_allowed();
2882 ret = !node_isset(nid, cpuset_current_mems_allowed); 2882 ret = !node_isset(nid, cpuset_current_mems_allowed);
2883 } while (!put_mems_allowed(cpuset_mems_cookie)); 2883 } while (!put_mems_allowed(cpuset_mems_cookie));
2884 out: 2884 out:
2885 return ret; 2885 return ret;
2886 } 2886 }
2887 2887
2888 #define K(x) ((x) << (PAGE_SHIFT-10)) 2888 #define K(x) ((x) << (PAGE_SHIFT-10))
2889 2889
2890 static void show_migration_types(unsigned char type) 2890 static void show_migration_types(unsigned char type)
2891 { 2891 {
2892 static const char types[MIGRATE_TYPES] = { 2892 static const char types[MIGRATE_TYPES] = {
2893 [MIGRATE_UNMOVABLE] = 'U', 2893 [MIGRATE_UNMOVABLE] = 'U',
2894 [MIGRATE_RECLAIMABLE] = 'E', 2894 [MIGRATE_RECLAIMABLE] = 'E',
2895 [MIGRATE_MOVABLE] = 'M', 2895 [MIGRATE_MOVABLE] = 'M',
2896 [MIGRATE_RESERVE] = 'R', 2896 [MIGRATE_RESERVE] = 'R',
2897 #ifdef CONFIG_CMA 2897 #ifdef CONFIG_CMA
2898 [MIGRATE_CMA] = 'C', 2898 [MIGRATE_CMA] = 'C',
2899 #endif 2899 #endif
2900 [MIGRATE_ISOLATE] = 'I', 2900 [MIGRATE_ISOLATE] = 'I',
2901 }; 2901 };
2902 char tmp[MIGRATE_TYPES + 1]; 2902 char tmp[MIGRATE_TYPES + 1];
2903 char *p = tmp; 2903 char *p = tmp;
2904 int i; 2904 int i;
2905 2905
2906 for (i = 0; i < MIGRATE_TYPES; i++) { 2906 for (i = 0; i < MIGRATE_TYPES; i++) {
2907 if (type & (1 << i)) 2907 if (type & (1 << i))
2908 *p++ = types[i]; 2908 *p++ = types[i];
2909 } 2909 }
2910 2910
2911 *p = '\0'; 2911 *p = '\0';
2912 printk("(%s) ", tmp); 2912 printk("(%s) ", tmp);
2913 } 2913 }
2914 2914
2915 /* 2915 /*
2916 * Show free area list (used inside shift_scroll-lock stuff) 2916 * Show free area list (used inside shift_scroll-lock stuff)
2917 * We also calculate the percentage fragmentation. We do this by counting the 2917 * We also calculate the percentage fragmentation. We do this by counting the
2918 * memory on each free list with the exception of the first item on the list. 2918 * memory on each free list with the exception of the first item on the list.
2919 * Suppresses nodes that are not allowed by current's cpuset if 2919 * Suppresses nodes that are not allowed by current's cpuset if
2920 * SHOW_MEM_FILTER_NODES is passed. 2920 * SHOW_MEM_FILTER_NODES is passed.
2921 */ 2921 */
2922 void show_free_areas(unsigned int filter) 2922 void show_free_areas(unsigned int filter)
2923 { 2923 {
2924 int cpu; 2924 int cpu;
2925 struct zone *zone; 2925 struct zone *zone;
2926 2926
2927 for_each_populated_zone(zone) { 2927 for_each_populated_zone(zone) {
2928 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2928 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2929 continue; 2929 continue;
2930 show_node(zone); 2930 show_node(zone);
2931 printk("%s per-cpu:\n", zone->name); 2931 printk("%s per-cpu:\n", zone->name);
2932 2932
2933 for_each_online_cpu(cpu) { 2933 for_each_online_cpu(cpu) {
2934 struct per_cpu_pageset *pageset; 2934 struct per_cpu_pageset *pageset;
2935 2935
2936 pageset = per_cpu_ptr(zone->pageset, cpu); 2936 pageset = per_cpu_ptr(zone->pageset, cpu);
2937 2937
2938 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2938 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2939 cpu, pageset->pcp.high, 2939 cpu, pageset->pcp.high,
2940 pageset->pcp.batch, pageset->pcp.count); 2940 pageset->pcp.batch, pageset->pcp.count);
2941 } 2941 }
2942 } 2942 }
2943 2943
2944 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 2944 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2945 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 2945 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2946 " unevictable:%lu" 2946 " unevictable:%lu"
2947 " dirty:%lu writeback:%lu unstable:%lu\n" 2947 " dirty:%lu writeback:%lu unstable:%lu\n"
2948 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2948 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2949 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 2949 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
2950 " free_cma:%lu\n", 2950 " free_cma:%lu\n",
2951 global_page_state(NR_ACTIVE_ANON), 2951 global_page_state(NR_ACTIVE_ANON),
2952 global_page_state(NR_INACTIVE_ANON), 2952 global_page_state(NR_INACTIVE_ANON),
2953 global_page_state(NR_ISOLATED_ANON), 2953 global_page_state(NR_ISOLATED_ANON),
2954 global_page_state(NR_ACTIVE_FILE), 2954 global_page_state(NR_ACTIVE_FILE),
2955 global_page_state(NR_INACTIVE_FILE), 2955 global_page_state(NR_INACTIVE_FILE),
2956 global_page_state(NR_ISOLATED_FILE), 2956 global_page_state(NR_ISOLATED_FILE),
2957 global_page_state(NR_UNEVICTABLE), 2957 global_page_state(NR_UNEVICTABLE),
2958 global_page_state(NR_FILE_DIRTY), 2958 global_page_state(NR_FILE_DIRTY),
2959 global_page_state(NR_WRITEBACK), 2959 global_page_state(NR_WRITEBACK),
2960 global_page_state(NR_UNSTABLE_NFS), 2960 global_page_state(NR_UNSTABLE_NFS),
2961 global_page_state(NR_FREE_PAGES), 2961 global_page_state(NR_FREE_PAGES),
2962 global_page_state(NR_SLAB_RECLAIMABLE), 2962 global_page_state(NR_SLAB_RECLAIMABLE),
2963 global_page_state(NR_SLAB_UNRECLAIMABLE), 2963 global_page_state(NR_SLAB_UNRECLAIMABLE),
2964 global_page_state(NR_FILE_MAPPED), 2964 global_page_state(NR_FILE_MAPPED),
2965 global_page_state(NR_SHMEM), 2965 global_page_state(NR_SHMEM),
2966 global_page_state(NR_PAGETABLE), 2966 global_page_state(NR_PAGETABLE),
2967 global_page_state(NR_BOUNCE), 2967 global_page_state(NR_BOUNCE),
2968 global_page_state(NR_FREE_CMA_PAGES)); 2968 global_page_state(NR_FREE_CMA_PAGES));
2969 2969
2970 for_each_populated_zone(zone) { 2970 for_each_populated_zone(zone) {
2971 int i; 2971 int i;
2972 2972
2973 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2973 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2974 continue; 2974 continue;
2975 show_node(zone); 2975 show_node(zone);
2976 printk("%s" 2976 printk("%s"
2977 " free:%lukB" 2977 " free:%lukB"
2978 " min:%lukB" 2978 " min:%lukB"
2979 " low:%lukB" 2979 " low:%lukB"
2980 " high:%lukB" 2980 " high:%lukB"
2981 " active_anon:%lukB" 2981 " active_anon:%lukB"
2982 " inactive_anon:%lukB" 2982 " inactive_anon:%lukB"
2983 " active_file:%lukB" 2983 " active_file:%lukB"
2984 " inactive_file:%lukB" 2984 " inactive_file:%lukB"
2985 " unevictable:%lukB" 2985 " unevictable:%lukB"
2986 " isolated(anon):%lukB" 2986 " isolated(anon):%lukB"
2987 " isolated(file):%lukB" 2987 " isolated(file):%lukB"
2988 " present:%lukB" 2988 " present:%lukB"
2989 " managed:%lukB" 2989 " managed:%lukB"
2990 " mlocked:%lukB" 2990 " mlocked:%lukB"
2991 " dirty:%lukB" 2991 " dirty:%lukB"
2992 " writeback:%lukB" 2992 " writeback:%lukB"
2993 " mapped:%lukB" 2993 " mapped:%lukB"
2994 " shmem:%lukB" 2994 " shmem:%lukB"
2995 " slab_reclaimable:%lukB" 2995 " slab_reclaimable:%lukB"
2996 " slab_unreclaimable:%lukB" 2996 " slab_unreclaimable:%lukB"
2997 " kernel_stack:%lukB" 2997 " kernel_stack:%lukB"
2998 " pagetables:%lukB" 2998 " pagetables:%lukB"
2999 " unstable:%lukB" 2999 " unstable:%lukB"
3000 " bounce:%lukB" 3000 " bounce:%lukB"
3001 " free_cma:%lukB" 3001 " free_cma:%lukB"
3002 " writeback_tmp:%lukB" 3002 " writeback_tmp:%lukB"
3003 " pages_scanned:%lu" 3003 " pages_scanned:%lu"
3004 " all_unreclaimable? %s" 3004 " all_unreclaimable? %s"
3005 "\n", 3005 "\n",
3006 zone->name, 3006 zone->name,
3007 K(zone_page_state(zone, NR_FREE_PAGES)), 3007 K(zone_page_state(zone, NR_FREE_PAGES)),
3008 K(min_wmark_pages(zone)), 3008 K(min_wmark_pages(zone)),
3009 K(low_wmark_pages(zone)), 3009 K(low_wmark_pages(zone)),
3010 K(high_wmark_pages(zone)), 3010 K(high_wmark_pages(zone)),
3011 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3011 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3012 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3012 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3013 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3013 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3014 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3014 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3015 K(zone_page_state(zone, NR_UNEVICTABLE)), 3015 K(zone_page_state(zone, NR_UNEVICTABLE)),
3016 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3016 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3017 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3017 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3018 K(zone->present_pages), 3018 K(zone->present_pages),
3019 K(zone->managed_pages), 3019 K(zone->managed_pages),
3020 K(zone_page_state(zone, NR_MLOCK)), 3020 K(zone_page_state(zone, NR_MLOCK)),
3021 K(zone_page_state(zone, NR_FILE_DIRTY)), 3021 K(zone_page_state(zone, NR_FILE_DIRTY)),
3022 K(zone_page_state(zone, NR_WRITEBACK)), 3022 K(zone_page_state(zone, NR_WRITEBACK)),
3023 K(zone_page_state(zone, NR_FILE_MAPPED)), 3023 K(zone_page_state(zone, NR_FILE_MAPPED)),
3024 K(zone_page_state(zone, NR_SHMEM)), 3024 K(zone_page_state(zone, NR_SHMEM)),
3025 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3025 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3026 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3026 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3027 zone_page_state(zone, NR_KERNEL_STACK) * 3027 zone_page_state(zone, NR_KERNEL_STACK) *
3028 THREAD_SIZE / 1024, 3028 THREAD_SIZE / 1024,
3029 K(zone_page_state(zone, NR_PAGETABLE)), 3029 K(zone_page_state(zone, NR_PAGETABLE)),
3030 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3030 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3031 K(zone_page_state(zone, NR_BOUNCE)), 3031 K(zone_page_state(zone, NR_BOUNCE)),
3032 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3032 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3033 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3033 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3034 zone->pages_scanned, 3034 zone->pages_scanned,
3035 (zone->all_unreclaimable ? "yes" : "no") 3035 (zone->all_unreclaimable ? "yes" : "no")
3036 ); 3036 );
3037 printk("lowmem_reserve[]:"); 3037 printk("lowmem_reserve[]:");
3038 for (i = 0; i < MAX_NR_ZONES; i++) 3038 for (i = 0; i < MAX_NR_ZONES; i++)
3039 printk(" %lu", zone->lowmem_reserve[i]); 3039 printk(" %lu", zone->lowmem_reserve[i]);
3040 printk("\n"); 3040 printk("\n");
3041 } 3041 }
3042 3042
3043 for_each_populated_zone(zone) { 3043 for_each_populated_zone(zone) {
3044 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3044 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3045 unsigned char types[MAX_ORDER]; 3045 unsigned char types[MAX_ORDER];
3046 3046
3047 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3047 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3048 continue; 3048 continue;
3049 show_node(zone); 3049 show_node(zone);
3050 printk("%s: ", zone->name); 3050 printk("%s: ", zone->name);
3051 3051
3052 spin_lock_irqsave(&zone->lock, flags); 3052 spin_lock_irqsave(&zone->lock, flags);
3053 for (order = 0; order < MAX_ORDER; order++) { 3053 for (order = 0; order < MAX_ORDER; order++) {
3054 struct free_area *area = &zone->free_area[order]; 3054 struct free_area *area = &zone->free_area[order];
3055 int type; 3055 int type;
3056 3056
3057 nr[order] = area->nr_free; 3057 nr[order] = area->nr_free;
3058 total += nr[order] << order; 3058 total += nr[order] << order;
3059 3059
3060 types[order] = 0; 3060 types[order] = 0;
3061 for (type = 0; type < MIGRATE_TYPES; type++) { 3061 for (type = 0; type < MIGRATE_TYPES; type++) {
3062 if (!list_empty(&area->free_list[type])) 3062 if (!list_empty(&area->free_list[type]))
3063 types[order] |= 1 << type; 3063 types[order] |= 1 << type;
3064 } 3064 }
3065 } 3065 }
3066 spin_unlock_irqrestore(&zone->lock, flags); 3066 spin_unlock_irqrestore(&zone->lock, flags);
3067 for (order = 0; order < MAX_ORDER; order++) { 3067 for (order = 0; order < MAX_ORDER; order++) {
3068 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3068 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3069 if (nr[order]) 3069 if (nr[order])
3070 show_migration_types(types[order]); 3070 show_migration_types(types[order]);
3071 } 3071 }
3072 printk("= %lukB\n", K(total)); 3072 printk("= %lukB\n", K(total));
3073 } 3073 }
3074 3074
3075 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3075 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3076 3076
3077 show_swap_cache_info(); 3077 show_swap_cache_info();
3078 } 3078 }
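An editorial sketch of how this dump is usually triggered: callers normally go through show_mem(), which prints a summary around show_free_areas(), passing SHOW_MEM_FILTER_NODES when output should be limited to the current cpuset's nodes. The wrapper below is hypothetical.

#include <linux/mm.h>

static void example_dump_mm_state(void)	/* hypothetical */
{
	/* prints per-zone counters, skipping nodes outside the current cpuset */
	show_mem(SHOW_MEM_FILTER_NODES);
}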
3079 3079
3080 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3080 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3081 { 3081 {
3082 zoneref->zone = zone; 3082 zoneref->zone = zone;
3083 zoneref->zone_idx = zone_idx(zone); 3083 zoneref->zone_idx = zone_idx(zone);
3084 } 3084 }
3085 3085
3086 /* 3086 /*
3087 * Builds allocation fallback zone lists. 3087 * Builds allocation fallback zone lists.
3088 * 3088 *
3089 * Add all populated zones of a node to the zonelist. 3089 * Add all populated zones of a node to the zonelist.
3090 */ 3090 */
3091 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3091 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3092 int nr_zones, enum zone_type zone_type) 3092 int nr_zones, enum zone_type zone_type)
3093 { 3093 {
3094 struct zone *zone; 3094 struct zone *zone;
3095 3095
3096 BUG_ON(zone_type >= MAX_NR_ZONES); 3096 BUG_ON(zone_type >= MAX_NR_ZONES);
3097 zone_type++; 3097 zone_type++;
3098 3098
3099 do { 3099 do {
3100 zone_type--; 3100 zone_type--;
3101 zone = pgdat->node_zones + zone_type; 3101 zone = pgdat->node_zones + zone_type;
3102 if (populated_zone(zone)) { 3102 if (populated_zone(zone)) {
3103 zoneref_set_zone(zone, 3103 zoneref_set_zone(zone,
3104 &zonelist->_zonerefs[nr_zones++]); 3104 &zonelist->_zonerefs[nr_zones++]);
3105 check_highest_zone(zone_type); 3105 check_highest_zone(zone_type);
3106 } 3106 }
3107 3107
3108 } while (zone_type); 3108 } while (zone_type);
3109 return nr_zones; 3109 return nr_zones;
3110 } 3110 }
3111 3111
3112 3112
3113 /* 3113 /*
3114 * zonelist_order: 3114 * zonelist_order:
3115 * 0 = automatic detection of better ordering. 3115 * 0 = automatic detection of better ordering.
3116 * 1 = order by ([node] distance, -zonetype) 3116 * 1 = order by ([node] distance, -zonetype)
3117 * 2 = order by (-zonetype, [node] distance) 3117 * 2 = order by (-zonetype, [node] distance)
3118 * 3118 *
3119 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3119 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3120 * the same zonelist. So only NUMA can configure this param. 3120 * the same zonelist. So only NUMA can configure this param.
3121 */ 3121 */
3122 #define ZONELIST_ORDER_DEFAULT 0 3122 #define ZONELIST_ORDER_DEFAULT 0
3123 #define ZONELIST_ORDER_NODE 1 3123 #define ZONELIST_ORDER_NODE 1
3124 #define ZONELIST_ORDER_ZONE 2 3124 #define ZONELIST_ORDER_ZONE 2
3125 3125
3126 /* zonelist order in the kernel. 3126 /* zonelist order in the kernel.
3127 * set_zonelist_order() will set this to NODE or ZONE. 3127 * set_zonelist_order() will set this to NODE or ZONE.
3128 */ 3128 */
3129 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3129 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3130 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3130 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3131 3131
3132 3132
3133 #ifdef CONFIG_NUMA 3133 #ifdef CONFIG_NUMA
3134 /* The value the user specified; may be changed by config */ 3134 /* The value the user specified; may be changed by config */
3135 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3135 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3136 /* string for sysctl */ 3136 /* string for sysctl */
3137 #define NUMA_ZONELIST_ORDER_LEN 16 3137 #define NUMA_ZONELIST_ORDER_LEN 16
3138 char numa_zonelist_order[16] = "default"; 3138 char numa_zonelist_order[16] = "default";
3139 3139
3140 /* 3140 /*
3141 * interface for configuring zonelist ordering. 3141 * interface for configuring zonelist ordering.
3142 * command line option "numa_zonelist_order" 3142 * command line option "numa_zonelist_order"
3143 * = "[dD]efault" - default, automatic configuration. 3143 * = "[dD]efault" - default, automatic configuration.
3144 * = "[nN]ode" - order by node locality, then by zone within node 3144 * = "[nN]ode" - order by node locality, then by zone within node
3145 * = "[zZ]one" - order by zone, then by locality within zone 3145 * = "[zZ]one" - order by zone, then by locality within zone
3146 */ 3146 */
3147 3147
3148 static int __parse_numa_zonelist_order(char *s) 3148 static int __parse_numa_zonelist_order(char *s)
3149 { 3149 {
3150 if (*s == 'd' || *s == 'D') { 3150 if (*s == 'd' || *s == 'D') {
3151 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3151 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3152 } else if (*s == 'n' || *s == 'N') { 3152 } else if (*s == 'n' || *s == 'N') {
3153 user_zonelist_order = ZONELIST_ORDER_NODE; 3153 user_zonelist_order = ZONELIST_ORDER_NODE;
3154 } else if (*s == 'z' || *s == 'Z') { 3154 } else if (*s == 'z' || *s == 'Z') {
3155 user_zonelist_order = ZONELIST_ORDER_ZONE; 3155 user_zonelist_order = ZONELIST_ORDER_ZONE;
3156 } else { 3156 } else {
3157 printk(KERN_WARNING 3157 printk(KERN_WARNING
3158 "Ignoring invalid numa_zonelist_order value: " 3158 "Ignoring invalid numa_zonelist_order value: "
3159 "%s\n", s); 3159 "%s\n", s);
3160 return -EINVAL; 3160 return -EINVAL;
3161 } 3161 }
3162 return 0; 3162 return 0;
3163 } 3163 }
3164 3164
3165 static __init int setup_numa_zonelist_order(char *s) 3165 static __init int setup_numa_zonelist_order(char *s)
3166 { 3166 {
3167 int ret; 3167 int ret;
3168 3168
3169 if (!s) 3169 if (!s)
3170 return 0; 3170 return 0;
3171 3171
3172 ret = __parse_numa_zonelist_order(s); 3172 ret = __parse_numa_zonelist_order(s);
3173 if (ret == 0) 3173 if (ret == 0)
3174 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3174 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3175 3175
3176 return ret; 3176 return ret;
3177 } 3177 }
3178 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3178 early_param("numa_zonelist_order", setup_numa_zonelist_order);
3179 3179
3180 /* 3180 /*
3181 * sysctl handler for numa_zonelist_order 3181 * sysctl handler for numa_zonelist_order
3182 */ 3182 */
3183 int numa_zonelist_order_handler(ctl_table *table, int write, 3183 int numa_zonelist_order_handler(ctl_table *table, int write,
3184 void __user *buffer, size_t *length, 3184 void __user *buffer, size_t *length,
3185 loff_t *ppos) 3185 loff_t *ppos)
3186 { 3186 {
3187 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3187 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3188 int ret; 3188 int ret;
3189 static DEFINE_MUTEX(zl_order_mutex); 3189 static DEFINE_MUTEX(zl_order_mutex);
3190 3190
3191 mutex_lock(&zl_order_mutex); 3191 mutex_lock(&zl_order_mutex);
3192 if (write) 3192 if (write)
3193 strcpy(saved_string, (char*)table->data); 3193 strcpy(saved_string, (char*)table->data);
3194 ret = proc_dostring(table, write, buffer, length, ppos); 3194 ret = proc_dostring(table, write, buffer, length, ppos);
3195 if (ret) 3195 if (ret)
3196 goto out; 3196 goto out;
3197 if (write) { 3197 if (write) {
3198 int oldval = user_zonelist_order; 3198 int oldval = user_zonelist_order;
3199 if (__parse_numa_zonelist_order((char*)table->data)) { 3199 if (__parse_numa_zonelist_order((char*)table->data)) {
3200 /* 3200 /*
3201 * bogus value. restore saved string 3201 * bogus value. restore saved string
3202 */ 3202 */
3203 strncpy((char*)table->data, saved_string, 3203 strncpy((char*)table->data, saved_string,
3204 NUMA_ZONELIST_ORDER_LEN); 3204 NUMA_ZONELIST_ORDER_LEN);
3205 user_zonelist_order = oldval; 3205 user_zonelist_order = oldval;
3206 } else if (oldval != user_zonelist_order) { 3206 } else if (oldval != user_zonelist_order) {
3207 mutex_lock(&zonelists_mutex); 3207 mutex_lock(&zonelists_mutex);
3208 build_all_zonelists(NULL, NULL); 3208 build_all_zonelists(NULL, NULL);
3209 mutex_unlock(&zonelists_mutex); 3209 mutex_unlock(&zonelists_mutex);
3210 } 3210 }
3211 } 3211 }
3212 out: 3212 out:
3213 mutex_unlock(&zl_order_mutex); 3213 mutex_unlock(&zl_order_mutex);
3214 return ret; 3214 return ret;
3215 } 3215 }
3216 3216
3217 3217
3218 #define MAX_NODE_LOAD (nr_online_nodes) 3218 #define MAX_NODE_LOAD (nr_online_nodes)
3219 static int node_load[MAX_NUMNODES]; 3219 static int node_load[MAX_NUMNODES];
3220 3220
3221 /** 3221 /**
3222 * find_next_best_node - find the next node that should appear in a given node's fallback list 3222 * find_next_best_node - find the next node that should appear in a given node's fallback list
3223 * @node: node whose fallback list we're appending 3223 * @node: node whose fallback list we're appending
3224 * @used_node_mask: nodemask_t of already used nodes 3224 * @used_node_mask: nodemask_t of already used nodes
3225 * 3225 *
3226 * We use a number of factors to determine which is the next node that should 3226 * We use a number of factors to determine which is the next node that should
3227 * appear on a given node's fallback list. The node should not have appeared 3227 * appear on a given node's fallback list. The node should not have appeared
3228 * already in @node's fallback list, and it should be the next closest node 3228 * already in @node's fallback list, and it should be the next closest node
3229 * according to the distance array (which contains arbitrary distance values 3229 * according to the distance array (which contains arbitrary distance values
3230 * from each node to each node in the system), and should also prefer nodes 3230 * from each node to each node in the system), and should also prefer nodes
3231 * with no CPUs, since presumably they'll have very little allocation pressure 3231 * with no CPUs, since presumably they'll have very little allocation pressure
3232 * on them otherwise. 3232 * on them otherwise.
3233 * It returns -1 if no node is found. 3233 * It returns -1 if no node is found.
3234 */ 3234 */
3235 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3235 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3236 { 3236 {
3237 int n, val; 3237 int n, val;
3238 int min_val = INT_MAX; 3238 int min_val = INT_MAX;
3239 int best_node = -1; 3239 int best_node = -1;
3240 const struct cpumask *tmp = cpumask_of_node(0); 3240 const struct cpumask *tmp = cpumask_of_node(0);
3241 3241
3242 /* Use the local node if we haven't already */ 3242 /* Use the local node if we haven't already */
3243 if (!node_isset(node, *used_node_mask)) { 3243 if (!node_isset(node, *used_node_mask)) {
3244 node_set(node, *used_node_mask); 3244 node_set(node, *used_node_mask);
3245 return node; 3245 return node;
3246 } 3246 }
3247 3247
3248 for_each_node_state(n, N_MEMORY) { 3248 for_each_node_state(n, N_MEMORY) {
3249 3249
3250 /* Don't want a node to appear more than once */ 3250 /* Don't want a node to appear more than once */
3251 if (node_isset(n, *used_node_mask)) 3251 if (node_isset(n, *used_node_mask))
3252 continue; 3252 continue;
3253 3253
3254 /* Use the distance array to find the distance */ 3254 /* Use the distance array to find the distance */
3255 val = node_distance(node, n); 3255 val = node_distance(node, n);
3256 3256
3257 /* Penalize nodes under us ("prefer the next node") */ 3257 /* Penalize nodes under us ("prefer the next node") */
3258 val += (n < node); 3258 val += (n < node);
3259 3259
3260 /* Give preference to headless and unused nodes */ 3260 /* Give preference to headless and unused nodes */
3261 tmp = cpumask_of_node(n); 3261 tmp = cpumask_of_node(n);
3262 if (!cpumask_empty(tmp)) 3262 if (!cpumask_empty(tmp))
3263 val += PENALTY_FOR_NODE_WITH_CPUS; 3263 val += PENALTY_FOR_NODE_WITH_CPUS;
3264 3264
3265 /* Slight preference for less loaded node */ 3265 /* Slight preference for less loaded node */
3266 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3266 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3267 val += node_load[n]; 3267 val += node_load[n];
3268 3268
3269 if (val < min_val) { 3269 if (val < min_val) {
3270 min_val = val; 3270 min_val = val;
3271 best_node = n; 3271 best_node = n;
3272 } 3272 }
3273 } 3273 }
3274 3274
3275 if (best_node >= 0) 3275 if (best_node >= 0)
3276 node_set(best_node, *used_node_mask); 3276 node_set(best_node, *used_node_mask);
3277 3277
3278 return best_node; 3278 return best_node;
3279 } 3279 }
3280 3280
3281 3281
3282 /* 3282 /*
3283 * Build zonelists ordered by node and zones within node. 3283 * Build zonelists ordered by node and zones within node.
3284 * This results in maximum locality--normal zone overflows into local 3284 * This results in maximum locality--normal zone overflows into local
3285 * DMA zone, if any--but risks exhausting DMA zone. 3285 * DMA zone, if any--but risks exhausting DMA zone.
3286 */ 3286 */
3287 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3287 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3288 { 3288 {
3289 int j; 3289 int j;
3290 struct zonelist *zonelist; 3290 struct zonelist *zonelist;
3291 3291
3292 zonelist = &pgdat->node_zonelists[0]; 3292 zonelist = &pgdat->node_zonelists[0];
3293 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3293 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3294 ; 3294 ;
3295 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3295 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3296 MAX_NR_ZONES - 1); 3296 MAX_NR_ZONES - 1);
3297 zonelist->_zonerefs[j].zone = NULL; 3297 zonelist->_zonerefs[j].zone = NULL;
3298 zonelist->_zonerefs[j].zone_idx = 0; 3298 zonelist->_zonerefs[j].zone_idx = 0;
3299 } 3299 }
3300 3300
3301 /* 3301 /*
3302 * Build gfp_thisnode zonelists 3302 * Build gfp_thisnode zonelists
3303 */ 3303 */
3304 static void build_thisnode_zonelists(pg_data_t *pgdat) 3304 static void build_thisnode_zonelists(pg_data_t *pgdat)
3305 { 3305 {
3306 int j; 3306 int j;
3307 struct zonelist *zonelist; 3307 struct zonelist *zonelist;
3308 3308
3309 zonelist = &pgdat->node_zonelists[1]; 3309 zonelist = &pgdat->node_zonelists[1];
3310 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3310 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3311 zonelist->_zonerefs[j].zone = NULL; 3311 zonelist->_zonerefs[j].zone = NULL;
3312 zonelist->_zonerefs[j].zone_idx = 0; 3312 zonelist->_zonerefs[j].zone_idx = 0;
3313 } 3313 }
3314 3314
3315 /* 3315 /*
3316 * Build zonelists ordered by zone and nodes within zones. 3316 * Build zonelists ordered by zone and nodes within zones.
3317 * This results in conserving DMA zone[s] until all Normal memory is 3317 * This results in conserving DMA zone[s] until all Normal memory is
3318 * exhausted, but results in overflowing to remote node while memory 3318 * exhausted, but results in overflowing to remote node while memory
3319 * may still exist in local DMA zone. 3319 * may still exist in local DMA zone.
3320 */ 3320 */
3321 static int node_order[MAX_NUMNODES]; 3321 static int node_order[MAX_NUMNODES];
3322 3322
3323 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3323 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3324 { 3324 {
3325 int pos, j, node; 3325 int pos, j, node;
3326 int zone_type; /* needs to be signed */ 3326 int zone_type; /* needs to be signed */
3327 struct zone *z; 3327 struct zone *z;
3328 struct zonelist *zonelist; 3328 struct zonelist *zonelist;
3329 3329
3330 zonelist = &pgdat->node_zonelists[0]; 3330 zonelist = &pgdat->node_zonelists[0];
3331 pos = 0; 3331 pos = 0;
3332 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3332 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3333 for (j = 0; j < nr_nodes; j++) { 3333 for (j = 0; j < nr_nodes; j++) {
3334 node = node_order[j]; 3334 node = node_order[j];
3335 z = &NODE_DATA(node)->node_zones[zone_type]; 3335 z = &NODE_DATA(node)->node_zones[zone_type];
3336 if (populated_zone(z)) { 3336 if (populated_zone(z)) {
3337 zoneref_set_zone(z, 3337 zoneref_set_zone(z,
3338 &zonelist->_zonerefs[pos++]); 3338 &zonelist->_zonerefs[pos++]);
3339 check_highest_zone(zone_type); 3339 check_highest_zone(zone_type);
3340 } 3340 }
3341 } 3341 }
3342 } 3342 }
3343 zonelist->_zonerefs[pos].zone = NULL; 3343 zonelist->_zonerefs[pos].zone = NULL;
3344 zonelist->_zonerefs[pos].zone_idx = 0; 3344 zonelist->_zonerefs[pos].zone_idx = 0;
3345 } 3345 }
3346 3346
3347 static int default_zonelist_order(void) 3347 static int default_zonelist_order(void)
3348 { 3348 {
3349 int nid, zone_type; 3349 int nid, zone_type;
3350 unsigned long low_kmem_size, total_size; 3350 unsigned long low_kmem_size, total_size;
3351 struct zone *z; 3351 struct zone *z;
3352 int average_size; 3352 int average_size;
3353 /* 3353 /*
3354 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. 3354 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
3355 * If they are really small and used heavily, the system can fall 3355 * If they are really small and used heavily, the system can fall
3356 * into OOM very easily. 3356 * into OOM very easily.
3357 * This function detects the ZONE_DMA/DMA32 size and configures the zone order. 3357 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
3358 */ 3358 */
3359 /* Is there a ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */ 3359 /* Is there a ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */
3360 low_kmem_size = 0; 3360 low_kmem_size = 0;
3361 total_size = 0; 3361 total_size = 0;
3362 for_each_online_node(nid) { 3362 for_each_online_node(nid) {
3363 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3363 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3364 z = &NODE_DATA(nid)->node_zones[zone_type]; 3364 z = &NODE_DATA(nid)->node_zones[zone_type];
3365 if (populated_zone(z)) { 3365 if (populated_zone(z)) {
3366 if (zone_type < ZONE_NORMAL) 3366 if (zone_type < ZONE_NORMAL)
3367 low_kmem_size += z->present_pages; 3367 low_kmem_size += z->present_pages;
3368 total_size += z->present_pages; 3368 total_size += z->present_pages;
3369 } else if (zone_type == ZONE_NORMAL) { 3369 } else if (zone_type == ZONE_NORMAL) {
3370 /* 3370 /*
3371 * If any node has only lowmem, then node order 3371 * If any node has only lowmem, then node order
3372 * is preferred to allow kernel allocations 3372 * is preferred to allow kernel allocations
3373 * locally; otherwise, they can easily infringe 3373 * locally; otherwise, they can easily infringe
3374 * on other nodes when there is an abundance of 3374 * on other nodes when there is an abundance of
3375 * lowmem available to allocate from. 3375 * lowmem available to allocate from.
3376 */ 3376 */
3377 return ZONELIST_ORDER_NODE; 3377 return ZONELIST_ORDER_NODE;
3378 } 3378 }
3379 } 3379 }
3380 } 3380 }
3381 if (!low_kmem_size || /* there is no DMA area. */ 3381 if (!low_kmem_size || /* there is no DMA area. */
3382 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3382 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3383 return ZONELIST_ORDER_NODE; 3383 return ZONELIST_ORDER_NODE;
3384 /* 3384 /*
3385 * look into each node's config. 3385 * look into each node's config.
3386 * If there is a node whose DMA/DMA32 memory covers a very large part of 3386 * If there is a node whose DMA/DMA32 memory covers a very large part of
3387 * its local memory, NODE_ORDER may be suitable. 3387 * its local memory, NODE_ORDER may be suitable.
3388 */ 3388 */
3389 average_size = total_size / 3389 average_size = total_size /
3390 (nodes_weight(node_states[N_MEMORY]) + 1); 3390 (nodes_weight(node_states[N_MEMORY]) + 1);
3391 for_each_online_node(nid) { 3391 for_each_online_node(nid) {
3392 low_kmem_size = 0; 3392 low_kmem_size = 0;
3393 total_size = 0; 3393 total_size = 0;
3394 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3394 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3395 z = &NODE_DATA(nid)->node_zones[zone_type]; 3395 z = &NODE_DATA(nid)->node_zones[zone_type];
3396 if (populated_zone(z)) { 3396 if (populated_zone(z)) {
3397 if (zone_type < ZONE_NORMAL) 3397 if (zone_type < ZONE_NORMAL)
3398 low_kmem_size += z->present_pages; 3398 low_kmem_size += z->present_pages;
3399 total_size += z->present_pages; 3399 total_size += z->present_pages;
3400 } 3400 }
3401 } 3401 }
3402 if (low_kmem_size && 3402 if (low_kmem_size &&
3403 total_size > average_size && /* ignore small node */ 3403 total_size > average_size && /* ignore small node */
3404 low_kmem_size > total_size * 70/100) 3404 low_kmem_size > total_size * 70/100)
3405 return ZONELIST_ORDER_NODE; 3405 return ZONELIST_ORDER_NODE;
3406 } 3406 }
3407 return ZONELIST_ORDER_ZONE; 3407 return ZONELIST_ORDER_ZONE;
3408 } 3408 }
3409 3409
3410 static void set_zonelist_order(void) 3410 static void set_zonelist_order(void)
3411 { 3411 {
3412 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3412 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3413 current_zonelist_order = default_zonelist_order(); 3413 current_zonelist_order = default_zonelist_order();
3414 else 3414 else
3415 current_zonelist_order = user_zonelist_order; 3415 current_zonelist_order = user_zonelist_order;
3416 } 3416 }
3417 3417
3418 static void build_zonelists(pg_data_t *pgdat) 3418 static void build_zonelists(pg_data_t *pgdat)
3419 { 3419 {
3420 int j, node, load; 3420 int j, node, load;
3421 enum zone_type i; 3421 enum zone_type i;
3422 nodemask_t used_mask; 3422 nodemask_t used_mask;
3423 int local_node, prev_node; 3423 int local_node, prev_node;
3424 struct zonelist *zonelist; 3424 struct zonelist *zonelist;
3425 int order = current_zonelist_order; 3425 int order = current_zonelist_order;
3426 3426
3427 /* initialize zonelists */ 3427 /* initialize zonelists */
3428 for (i = 0; i < MAX_ZONELISTS; i++) { 3428 for (i = 0; i < MAX_ZONELISTS; i++) {
3429 zonelist = pgdat->node_zonelists + i; 3429 zonelist = pgdat->node_zonelists + i;
3430 zonelist->_zonerefs[0].zone = NULL; 3430 zonelist->_zonerefs[0].zone = NULL;
3431 zonelist->_zonerefs[0].zone_idx = 0; 3431 zonelist->_zonerefs[0].zone_idx = 0;
3432 } 3432 }
3433 3433
3434 /* NUMA-aware ordering of nodes */ 3434 /* NUMA-aware ordering of nodes */
3435 local_node = pgdat->node_id; 3435 local_node = pgdat->node_id;
3436 load = nr_online_nodes; 3436 load = nr_online_nodes;
3437 prev_node = local_node; 3437 prev_node = local_node;
3438 nodes_clear(used_mask); 3438 nodes_clear(used_mask);
3439 3439
3440 memset(node_order, 0, sizeof(node_order)); 3440 memset(node_order, 0, sizeof(node_order));
3441 j = 0; 3441 j = 0;
3442 3442
3443 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3443 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3444 /* 3444 /*
3445 * We don't want to pressure a particular node. 3445 * We don't want to pressure a particular node.
3446 * So we add a penalty to the first node in the same 3446 * So we add a penalty to the first node in the same
3447 * distance group to make it round-robin. 3447 * distance group to make it round-robin.
3448 */ 3448 */
3449 if (node_distance(local_node, node) != 3449 if (node_distance(local_node, node) !=
3450 node_distance(local_node, prev_node)) 3450 node_distance(local_node, prev_node))
3451 node_load[node] = load; 3451 node_load[node] = load;
3452 3452
3453 prev_node = node; 3453 prev_node = node;
3454 load--; 3454 load--;
3455 if (order == ZONELIST_ORDER_NODE) 3455 if (order == ZONELIST_ORDER_NODE)
3456 build_zonelists_in_node_order(pgdat, node); 3456 build_zonelists_in_node_order(pgdat, node);
3457 else 3457 else
3458 node_order[j++] = node; /* remember order */ 3458 node_order[j++] = node; /* remember order */
3459 } 3459 }
3460 3460
3461 if (order == ZONELIST_ORDER_ZONE) { 3461 if (order == ZONELIST_ORDER_ZONE) {
3462 /* calculate node order -- i.e., DMA last! */ 3462 /* calculate node order -- i.e., DMA last! */
3463 build_zonelists_in_zone_order(pgdat, j); 3463 build_zonelists_in_zone_order(pgdat, j);
3464 } 3464 }
3465 3465
3466 build_thisnode_zonelists(pgdat); 3466 build_thisnode_zonelists(pgdat);
3467 } 3467 }
3468 3468
3469 /* Construct the zonelist performance cache - see further mmzone.h */ 3469 /* Construct the zonelist performance cache - see further mmzone.h */
3470 static void build_zonelist_cache(pg_data_t *pgdat) 3470 static void build_zonelist_cache(pg_data_t *pgdat)
3471 { 3471 {
3472 struct zonelist *zonelist; 3472 struct zonelist *zonelist;
3473 struct zonelist_cache *zlc; 3473 struct zonelist_cache *zlc;
3474 struct zoneref *z; 3474 struct zoneref *z;
3475 3475
3476 zonelist = &pgdat->node_zonelists[0]; 3476 zonelist = &pgdat->node_zonelists[0];
3477 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3477 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3478 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3478 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3479 for (z = zonelist->_zonerefs; z->zone; z++) 3479 for (z = zonelist->_zonerefs; z->zone; z++)
3480 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3480 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3481 } 3481 }
3482 3482
3483 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3483 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3484 /* 3484 /*
3485 * Return node id of node used for "local" allocations. 3485 * Return node id of node used for "local" allocations.
3486 * I.e., first node id of first zone in arg node's generic zonelist. 3486 * I.e., first node id of first zone in arg node's generic zonelist.
3487 * Used for initializing percpu 'numa_mem', which is used primarily 3487 * Used for initializing percpu 'numa_mem', which is used primarily
3488 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3488 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3489 */ 3489 */
3490 int local_memory_node(int node) 3490 int local_memory_node(int node)
3491 { 3491 {
3492 struct zone *zone; 3492 struct zone *zone;
3493 3493
3494 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3494 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3495 gfp_zone(GFP_KERNEL), 3495 gfp_zone(GFP_KERNEL),
3496 NULL, 3496 NULL,
3497 &zone); 3497 &zone);
3498 return zone->node; 3498 return zone->node;
3499 } 3499 }
3500 #endif 3500 #endif
3501 3501
3502 #else /* CONFIG_NUMA */ 3502 #else /* CONFIG_NUMA */
3503 3503
3504 static void set_zonelist_order(void) 3504 static void set_zonelist_order(void)
3505 { 3505 {
3506 current_zonelist_order = ZONELIST_ORDER_ZONE; 3506 current_zonelist_order = ZONELIST_ORDER_ZONE;
3507 } 3507 }
3508 3508
3509 static void build_zonelists(pg_data_t *pgdat) 3509 static void build_zonelists(pg_data_t *pgdat)
3510 { 3510 {
3511 int node, local_node; 3511 int node, local_node;
3512 enum zone_type j; 3512 enum zone_type j;
3513 struct zonelist *zonelist; 3513 struct zonelist *zonelist;
3514 3514
3515 local_node = pgdat->node_id; 3515 local_node = pgdat->node_id;
3516 3516
3517 zonelist = &pgdat->node_zonelists[0]; 3517 zonelist = &pgdat->node_zonelists[0];
3518 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3518 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3519 3519
3520 /* 3520 /*
3521 * Now we build the zonelist so that it contains the zones 3521 * Now we build the zonelist so that it contains the zones
3522 * of all the other nodes. 3522 * of all the other nodes.
3523 * We don't want to pressure a particular node, so when 3523 * We don't want to pressure a particular node, so when
3524 * building the zones for node N, we make sure that the 3524 * building the zones for node N, we make sure that the
3525 * zones coming right after the local ones are those from 3525 * zones coming right after the local ones are those from
3526 * node N+1 (modulo N) 3526 * node N+1 (modulo N)
3527 */ 3527 */
3528 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3528 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3529 if (!node_online(node)) 3529 if (!node_online(node))
3530 continue; 3530 continue;
3531 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3531 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3532 MAX_NR_ZONES - 1); 3532 MAX_NR_ZONES - 1);
3533 } 3533 }
3534 for (node = 0; node < local_node; node++) { 3534 for (node = 0; node < local_node; node++) {
3535 if (!node_online(node)) 3535 if (!node_online(node))
3536 continue; 3536 continue;
3537 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3537 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3538 MAX_NR_ZONES - 1); 3538 MAX_NR_ZONES - 1);
3539 } 3539 }
3540 3540
3541 zonelist->_zonerefs[j].zone = NULL; 3541 zonelist->_zonerefs[j].zone = NULL;
3542 zonelist->_zonerefs[j].zone_idx = 0; 3542 zonelist->_zonerefs[j].zone_idx = 0;
3543 } 3543 }
3544 3544
3545 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3545 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3546 static void build_zonelist_cache(pg_data_t *pgdat) 3546 static void build_zonelist_cache(pg_data_t *pgdat)
3547 { 3547 {
3548 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3548 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3549 } 3549 }
3550 3550
3551 #endif /* CONFIG_NUMA */ 3551 #endif /* CONFIG_NUMA */
3552 3552
3553 /* 3553 /*
3554 * Boot pageset table. One per cpu which is going to be used for all 3554 * Boot pageset table. One per cpu which is going to be used for all
3555 * zones and all nodes. The parameters will be set in such a way 3555 * zones and all nodes. The parameters will be set in such a way
3556 * that an item put on a list will immediately be handed over to 3556 * that an item put on a list will immediately be handed over to
3557 * the buddy list. This is safe since pageset manipulation is done 3557 * the buddy list. This is safe since pageset manipulation is done
3558 * with interrupts disabled. 3558 * with interrupts disabled.
3559 * 3559 *
3560 * The boot_pagesets must be kept even after bootup is complete for 3560 * The boot_pagesets must be kept even after bootup is complete for
3561 * unused processors and/or zones. They do play a role for bootstrapping 3561 * unused processors and/or zones. They do play a role for bootstrapping
3562 * hotplugged processors. 3562 * hotplugged processors.
3563 * 3563 *
3564 * zoneinfo_show() and maybe other functions do 3564 * zoneinfo_show() and maybe other functions do
3565 * not check if the processor is online before following the pageset pointer. 3565 * not check if the processor is online before following the pageset pointer.
3566 * Other parts of the kernel may not check if the zone is available. 3566 * Other parts of the kernel may not check if the zone is available.
3567 */ 3567 */
3568 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3568 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3569 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3569 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3570 static void setup_zone_pageset(struct zone *zone); 3570 static void setup_zone_pageset(struct zone *zone);
3571 3571
3572 /* 3572 /*
3573 * Global mutex to protect against size modification of zonelists 3573 * Global mutex to protect against size modification of zonelists
3574 * as well as to serialize pageset setup for the new populated zone. 3574 * as well as to serialize pageset setup for the new populated zone.
3575 */ 3575 */
3576 DEFINE_MUTEX(zonelists_mutex); 3576 DEFINE_MUTEX(zonelists_mutex);
3577 3577
3578 /* The return value is int just for stop_machine() */ 3578 /* The return value is int just for stop_machine() */
3579 static int __build_all_zonelists(void *data) 3579 static int __build_all_zonelists(void *data)
3580 { 3580 {
3581 int nid; 3581 int nid;
3582 int cpu; 3582 int cpu;
3583 pg_data_t *self = data; 3583 pg_data_t *self = data;
3584 3584
3585 #ifdef CONFIG_NUMA 3585 #ifdef CONFIG_NUMA
3586 memset(node_load, 0, sizeof(node_load)); 3586 memset(node_load, 0, sizeof(node_load));
3587 #endif 3587 #endif
3588 3588
3589 if (self && !node_online(self->node_id)) { 3589 if (self && !node_online(self->node_id)) {
3590 build_zonelists(self); 3590 build_zonelists(self);
3591 build_zonelist_cache(self); 3591 build_zonelist_cache(self);
3592 } 3592 }
3593 3593
3594 for_each_online_node(nid) { 3594 for_each_online_node(nid) {
3595 pg_data_t *pgdat = NODE_DATA(nid); 3595 pg_data_t *pgdat = NODE_DATA(nid);
3596 3596
3597 build_zonelists(pgdat); 3597 build_zonelists(pgdat);
3598 build_zonelist_cache(pgdat); 3598 build_zonelist_cache(pgdat);
3599 } 3599 }
3600 3600
3601 /* 3601 /*
3602 * Initialize the boot_pagesets that are going to be used 3602 * Initialize the boot_pagesets that are going to be used
3603 * for bootstrapping processors. The real pagesets for 3603 * for bootstrapping processors. The real pagesets for
3604 * each zone will be allocated later when the per cpu 3604 * each zone will be allocated later when the per cpu
3605 * allocator is available. 3605 * allocator is available.
3606 * 3606 *
3607 * boot_pagesets are used also for bootstrapping offline 3607 * boot_pagesets are used also for bootstrapping offline
3608 * cpus if the system is already booted because the pagesets 3608 * cpus if the system is already booted because the pagesets
3609 * are needed to initialize allocators on a specific cpu too. 3609 * are needed to initialize allocators on a specific cpu too.
3610 * E.g. the percpu allocator needs the page allocator which 3610 * E.g. the percpu allocator needs the page allocator which
3611 * needs the percpu allocator in order to allocate its pagesets 3611 * needs the percpu allocator in order to allocate its pagesets
3612 * (a chicken-egg dilemma). 3612 * (a chicken-egg dilemma).
3613 */ 3613 */
3614 for_each_possible_cpu(cpu) { 3614 for_each_possible_cpu(cpu) {
3615 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3615 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3616 3616
3617 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3617 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3618 /* 3618 /*
3619 * We now know the "local memory node" for each node-- 3619 * We now know the "local memory node" for each node--
3620 * i.e., the node of the first zone in the generic zonelist. 3620 * i.e., the node of the first zone in the generic zonelist.
3621 * Set up numa_mem percpu variable for on-line cpus. During 3621 * Set up numa_mem percpu variable for on-line cpus. During
3622 * boot, only the boot cpu should be on-line; we'll init the 3622 * boot, only the boot cpu should be on-line; we'll init the
3623 * secondary cpus' numa_mem as they come on-line. During 3623 * secondary cpus' numa_mem as they come on-line. During
3624 * node/memory hotplug, we'll fixup all on-line cpus. 3624 * node/memory hotplug, we'll fixup all on-line cpus.
3625 */ 3625 */
3626 if (cpu_online(cpu)) 3626 if (cpu_online(cpu))
3627 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3627 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3628 #endif 3628 #endif
3629 } 3629 }
3630 3630
3631 return 0; 3631 return 0;
3632 } 3632 }
3633 3633
3634 /* 3634 /*
3635 * Called with zonelists_mutex held always 3635 * Called with zonelists_mutex held always
3636 * unless system_state == SYSTEM_BOOTING. 3636 * unless system_state == SYSTEM_BOOTING.
3637 */ 3637 */
3638 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3638 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3639 { 3639 {
3640 set_zonelist_order(); 3640 set_zonelist_order();
3641 3641
3642 if (system_state == SYSTEM_BOOTING) { 3642 if (system_state == SYSTEM_BOOTING) {
3643 __build_all_zonelists(NULL); 3643 __build_all_zonelists(NULL);
3644 mminit_verify_zonelist(); 3644 mminit_verify_zonelist();
3645 cpuset_init_current_mems_allowed(); 3645 cpuset_init_current_mems_allowed();
3646 } else { 3646 } else {
3647 /* we have to stop all cpus to guarantee there is no user 3647 /* we have to stop all cpus to guarantee there is no user
3648 of the zonelist */ 3648 of the zonelist */
3649 #ifdef CONFIG_MEMORY_HOTPLUG 3649 #ifdef CONFIG_MEMORY_HOTPLUG
3650 if (zone) 3650 if (zone)
3651 setup_zone_pageset(zone); 3651 setup_zone_pageset(zone);
3652 #endif 3652 #endif
3653 stop_machine(__build_all_zonelists, pgdat, NULL); 3653 stop_machine(__build_all_zonelists, pgdat, NULL);
3654 /* cpuset refresh routine should be here */ 3654 /* cpuset refresh routine should be here */
3655 } 3655 }
3656 vm_total_pages = nr_free_pagecache_pages(); 3656 vm_total_pages = nr_free_pagecache_pages();
3657 /* 3657 /*
3658 * Disable grouping by mobility if the number of pages in the 3658 * Disable grouping by mobility if the number of pages in the
3659 * system is too low to allow the mechanism to work. It would be 3659 * system is too low to allow the mechanism to work. It would be
3660 * more accurate, but expensive to check per-zone. This check is 3660 * more accurate, but expensive to check per-zone. This check is
3661 * made on memory-hotadd so a system can start with mobility 3661 * made on memory-hotadd so a system can start with mobility
3662 * disabled and enable it later 3662 * disabled and enable it later
3663 */ 3663 */
3664 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3664 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3665 page_group_by_mobility_disabled = 1; 3665 page_group_by_mobility_disabled = 1;
3666 else 3666 else
3667 page_group_by_mobility_disabled = 0; 3667 page_group_by_mobility_disabled = 0;
3668 3668
3669 printk("Built %i zonelists in %s order, mobility grouping %s. " 3669 printk("Built %i zonelists in %s order, mobility grouping %s. "
3670 "Total pages: %ld\n", 3670 "Total pages: %ld\n",
3671 nr_online_nodes, 3671 nr_online_nodes,
3672 zonelist_order_name[current_zonelist_order], 3672 zonelist_order_name[current_zonelist_order],
3673 page_group_by_mobility_disabled ? "off" : "on", 3673 page_group_by_mobility_disabled ? "off" : "on",
3674 vm_total_pages); 3674 vm_total_pages);
3675 #ifdef CONFIG_NUMA 3675 #ifdef CONFIG_NUMA
3676 printk("Policy zone: %s\n", zone_names[policy_zone]); 3676 printk("Policy zone: %s\n", zone_names[policy_zone]);
3677 #endif 3677 #endif
3678 } 3678 }
3679 3679
3680 /* 3680 /*
3681 * Helper functions to size the waitqueue hash table. 3681 * Helper functions to size the waitqueue hash table.
3682 * Essentially these want to choose hash table sizes sufficiently 3682 * Essentially these want to choose hash table sizes sufficiently
3683 * large so that collisions trying to wait on pages are rare. 3683 * large so that collisions trying to wait on pages are rare.
3684 * But in fact, the number of active page waitqueues on typical 3684 * But in fact, the number of active page waitqueues on typical
3685 * systems is ridiculously low, less than 200. So this is 3685 * systems is ridiculously low, less than 200. So this is
3686 * conservative, even though it seems large. 3686 * conservative, even though it seems large.
3687 * 3687 *
3688 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3688 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3689 * waitqueues, i.e. the size of the waitq table given the number of pages. 3689 * waitqueues, i.e. the size of the waitq table given the number of pages.
3690 */ 3690 */
3691 #define PAGES_PER_WAITQUEUE 256 3691 #define PAGES_PER_WAITQUEUE 256
3692 3692
3693 #ifndef CONFIG_MEMORY_HOTPLUG 3693 #ifndef CONFIG_MEMORY_HOTPLUG
3694 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3694 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3695 { 3695 {
3696 unsigned long size = 1; 3696 unsigned long size = 1;
3697 3697
3698 pages /= PAGES_PER_WAITQUEUE; 3698 pages /= PAGES_PER_WAITQUEUE;
3699 3699
3700 while (size < pages) 3700 while (size < pages)
3701 size <<= 1; 3701 size <<= 1;
3702 3702
3703 /* 3703 /*
3704 * Once we have dozens or even hundreds of threads sleeping 3704 * Once we have dozens or even hundreds of threads sleeping
3705 * on IO we've got bigger problems than wait queue collision. 3705 * on IO we've got bigger problems than wait queue collision.
3706 * Limit the size of the wait table to a reasonable size. 3706 * Limit the size of the wait table to a reasonable size.
3707 */ 3707 */
3708 size = min(size, 4096UL); 3708 size = min(size, 4096UL);
3709 3709
3710 return max(size, 4UL); 3710 return max(size, 4UL);
3711 } 3711 }
3712 #else 3712 #else
3713 /* 3713 /*
3714 * A zone's size might be changed by hot-add, so it is not possible to determine 3714 * A zone's size might be changed by hot-add, so it is not possible to determine
3715 * a suitable size for its wait_table. So we use the maximum size now. 3715 * a suitable size for its wait_table. So we use the maximum size now.
3716 * 3716 *
3717 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3717 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3718 * 3718 *
3719 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3719 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3720 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3720 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3721 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3721 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3722 * 3722 *
3723 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3723 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3724 * or more by the traditional way. (See above). It equals: 3724 * or more by the traditional way. (See above). It equals:
3725 * 3725 *
3726 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3726 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3727 * ia64(16K page size) : = ( 8G + 4M)byte. 3727 * ia64(16K page size) : = ( 8G + 4M)byte.
3728 * powerpc (64K page size) : = (32G +16M)byte. 3728 * powerpc (64K page size) : = (32G +16M)byte.
3729 */ 3729 */
3730 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3730 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3731 { 3731 {
3732 return 4096UL; 3732 return 4096UL;
3733 } 3733 }
3734 #endif 3734 #endif
3735 3735
3736 /* 3736 /*
3737 * This is an integer logarithm so that shifts can be used later 3737 * This is an integer logarithm so that shifts can be used later
3738 * to extract the more random high bits from the multiplicative 3738 * to extract the more random high bits from the multiplicative
3739 * hash function before the remainder is taken. 3739 * hash function before the remainder is taken.
3740 */ 3740 */
3741 static inline unsigned long wait_table_bits(unsigned long size) 3741 static inline unsigned long wait_table_bits(unsigned long size)
3742 { 3742 {
3743 return ffz(~size); 3743 return ffz(~size);
3744 } 3744 }
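
The sizing policy above is easier to see in isolation. The following standalone sketch (not kernel code; the 1 GiB zone with 4 KiB pages is assumed purely for illustration) reproduces the !CONFIG_MEMORY_HOTPLUG sizing together with the wait_table_bits() logarithm:

/* Standalone sketch of the waitqueue-table sizing above. */
#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256UL

static unsigned long table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;			/* smallest power of two >= pages */
	if (size > 4096UL)
		size = 4096UL;			/* cap the table size */
	return size < 4UL ? 4UL : size;		/* never fewer than 4 entries */
}

/* log2 of a power-of-two size, standing in for ffz(~size) */
static unsigned long table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	/* a 1 GiB zone with 4 KiB pages spans 262144 pages */
	unsigned long entries = table_entries(262144UL);

	printf("entries=%lu bits=%lu\n", entries, table_bits(entries));
	return 0;
}

For that zone size this prints entries=1024 bits=10, i.e. one waitqueue head per 256 pages until the 4096-entry cap is reached.
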
3745 3745
3746 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3746 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3747 3747
3748 /* 3748 /*
3749 * Check if a pageblock contains reserved pages 3749 * Check if a pageblock contains reserved pages
3750 */ 3750 */
3751 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3751 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3752 { 3752 {
3753 unsigned long pfn; 3753 unsigned long pfn;
3754 3754
3755 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3755 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3756 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3756 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3757 return 1; 3757 return 1;
3758 } 3758 }
3759 return 0; 3759 return 0;
3760 } 3760 }
3761 3761
3762 /* 3762 /*
3763 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3763 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3764 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3764 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3765 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3765 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3766 * higher will lead to a bigger reserve which will get freed as contiguous 3766 * higher will lead to a bigger reserve which will get freed as contiguous
3767 * blocks as reclaim kicks in 3767 * blocks as reclaim kicks in
3768 */ 3768 */
3769 static void setup_zone_migrate_reserve(struct zone *zone) 3769 static void setup_zone_migrate_reserve(struct zone *zone)
3770 { 3770 {
3771 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3771 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3772 struct page *page; 3772 struct page *page;
3773 unsigned long block_migratetype; 3773 unsigned long block_migratetype;
3774 int reserve; 3774 int reserve;
3775 3775
3776 /* 3776 /*
3777 * Get the start pfn, end pfn and the number of blocks to reserve 3777 * Get the start pfn, end pfn and the number of blocks to reserve
3778 * We have to be careful to be aligned to pageblock_nr_pages to 3778 * We have to be careful to be aligned to pageblock_nr_pages to
3779 * make sure that we always check pfn_valid for the first page in 3779 * make sure that we always check pfn_valid for the first page in
3780 * the block. 3780 * the block.
3781 */ 3781 */
3782 start_pfn = zone->zone_start_pfn; 3782 start_pfn = zone->zone_start_pfn;
3783 end_pfn = start_pfn + zone->spanned_pages; 3783 end_pfn = start_pfn + zone->spanned_pages;
3784 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3784 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3785 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3785 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3786 pageblock_order; 3786 pageblock_order;
3787 3787
3788 /* 3788 /*
3789 * Reserve blocks are generally in place to help high-order atomic 3789 * Reserve blocks are generally in place to help high-order atomic
3790 * allocations that are short-lived. A min_free_kbytes value that 3790 * allocations that are short-lived. A min_free_kbytes value that
3791 * would result in more than 2 reserve blocks for atomic allocations 3791 * would result in more than 2 reserve blocks for atomic allocations
3792 * is assumed to be in place to help anti-fragmentation for the 3792 * is assumed to be in place to help anti-fragmentation for the
3793 * future allocation of hugepages at runtime. 3793 * future allocation of hugepages at runtime.
3794 */ 3794 */
3795 reserve = min(2, reserve); 3795 reserve = min(2, reserve);
3796 3796
3797 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3797 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3798 if (!pfn_valid(pfn)) 3798 if (!pfn_valid(pfn))
3799 continue; 3799 continue;
3800 page = pfn_to_page(pfn); 3800 page = pfn_to_page(pfn);
3801 3801
3802 /* Watch out for overlapping nodes */ 3802 /* Watch out for overlapping nodes */
3803 if (page_to_nid(page) != zone_to_nid(zone)) 3803 if (page_to_nid(page) != zone_to_nid(zone))
3804 continue; 3804 continue;
3805 3805
3806 block_migratetype = get_pageblock_migratetype(page); 3806 block_migratetype = get_pageblock_migratetype(page);
3807 3807
3808 /* Only test what is necessary when the reserves are not met */ 3808 /* Only test what is necessary when the reserves are not met */
3809 if (reserve > 0) { 3809 if (reserve > 0) {
3810 /* 3810 /*
3811 * Blocks with reserved pages will never be freed, so skip 3811 * Blocks with reserved pages will never be freed, so skip
3812 * them. 3812 * them.
3813 */ 3813 */
3814 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3814 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3815 if (pageblock_is_reserved(pfn, block_end_pfn)) 3815 if (pageblock_is_reserved(pfn, block_end_pfn))
3816 continue; 3816 continue;
3817 3817
3818 /* If this block is reserved, account for it */ 3818 /* If this block is reserved, account for it */
3819 if (block_migratetype == MIGRATE_RESERVE) { 3819 if (block_migratetype == MIGRATE_RESERVE) {
3820 reserve--; 3820 reserve--;
3821 continue; 3821 continue;
3822 } 3822 }
3823 3823
3824 /* Suitable for reserving if this block is movable */ 3824 /* Suitable for reserving if this block is movable */
3825 if (block_migratetype == MIGRATE_MOVABLE) { 3825 if (block_migratetype == MIGRATE_MOVABLE) {
3826 set_pageblock_migratetype(page, 3826 set_pageblock_migratetype(page,
3827 MIGRATE_RESERVE); 3827 MIGRATE_RESERVE);
3828 move_freepages_block(zone, page, 3828 move_freepages_block(zone, page,
3829 MIGRATE_RESERVE); 3829 MIGRATE_RESERVE);
3830 reserve--; 3830 reserve--;
3831 continue; 3831 continue;
3832 } 3832 }
3833 } 3833 }
3834 3834
3835 /* 3835 /*
3836 * If the reserve is met and this is a previous reserved block, 3836 * If the reserve is met and this is a previous reserved block,
3837 * take it back 3837 * take it back
3838 */ 3838 */
3839 if (block_migratetype == MIGRATE_RESERVE) { 3839 if (block_migratetype == MIGRATE_RESERVE) {
3840 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3840 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3841 move_freepages_block(zone, page, MIGRATE_MOVABLE); 3841 move_freepages_block(zone, page, MIGRATE_MOVABLE);
3842 } 3842 }
3843 } 3843 }
3844 } 3844 }
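
The number of pageblocks the loop above tries to keep as MIGRATE_RESERVE follows directly from min_wmark_pages(zone). A minimal sketch of that arithmetic, assuming a pageblock_order of 9 (2 MiB blocks with 4 KiB pages; an illustrative value, not something fixed by this file):

/* Sketch of the reserve-block count computed above; not kernel code. */
#include <stdio.h>

#define PAGEBLOCK_ORDER     9UL
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

static int reserve_blocks(unsigned long min_wmark_pages)
{
	int reserve = (int)(roundup_ul(min_wmark_pages, PAGEBLOCK_NR_PAGES)
			    >> PAGEBLOCK_ORDER);

	/* more than two reserve blocks are assumed to serve hugepages */
	return reserve < 2 ? reserve : 2;
}

int main(void)
{
	/* a 1000-page minimum watermark rounds up to two 512-page blocks */
	printf("reserve=%d\n", reserve_blocks(1000));
	return 0;
}

With a 1000-page minimum watermark this prints reserve=2, the cap described in the comment above.
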
3845 3845
3846 /* 3846 /*
3847 * Initially all pages are reserved - free ones are freed 3847 * Initially all pages are reserved - free ones are freed
3848 * up by free_all_bootmem() once the early boot process is 3848 * up by free_all_bootmem() once the early boot process is
3849 * done. Non-atomic initialization, single-pass. 3849 * done. Non-atomic initialization, single-pass.
3850 */ 3850 */
3851 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 3851 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3852 unsigned long start_pfn, enum memmap_context context) 3852 unsigned long start_pfn, enum memmap_context context)
3853 { 3853 {
3854 struct page *page; 3854 struct page *page;
3855 unsigned long end_pfn = start_pfn + size; 3855 unsigned long end_pfn = start_pfn + size;
3856 unsigned long pfn; 3856 unsigned long pfn;
3857 struct zone *z; 3857 struct zone *z;
3858 3858
3859 if (highest_memmap_pfn < end_pfn - 1) 3859 if (highest_memmap_pfn < end_pfn - 1)
3860 highest_memmap_pfn = end_pfn - 1; 3860 highest_memmap_pfn = end_pfn - 1;
3861 3861
3862 z = &NODE_DATA(nid)->node_zones[zone]; 3862 z = &NODE_DATA(nid)->node_zones[zone];
3863 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3863 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3864 /* 3864 /*
3865 * There can be holes in boot-time mem_map[]s 3865 * There can be holes in boot-time mem_map[]s
3866 * handed to this function. They do not 3866 * handed to this function. They do not
3867 * exist on hotplugged memory. 3867 * exist on hotplugged memory.
3868 */ 3868 */
3869 if (context == MEMMAP_EARLY) { 3869 if (context == MEMMAP_EARLY) {
3870 if (!early_pfn_valid(pfn)) 3870 if (!early_pfn_valid(pfn))
3871 continue; 3871 continue;
3872 if (!early_pfn_in_nid(pfn, nid)) 3872 if (!early_pfn_in_nid(pfn, nid))
3873 continue; 3873 continue;
3874 } 3874 }
3875 page = pfn_to_page(pfn); 3875 page = pfn_to_page(pfn);
3876 set_page_links(page, zone, nid, pfn); 3876 set_page_links(page, zone, nid, pfn);
3877 mminit_verify_page_links(page, zone, nid, pfn); 3877 mminit_verify_page_links(page, zone, nid, pfn);
3878 init_page_count(page); 3878 init_page_count(page);
3879 reset_page_mapcount(page); 3879 reset_page_mapcount(page);
3880 reset_page_last_nid(page); 3880 reset_page_last_nid(page);
3881 SetPageReserved(page); 3881 SetPageReserved(page);
3882 /* 3882 /*
3883 * Mark the block movable so that blocks are reserved for 3883 * Mark the block movable so that blocks are reserved for
3884 * movable allocations at startup. This will force kernel allocations 3884 * movable allocations at startup. This will force kernel allocations
3885 * to reserve their blocks rather than leaking throughout 3885 * to reserve their blocks rather than leaking throughout
3886 * the address space during boot when many long-lived 3886 * the address space during boot when many long-lived
3887 * kernel allocations are made. Later some blocks near 3887 * kernel allocations are made. Later some blocks near
3888 * the start are marked MIGRATE_RESERVE by 3888 * the start are marked MIGRATE_RESERVE by
3889 * setup_zone_migrate_reserve() 3889 * setup_zone_migrate_reserve()
3890 * 3890 *
3891 * The bitmap is created for the zone's valid pfn range, but the memmap 3891 * The bitmap is created for the zone's valid pfn range, but the memmap
3892 * can be created for invalid pages (for alignment), so check 3892 * can be created for invalid pages (for alignment), so check
3893 * here that set_pageblock_migratetype() is not called against a 3893 * here that set_pageblock_migratetype() is not called against a
3894 * pfn outside the zone. 3894 * pfn outside the zone.
3895 */ 3895 */
3896 if ((z->zone_start_pfn <= pfn) 3896 if ((z->zone_start_pfn <= pfn)
3897 && (pfn < z->zone_start_pfn + z->spanned_pages) 3897 && (pfn < z->zone_start_pfn + z->spanned_pages)
3898 && !(pfn & (pageblock_nr_pages - 1))) 3898 && !(pfn & (pageblock_nr_pages - 1)))
3899 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3899 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3900 3900
3901 INIT_LIST_HEAD(&page->lru); 3901 INIT_LIST_HEAD(&page->lru);
3902 #ifdef WANT_PAGE_VIRTUAL 3902 #ifdef WANT_PAGE_VIRTUAL
3903 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 3903 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
3904 if (!is_highmem_idx(zone)) 3904 if (!is_highmem_idx(zone))
3905 set_page_address(page, __va(pfn << PAGE_SHIFT)); 3905 set_page_address(page, __va(pfn << PAGE_SHIFT));
3906 #endif 3906 #endif
3907 } 3907 }
3908 } 3908 }
3909 3909
3910 static void __meminit zone_init_free_lists(struct zone *zone) 3910 static void __meminit zone_init_free_lists(struct zone *zone)
3911 { 3911 {
3912 int order, t; 3912 int order, t;
3913 for_each_migratetype_order(order, t) { 3913 for_each_migratetype_order(order, t) {
3914 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 3914 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
3915 zone->free_area[order].nr_free = 0; 3915 zone->free_area[order].nr_free = 0;
3916 } 3916 }
3917 } 3917 }
3918 3918
3919 #ifndef __HAVE_ARCH_MEMMAP_INIT 3919 #ifndef __HAVE_ARCH_MEMMAP_INIT
3920 #define memmap_init(size, nid, zone, start_pfn) \ 3920 #define memmap_init(size, nid, zone, start_pfn) \
3921 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3921 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3922 #endif 3922 #endif
3923 3923
3924 static int __meminit zone_batchsize(struct zone *zone) 3924 static int __meminit zone_batchsize(struct zone *zone)
3925 { 3925 {
3926 #ifdef CONFIG_MMU 3926 #ifdef CONFIG_MMU
3927 int batch; 3927 int batch;
3928 3928
3929 /* 3929 /*
3930 * The per-cpu-pages pools are set to around 1/1000th of the 3930 * The per-cpu-pages pools are set to around 1/1000th of the
3931 * size of the zone. But no more than 1/2 of a meg. 3931 * size of the zone. But no more than 1/2 of a meg.
3932 * 3932 *
3933 * OK, so we don't know how big the cache is. So guess. 3933 * OK, so we don't know how big the cache is. So guess.
3934 */ 3934 */
3935 batch = zone->present_pages / 1024; 3935 batch = zone->present_pages / 1024;
3936 if (batch * PAGE_SIZE > 512 * 1024) 3936 if (batch * PAGE_SIZE > 512 * 1024)
3937 batch = (512 * 1024) / PAGE_SIZE; 3937 batch = (512 * 1024) / PAGE_SIZE;
3938 batch /= 4; /* We effectively *= 4 below */ 3938 batch /= 4; /* We effectively *= 4 below */
3939 if (batch < 1) 3939 if (batch < 1)
3940 batch = 1; 3940 batch = 1;
3941 3941
3942 /* 3942 /*
3943 * Clamp the batch to a 2^n - 1 value. Having a power 3943 * Clamp the batch to a 2^n - 1 value. Having a power
3944 * of 2 value was found to be more likely to have 3944 * of 2 value was found to be more likely to have
3945 * suboptimal cache aliasing properties in some cases. 3945 * suboptimal cache aliasing properties in some cases.
3946 * 3946 *
3947 * For example if 2 tasks are alternately allocating 3947 * For example if 2 tasks are alternately allocating
3948 * batches of pages, one task can end up with a lot 3948 * batches of pages, one task can end up with a lot
3949 * of pages of one half of the possible page colors 3949 * of pages of one half of the possible page colors
3950 * and the other with pages of the other colors. 3950 * and the other with pages of the other colors.
3951 */ 3951 */
3952 batch = rounddown_pow_of_two(batch + batch/2) - 1; 3952 batch = rounddown_pow_of_two(batch + batch/2) - 1;
3953 3953
3954 return batch; 3954 return batch;
3955 3955
3956 #else 3956 #else
3957 /* The deferral and batching of frees should be suppressed under NOMMU 3957 /* The deferral and batching of frees should be suppressed under NOMMU
3958 * conditions. 3958 * conditions.
3959 * 3959 *
3960 * The problem is that NOMMU needs to be able to allocate large chunks 3960 * The problem is that NOMMU needs to be able to allocate large chunks
3961 * of contiguous memory as there's no hardware page translation to 3961 * of contiguous memory as there's no hardware page translation to
3962 * assemble apparent contiguous memory from discontiguous pages. 3962 * assemble apparent contiguous memory from discontiguous pages.
3963 * 3963 *
3964 * Queueing large contiguous runs of pages for batching, however, 3964 * Queueing large contiguous runs of pages for batching, however,
3965 * causes the pages to actually be freed in smaller chunks. As there 3965 * causes the pages to actually be freed in smaller chunks. As there
3966 * can be a significant delay between the individual batches being 3966 * can be a significant delay between the individual batches being
3967 * recycled, this leads to the once large chunks of space being 3967 * recycled, this leads to the once large chunks of space being
3968 * fragmented and becoming unavailable for high-order allocations. 3968 * fragmented and becoming unavailable for high-order allocations.
3969 */ 3969 */
3970 return 0; 3970 return 0;
3971 #endif 3971 #endif
3972 } 3972 }
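
A userspace sketch of the CONFIG_MMU branch above may help make the numbers concrete; the 4 KiB page size and the 1 GiB zone are assumptions for the example only:

/* Sketch of the pcp batch sizing above; not kernel code. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* round down to a power of two, as rounddown_pow_of_two() does */
static unsigned long rounddown_pow2(unsigned long x)
{
	unsigned long p = 1;

	while (p * 2 <= x)
		p *= 2;
	return p;
}

static int zone_batchsize(unsigned long present_pages)
{
	unsigned long batch = present_pages / 1024;	/* ~1/1000th of the zone */

	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;	/* cap at half a meg */
	batch /= 4;			/* setup_pageset() later sets pcp->high to 6 * batch */
	if (batch < 1)
		batch = 1;
	/* settle on a 2^n - 1 value to avoid page-color cache aliasing */
	return (int)(rounddown_pow2(batch + batch / 2) - 1);
}

int main(void)
{
	/* a zone with 1 GiB present: 262144 pages */
	printf("batch=%d\n", zone_batchsize(262144UL));
	return 0;
}

For a 262144-page zone this works out to batch=31, a 2^n - 1 value as the comment above requires.
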
3973 3973
3974 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3974 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
3975 { 3975 {
3976 struct per_cpu_pages *pcp; 3976 struct per_cpu_pages *pcp;
3977 int migratetype; 3977 int migratetype;
3978 3978
3979 memset(p, 0, sizeof(*p)); 3979 memset(p, 0, sizeof(*p));
3980 3980
3981 pcp = &p->pcp; 3981 pcp = &p->pcp;
3982 pcp->count = 0; 3982 pcp->count = 0;
3983 pcp->high = 6 * batch; 3983 pcp->high = 6 * batch;
3984 pcp->batch = max(1UL, 1 * batch); 3984 pcp->batch = max(1UL, 1 * batch);
3985 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 3985 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3986 INIT_LIST_HEAD(&pcp->lists[migratetype]); 3986 INIT_LIST_HEAD(&pcp->lists[migratetype]);
3987 } 3987 }
3988 3988
3989 /* 3989 /*
3990 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 3990 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
3991 * to the value high for the pageset p. 3991 * to the value high for the pageset p.
3992 */ 3992 */
3993 3993
3994 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 3994 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3995 unsigned long high) 3995 unsigned long high)
3996 { 3996 {
3997 struct per_cpu_pages *pcp; 3997 struct per_cpu_pages *pcp;
3998 3998
3999 pcp = &p->pcp; 3999 pcp = &p->pcp;
4000 pcp->high = high; 4000 pcp->high = high;
4001 pcp->batch = max(1UL, high/4); 4001 pcp->batch = max(1UL, high/4);
4002 if ((high/4) > (PAGE_SHIFT * 8)) 4002 if ((high/4) > (PAGE_SHIFT * 8))
4003 pcp->batch = PAGE_SHIFT * 8; 4003 pcp->batch = PAGE_SHIFT * 8;
4004 } 4004 }
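
When the percpu_pagelist_fraction sysctl is set, setup_zone_pageset() below feeds zone->present_pages / percpu_pagelist_fraction into this helper. A small sketch of the resulting high/batch pair, assuming a PAGE_SHIFT of 12 (illustrative only):

/* Sketch of the high-water-mark/batch relationship above; not kernel code. */
#include <stdio.h>

#define PAGE_SHIFT 12UL

static void pagelist_highmark(unsigned long high,
			      unsigned long *out_high, unsigned long *out_batch)
{
	unsigned long batch = high / 4 ? high / 4 : 1;	/* max(1, high/4) */

	if (high / 4 > PAGE_SHIFT * 8)
		batch = PAGE_SHIFT * 8;			/* clamp to 96 pages */
	*out_high = high;
	*out_batch = batch;
}

int main(void)
{
	unsigned long high, batch;

	/* a 262144-page zone and percpu_pagelist_fraction == 8 */
	pagelist_highmark(262144UL / 8, &high, &batch);
	printf("high=%lu batch=%lu\n", high, batch);
	return 0;
}

A 262144-page zone with a fraction of 8 yields high=32768 and batch=96, the PAGE_SHIFT * 8 clamp.
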
4005 4005
4006 static void __meminit setup_zone_pageset(struct zone *zone) 4006 static void __meminit setup_zone_pageset(struct zone *zone)
4007 { 4007 {
4008 int cpu; 4008 int cpu;
4009 4009
4010 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4010 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4011 4011
4012 for_each_possible_cpu(cpu) { 4012 for_each_possible_cpu(cpu) {
4013 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4013 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4014 4014
4015 setup_pageset(pcp, zone_batchsize(zone)); 4015 setup_pageset(pcp, zone_batchsize(zone));
4016 4016
4017 if (percpu_pagelist_fraction) 4017 if (percpu_pagelist_fraction)
4018 setup_pagelist_highmark(pcp, 4018 setup_pagelist_highmark(pcp,
4019 (zone->present_pages / 4019 (zone->present_pages /
4020 percpu_pagelist_fraction)); 4020 percpu_pagelist_fraction));
4021 } 4021 }
4022 } 4022 }
4023 4023
4024 /* 4024 /*
4025 * Allocate per cpu pagesets and initialize them. 4025 * Allocate per cpu pagesets and initialize them.
4026 * Before this call only boot pagesets were available. 4026 * Before this call only boot pagesets were available.
4027 */ 4027 */
4028 void __init setup_per_cpu_pageset(void) 4028 void __init setup_per_cpu_pageset(void)
4029 { 4029 {
4030 struct zone *zone; 4030 struct zone *zone;
4031 4031
4032 for_each_populated_zone(zone) 4032 for_each_populated_zone(zone)
4033 setup_zone_pageset(zone); 4033 setup_zone_pageset(zone);
4034 } 4034 }
4035 4035
4036 static noinline __init_refok 4036 static noinline __init_refok
4037 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4037 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4038 { 4038 {
4039 int i; 4039 int i;
4040 struct pglist_data *pgdat = zone->zone_pgdat; 4040 struct pglist_data *pgdat = zone->zone_pgdat;
4041 size_t alloc_size; 4041 size_t alloc_size;
4042 4042
4043 /* 4043 /*
4044 * The per-page waitqueue mechanism uses hashed waitqueues 4044 * The per-page waitqueue mechanism uses hashed waitqueues
4045 * per zone. 4045 * per zone.
4046 */ 4046 */
4047 zone->wait_table_hash_nr_entries = 4047 zone->wait_table_hash_nr_entries =
4048 wait_table_hash_nr_entries(zone_size_pages); 4048 wait_table_hash_nr_entries(zone_size_pages);
4049 zone->wait_table_bits = 4049 zone->wait_table_bits =
4050 wait_table_bits(zone->wait_table_hash_nr_entries); 4050 wait_table_bits(zone->wait_table_hash_nr_entries);
4051 alloc_size = zone->wait_table_hash_nr_entries 4051 alloc_size = zone->wait_table_hash_nr_entries
4052 * sizeof(wait_queue_head_t); 4052 * sizeof(wait_queue_head_t);
4053 4053
4054 if (!slab_is_available()) { 4054 if (!slab_is_available()) {
4055 zone->wait_table = (wait_queue_head_t *) 4055 zone->wait_table = (wait_queue_head_t *)
4056 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4056 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4057 } else { 4057 } else {
4058 /* 4058 /*
4059 * This case means that a zone whose size was 0 gets new memory 4059 * This case means that a zone whose size was 0 gets new memory
4060 * via memory hot-add. 4060 * via memory hot-add.
4061 * But it may be the case that a new node was hot-added. In 4061 * But it may be the case that a new node was hot-added. In
4062 * this case vmalloc() will not be able to use this new node's 4062 * this case vmalloc() will not be able to use this new node's
4063 * memory - this wait_table must be initialized to use this new 4063 * memory - this wait_table must be initialized to use this new
4064 * node itself as well. 4064 * node itself as well.
4065 * To use this new node's memory, further consideration will be 4065 * To use this new node's memory, further consideration will be
4066 * necessary. 4066 * necessary.
4067 */ 4067 */
4068 zone->wait_table = vmalloc(alloc_size); 4068 zone->wait_table = vmalloc(alloc_size);
4069 } 4069 }
4070 if (!zone->wait_table) 4070 if (!zone->wait_table)
4071 return -ENOMEM; 4071 return -ENOMEM;
4072 4072
4073 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4073 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4074 init_waitqueue_head(zone->wait_table + i); 4074 init_waitqueue_head(zone->wait_table + i);
4075 4075
4076 return 0; 4076 return 0;
4077 } 4077 }
4078 4078
4079 static __meminit void zone_pcp_init(struct zone *zone) 4079 static __meminit void zone_pcp_init(struct zone *zone)
4080 { 4080 {
4081 /* 4081 /*
4082 * per cpu subsystem is not up at this point. The following code 4082 * per cpu subsystem is not up at this point. The following code
4083 * relies on the ability of the linker to provide the 4083 * relies on the ability of the linker to provide the
4084 * offset of a (static) per cpu variable into the per cpu area. 4084 * offset of a (static) per cpu variable into the per cpu area.
4085 */ 4085 */
4086 zone->pageset = &boot_pageset; 4086 zone->pageset = &boot_pageset;
4087 4087
4088 if (zone->present_pages) 4088 if (zone->present_pages)
4089 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4089 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4090 zone->name, zone->present_pages, 4090 zone->name, zone->present_pages,
4091 zone_batchsize(zone)); 4091 zone_batchsize(zone));
4092 } 4092 }
4093 4093
4094 int __meminit init_currently_empty_zone(struct zone *zone, 4094 int __meminit init_currently_empty_zone(struct zone *zone,
4095 unsigned long zone_start_pfn, 4095 unsigned long zone_start_pfn,
4096 unsigned long size, 4096 unsigned long size,
4097 enum memmap_context context) 4097 enum memmap_context context)
4098 { 4098 {
4099 struct pglist_data *pgdat = zone->zone_pgdat; 4099 struct pglist_data *pgdat = zone->zone_pgdat;
4100 int ret; 4100 int ret;
4101 ret = zone_wait_table_init(zone, size); 4101 ret = zone_wait_table_init(zone, size);
4102 if (ret) 4102 if (ret)
4103 return ret; 4103 return ret;
4104 pgdat->nr_zones = zone_idx(zone) + 1; 4104 pgdat->nr_zones = zone_idx(zone) + 1;
4105 4105
4106 zone->zone_start_pfn = zone_start_pfn; 4106 zone->zone_start_pfn = zone_start_pfn;
4107 4107
4108 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4108 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4109 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4109 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4110 pgdat->node_id, 4110 pgdat->node_id,
4111 (unsigned long)zone_idx(zone), 4111 (unsigned long)zone_idx(zone),
4112 zone_start_pfn, (zone_start_pfn + size)); 4112 zone_start_pfn, (zone_start_pfn + size));
4113 4113
4114 zone_init_free_lists(zone); 4114 zone_init_free_lists(zone);
4115 4115
4116 return 0; 4116 return 0;
4117 } 4117 }
4118 4118
4119 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4119 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4120 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4120 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4121 /* 4121 /*
4122 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4122 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4123 * Architectures may implement their own version but if add_active_range() 4123 * Architectures may implement their own version but if add_active_range()
4124 * was used and there are no special requirements, this is a convenient 4124 * was used and there are no special requirements, this is a convenient
4125 * alternative 4125 * alternative
4126 */ 4126 */
4127 int __meminit __early_pfn_to_nid(unsigned long pfn) 4127 int __meminit __early_pfn_to_nid(unsigned long pfn)
4128 { 4128 {
4129 unsigned long start_pfn, end_pfn; 4129 unsigned long start_pfn, end_pfn;
4130 int i, nid; 4130 int i, nid;
4131 4131
4132 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4132 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4133 if (start_pfn <= pfn && pfn < end_pfn) 4133 if (start_pfn <= pfn && pfn < end_pfn)
4134 return nid; 4134 return nid;
4135 /* This is a memory hole */ 4135 /* This is a memory hole */
4136 return -1; 4136 return -1;
4137 } 4137 }
4138 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4138 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4139 4139
4140 int __meminit early_pfn_to_nid(unsigned long pfn) 4140 int __meminit early_pfn_to_nid(unsigned long pfn)
4141 { 4141 {
4142 int nid; 4142 int nid;
4143 4143
4144 nid = __early_pfn_to_nid(pfn); 4144 nid = __early_pfn_to_nid(pfn);
4145 if (nid >= 0) 4145 if (nid >= 0)
4146 return nid; 4146 return nid;
4147 /* just returns 0 */ 4147 /* just returns 0 */
4148 return 0; 4148 return 0;
4149 } 4149 }
4150 4150
4151 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4151 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4152 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4152 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4153 { 4153 {
4154 int nid; 4154 int nid;
4155 4155
4156 nid = __early_pfn_to_nid(pfn); 4156 nid = __early_pfn_to_nid(pfn);
4157 if (nid >= 0 && nid != node) 4157 if (nid >= 0 && nid != node)
4158 return false; 4158 return false;
4159 return true; 4159 return true;
4160 } 4160 }
4161 #endif 4161 #endif
4162 4162
4163 /** 4163 /**
4164 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4164 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4165 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4165 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4166 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4166 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4167 * 4167 *
4168 * If an architecture guarantees that all ranges registered with 4168 * If an architecture guarantees that all ranges registered with
4169 * add_active_ranges() contain no holes and may be freed, 4169 * add_active_ranges() contain no holes and may be freed,
4170 * this function may be used instead of calling free_bootmem() manually. 4170 * this function may be used instead of calling free_bootmem() manually.
4171 */ 4171 */
4172 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4172 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4173 { 4173 {
4174 unsigned long start_pfn, end_pfn; 4174 unsigned long start_pfn, end_pfn;
4175 int i, this_nid; 4175 int i, this_nid;
4176 4176
4177 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4177 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4178 start_pfn = min(start_pfn, max_low_pfn); 4178 start_pfn = min(start_pfn, max_low_pfn);
4179 end_pfn = min(end_pfn, max_low_pfn); 4179 end_pfn = min(end_pfn, max_low_pfn);
4180 4180
4181 if (start_pfn < end_pfn) 4181 if (start_pfn < end_pfn)
4182 free_bootmem_node(NODE_DATA(this_nid), 4182 free_bootmem_node(NODE_DATA(this_nid),
4183 PFN_PHYS(start_pfn), 4183 PFN_PHYS(start_pfn),
4184 (end_pfn - start_pfn) << PAGE_SHIFT); 4184 (end_pfn - start_pfn) << PAGE_SHIFT);
4185 } 4185 }
4186 } 4186 }
4187 4187
4188 /** 4188 /**
4189 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4189 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4190 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4190 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4191 * 4191 *
4192 * If an architecture guarantees that all ranges registered with 4192 * If an architecture guarantees that all ranges registered with
4193 * add_active_ranges() contain no holes and may be freed, this 4193 * add_active_ranges() contain no holes and may be freed, this
4194 * function may be used instead of calling memory_present() manually. 4194 * function may be used instead of calling memory_present() manually.
4195 */ 4195 */
4196 void __init sparse_memory_present_with_active_regions(int nid) 4196 void __init sparse_memory_present_with_active_regions(int nid)
4197 { 4197 {
4198 unsigned long start_pfn, end_pfn; 4198 unsigned long start_pfn, end_pfn;
4199 int i, this_nid; 4199 int i, this_nid;
4200 4200
4201 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4201 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4202 memory_present(this_nid, start_pfn, end_pfn); 4202 memory_present(this_nid, start_pfn, end_pfn);
4203 } 4203 }
4204 4204
4205 /** 4205 /**
4206 * get_pfn_range_for_nid - Return the start and end page frames for a node 4206 * get_pfn_range_for_nid - Return the start and end page frames for a node
4207 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4207 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4208 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4208 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4209 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4209 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4210 * 4210 *
4211 * It returns the start and end page frame of a node based on information 4211 * It returns the start and end page frame of a node based on information
4212 * provided by an arch calling add_active_range(). If called for a node 4212 * provided by an arch calling add_active_range(). If called for a node
4213 * with no available memory, a warning is printed and the start and end 4213 * with no available memory, a warning is printed and the start and end
4214 * PFNs will be 0. 4214 * PFNs will be 0.
4215 */ 4215 */
4216 void __meminit get_pfn_range_for_nid(unsigned int nid, 4216 void __meminit get_pfn_range_for_nid(unsigned int nid,
4217 unsigned long *start_pfn, unsigned long *end_pfn) 4217 unsigned long *start_pfn, unsigned long *end_pfn)
4218 { 4218 {
4219 unsigned long this_start_pfn, this_end_pfn; 4219 unsigned long this_start_pfn, this_end_pfn;
4220 int i; 4220 int i;
4221 4221
4222 *start_pfn = -1UL; 4222 *start_pfn = -1UL;
4223 *end_pfn = 0; 4223 *end_pfn = 0;
4224 4224
4225 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4225 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4226 *start_pfn = min(*start_pfn, this_start_pfn); 4226 *start_pfn = min(*start_pfn, this_start_pfn);
4227 *end_pfn = max(*end_pfn, this_end_pfn); 4227 *end_pfn = max(*end_pfn, this_end_pfn);
4228 } 4228 }
4229 4229
4230 if (*start_pfn == -1UL) 4230 if (*start_pfn == -1UL)
4231 *start_pfn = 0; 4231 *start_pfn = 0;
4232 } 4232 }
4233 4233
4234 /* 4234 /*
4235 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4235 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4236 * assumption is made that zones within a node are ordered in monotonically 4236 * assumption is made that zones within a node are ordered in monotonically
4237 * increasing memory addresses so that the "highest" populated zone is used 4237 * increasing memory addresses so that the "highest" populated zone is used
4238 */ 4238 */
4239 static void __init find_usable_zone_for_movable(void) 4239 static void __init find_usable_zone_for_movable(void)
4240 { 4240 {
4241 int zone_index; 4241 int zone_index;
4242 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4242 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4243 if (zone_index == ZONE_MOVABLE) 4243 if (zone_index == ZONE_MOVABLE)
4244 continue; 4244 continue;
4245 4245
4246 if (arch_zone_highest_possible_pfn[zone_index] > 4246 if (arch_zone_highest_possible_pfn[zone_index] >
4247 arch_zone_lowest_possible_pfn[zone_index]) 4247 arch_zone_lowest_possible_pfn[zone_index])
4248 break; 4248 break;
4249 } 4249 }
4250 4250
4251 VM_BUG_ON(zone_index == -1); 4251 VM_BUG_ON(zone_index == -1);
4252 movable_zone = zone_index; 4252 movable_zone = zone_index;
4253 } 4253 }
4254 4254
4255 /* 4255 /*
4256 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4256 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4257 * because it is sized independent of architecture. Unlike the other zones, 4257 * because it is sized independent of architecture. Unlike the other zones,
4258 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4258 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4259 * in each node depending on the size of each node and how evenly kernelcore 4259 * in each node depending on the size of each node and how evenly kernelcore
4260 * is distributed. This helper function adjusts the zone ranges 4260 * is distributed. This helper function adjusts the zone ranges
4261 * provided by the architecture for a given node by using the end of the 4261 * provided by the architecture for a given node by using the end of the
4262 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4262 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4263 * zones within a node are in order of monotonically increasing memory addresses 4263 * zones within a node are in order of monotonically increasing memory addresses
4264 */ 4264 */
4265 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4265 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4266 unsigned long zone_type, 4266 unsigned long zone_type,
4267 unsigned long node_start_pfn, 4267 unsigned long node_start_pfn,
4268 unsigned long node_end_pfn, 4268 unsigned long node_end_pfn,
4269 unsigned long *zone_start_pfn, 4269 unsigned long *zone_start_pfn,
4270 unsigned long *zone_end_pfn) 4270 unsigned long *zone_end_pfn)
4271 { 4271 {
4272 /* Only adjust if ZONE_MOVABLE is on this node */ 4272 /* Only adjust if ZONE_MOVABLE is on this node */
4273 if (zone_movable_pfn[nid]) { 4273 if (zone_movable_pfn[nid]) {
4274 /* Size ZONE_MOVABLE */ 4274 /* Size ZONE_MOVABLE */
4275 if (zone_type == ZONE_MOVABLE) { 4275 if (zone_type == ZONE_MOVABLE) {
4276 *zone_start_pfn = zone_movable_pfn[nid]; 4276 *zone_start_pfn = zone_movable_pfn[nid];
4277 *zone_end_pfn = min(node_end_pfn, 4277 *zone_end_pfn = min(node_end_pfn,
4278 arch_zone_highest_possible_pfn[movable_zone]); 4278 arch_zone_highest_possible_pfn[movable_zone]);
4279 4279
4280 /* Adjust for ZONE_MOVABLE starting within this range */ 4280 /* Adjust for ZONE_MOVABLE starting within this range */
4281 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4281 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4282 *zone_end_pfn > zone_movable_pfn[nid]) { 4282 *zone_end_pfn > zone_movable_pfn[nid]) {
4283 *zone_end_pfn = zone_movable_pfn[nid]; 4283 *zone_end_pfn = zone_movable_pfn[nid];
4284 4284
4285 /* Check if this whole range is within ZONE_MOVABLE */ 4285 /* Check if this whole range is within ZONE_MOVABLE */
4286 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4286 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4287 *zone_start_pfn = *zone_end_pfn; 4287 *zone_start_pfn = *zone_end_pfn;
4288 } 4288 }
4289 } 4289 }
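
The three cases above can be read as: the movable zone itself, a zone that ZONE_MOVABLE starts inside, and a zone entirely handed over to ZONE_MOVABLE. The sketch below walks those cases with plain pfn values; it deliberately simplifies the first case by using the node end pfn instead of the arch_zone_highest_possible_pfn[] clamp, so it is an illustration rather than a drop-in equivalent:

/* Simplified model of the ZONE_MOVABLE adjustment above; not kernel code. */
#include <stdio.h>

static void adjust_for_movable(unsigned long movable_pfn,	/* carve-out start, 0 if none */
			       unsigned long node_end_pfn,
			       int is_movable_zone,
			       unsigned long *zone_start, unsigned long *zone_end)
{
	if (!movable_pfn)
		return;				/* no ZONE_MOVABLE on this node */

	if (is_movable_zone) {
		/* ZONE_MOVABLE runs from the carve-out point to the node end */
		*zone_start = movable_pfn;
		*zone_end = node_end_pfn;
	} else if (*zone_start < movable_pfn && *zone_end > movable_pfn) {
		/* ZONE_MOVABLE starts inside this zone: truncate it */
		*zone_end = movable_pfn;
	} else if (*zone_start >= movable_pfn) {
		/* the whole zone lies in ZONE_MOVABLE: make it empty */
		*zone_start = *zone_end;
	}
}

int main(void)
{
	/* a node spanning pfns 0..1048576 with ZONE_MOVABLE starting at 524288 */
	unsigned long start = 0, end = 1048576;

	adjust_for_movable(524288, 1048576, 0, &start, &end);
	printf("zone now spans pfns %lu-%lu\n", start, end);
	return 0;
}

The non-movable zone is truncated to pfns 0-524288; calling it with is_movable_zone set would instead report 524288-1048576.
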
4290 4290
4291 /* 4291 /*
4292 * Return the number of pages a zone spans in a node, including holes 4292 * Return the number of pages a zone spans in a node, including holes
4293 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4293 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4294 */ 4294 */
4295 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4295 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4296 unsigned long zone_type, 4296 unsigned long zone_type,
4297 unsigned long *ignored) 4297 unsigned long *ignored)
4298 { 4298 {
4299 unsigned long node_start_pfn, node_end_pfn; 4299 unsigned long node_start_pfn, node_end_pfn;
4300 unsigned long zone_start_pfn, zone_end_pfn; 4300 unsigned long zone_start_pfn, zone_end_pfn;
4301 4301
4302 /* Get the start and end of the node and zone */ 4302 /* Get the start and end of the node and zone */
4303 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4303 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4304 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4304 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4305 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4305 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4306 adjust_zone_range_for_zone_movable(nid, zone_type, 4306 adjust_zone_range_for_zone_movable(nid, zone_type,
4307 node_start_pfn, node_end_pfn, 4307 node_start_pfn, node_end_pfn,
4308 &zone_start_pfn, &zone_end_pfn); 4308 &zone_start_pfn, &zone_end_pfn);
4309 4309
4310 /* Check that this node has pages within the zone's required range */ 4310 /* Check that this node has pages within the zone's required range */
4311 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4311 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4312 return 0; 4312 return 0;
4313 4313
4314 /* Move the zone boundaries inside the node if necessary */ 4314 /* Move the zone boundaries inside the node if necessary */
4315 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4315 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4316 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4316 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4317 4317
4318 /* Return the spanned pages */ 4318 /* Return the spanned pages */
4319 return zone_end_pfn - zone_start_pfn; 4319 return zone_end_pfn - zone_start_pfn;
4320 } 4320 }
4321 4321
4322 /* 4322 /*
4323 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4323 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4324 * then all holes in the requested range will be accounted for. 4324 * then all holes in the requested range will be accounted for.
4325 */ 4325 */
4326 unsigned long __meminit __absent_pages_in_range(int nid, 4326 unsigned long __meminit __absent_pages_in_range(int nid,
4327 unsigned long range_start_pfn, 4327 unsigned long range_start_pfn,
4328 unsigned long range_end_pfn) 4328 unsigned long range_end_pfn)
4329 { 4329 {
4330 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4330 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4331 unsigned long start_pfn, end_pfn; 4331 unsigned long start_pfn, end_pfn;
4332 int i; 4332 int i;
4333 4333
4334 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4334 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4335 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4335 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4336 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4336 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4337 nr_absent -= end_pfn - start_pfn; 4337 nr_absent -= end_pfn - start_pfn;
4338 } 4338 }
4339 return nr_absent; 4339 return nr_absent;
4340 } 4340 }
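
In other words, the hole count is the full span minus every registered present range clamped to that span. A self-contained sketch of the same accounting, with two hard-coded present ranges standing in for the memblock data:

/* Sketch of the hole accounting above; not kernel code. */
#include <stdio.h>

struct pfn_range { unsigned long start, end; };

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

static unsigned long absent_pages(const struct pfn_range *r, int nr,
				  unsigned long range_start, unsigned long range_end)
{
	unsigned long nr_absent = range_end - range_start;
	int i;

	for (i = 0; i < nr; i++) {
		unsigned long s = clamp_ul(r[i].start, range_start, range_end);
		unsigned long e = clamp_ul(r[i].end, range_start, range_end);

		nr_absent -= e - s;	/* present pages are not holes */
	}
	return nr_absent;
}

int main(void)
{
	/* two present ranges inside pfns [0, 1000): 0-300 and 600-1000 */
	struct pfn_range present[] = { { 0, 300 }, { 600, 1000 } };

	printf("holes=%lu\n", absent_pages(present, 2, 0, 1000));
	return 0;
}

Two present ranges of 300 and 400 pages inside a 1000-page span leave holes=300.
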
4341 4341
4342 /** 4342 /**
4343 * absent_pages_in_range - Return number of page frames in holes within a range 4343 * absent_pages_in_range - Return number of page frames in holes within a range
4344 * @start_pfn: The start PFN to start searching for holes 4344 * @start_pfn: The start PFN to start searching for holes
4345 * @end_pfn: The end PFN to stop searching for holes 4345 * @end_pfn: The end PFN to stop searching for holes
4346 * 4346 *
4347 * It returns the number of page frames in memory holes within a range. 4347 * It returns the number of page frames in memory holes within a range.
4348 */ 4348 */
4349 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4349 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4350 unsigned long end_pfn) 4350 unsigned long end_pfn)
4351 { 4351 {
4352 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4352 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4353 } 4353 }
4354 4354
4355 /* Return the number of page frames in holes in a zone on a node */ 4355 /* Return the number of page frames in holes in a zone on a node */
4356 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4356 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4357 unsigned long zone_type, 4357 unsigned long zone_type,
4358 unsigned long *ignored) 4358 unsigned long *ignored)
4359 { 4359 {
4360 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4360 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4361 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4361 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4362 unsigned long node_start_pfn, node_end_pfn; 4362 unsigned long node_start_pfn, node_end_pfn;
4363 unsigned long zone_start_pfn, zone_end_pfn; 4363 unsigned long zone_start_pfn, zone_end_pfn;
4364 4364
4365 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4365 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4366 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4366 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4367 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4367 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4368 4368
4369 adjust_zone_range_for_zone_movable(nid, zone_type, 4369 adjust_zone_range_for_zone_movable(nid, zone_type,
4370 node_start_pfn, node_end_pfn, 4370 node_start_pfn, node_end_pfn,
4371 &zone_start_pfn, &zone_end_pfn); 4371 &zone_start_pfn, &zone_end_pfn);
4372 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4372 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4373 } 4373 }
4374 4374
4375 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4375 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4376 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4376 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4377 unsigned long zone_type, 4377 unsigned long zone_type,
4378 unsigned long *zones_size) 4378 unsigned long *zones_size)
4379 { 4379 {
4380 return zones_size[zone_type]; 4380 return zones_size[zone_type];
4381 } 4381 }
4382 4382
4383 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4383 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4384 unsigned long zone_type, 4384 unsigned long zone_type,
4385 unsigned long *zholes_size) 4385 unsigned long *zholes_size)
4386 { 4386 {
4387 if (!zholes_size) 4387 if (!zholes_size)
4388 return 0; 4388 return 0;
4389 4389
4390 return zholes_size[zone_type]; 4390 return zholes_size[zone_type];
4391 } 4391 }
4392 4392
4393 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4393 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4394 4394
4395 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4395 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4396 unsigned long *zones_size, unsigned long *zholes_size) 4396 unsigned long *zones_size, unsigned long *zholes_size)
4397 { 4397 {
4398 unsigned long realtotalpages, totalpages = 0; 4398 unsigned long realtotalpages, totalpages = 0;
4399 enum zone_type i; 4399 enum zone_type i;
4400 4400
4401 for (i = 0; i < MAX_NR_ZONES; i++) 4401 for (i = 0; i < MAX_NR_ZONES; i++)
4402 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4402 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4403 zones_size); 4403 zones_size);
4404 pgdat->node_spanned_pages = totalpages; 4404 pgdat->node_spanned_pages = totalpages;
4405 4405
4406 realtotalpages = totalpages; 4406 realtotalpages = totalpages;
4407 for (i = 0; i < MAX_NR_ZONES; i++) 4407 for (i = 0; i < MAX_NR_ZONES; i++)
4408 realtotalpages -= 4408 realtotalpages -=
4409 zone_absent_pages_in_node(pgdat->node_id, i, 4409 zone_absent_pages_in_node(pgdat->node_id, i,
4410 zholes_size); 4410 zholes_size);
4411 pgdat->node_present_pages = realtotalpages; 4411 pgdat->node_present_pages = realtotalpages;
4412 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4412 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4413 realtotalpages); 4413 realtotalpages);
4414 } 4414 }
4415 4415
4416 #ifndef CONFIG_SPARSEMEM 4416 #ifndef CONFIG_SPARSEMEM
4417 /* 4417 /*
4418 * Calculate the size of the zone->blockflags rounded to an unsigned long 4418 * Calculate the size of the zone->blockflags rounded to an unsigned long
4419 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding 4419 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding
4420 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4420 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4421 * round what is now in bits to nearest long in bits, then return it in 4421 * round what is now in bits to nearest long in bits, then return it in
4422 * bytes. 4422 * bytes.
4423 */ 4423 */
4424 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4424 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4425 { 4425 {
4426 unsigned long usemapsize; 4426 unsigned long usemapsize;
4427 4427
4428 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4428 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4429 usemapsize = roundup(zonesize, pageblock_nr_pages); 4429 usemapsize = roundup(zonesize, pageblock_nr_pages);
4430 usemapsize = usemapsize >> pageblock_order; 4430 usemapsize = usemapsize >> pageblock_order;
4431 usemapsize *= NR_PAGEBLOCK_BITS; 4431 usemapsize *= NR_PAGEBLOCK_BITS;
4432 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4432 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4433 4433
4434 return usemapsize / 8; 4434 return usemapsize / 8;
4435 } 4435 }
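
The bitmap sizing above boils down to four rounding steps. The sketch below repeats them outside the kernel, assuming pageblock_order 9 and NR_PAGEBLOCK_BITS 4 (illustrative values; the real ones come from the kernel config):

/* Sketch of the usemap_size() arithmetic above; not kernel code. */
#include <stdio.h>

#define PAGEBLOCK_ORDER     9UL
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS   4UL

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

static unsigned long usemap_size(unsigned long zone_start_pfn,
				 unsigned long zonesize)
{
	unsigned long usemapsize;

	/* account for a zone start that is not pageblock aligned */
	zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);
	usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);
	usemapsize >>= PAGEBLOCK_ORDER;		/* number of pageblocks */
	usemapsize *= NR_PAGEBLOCK_BITS;	/* bits of flags per block */
	usemapsize = roundup_ul(usemapsize, 8 * sizeof(unsigned long));

	return usemapsize / 8;			/* bits -> bytes */
}

int main(void)
{
	/* a 262144-page zone starting at pfn 0 */
	printf("usemap bytes=%lu\n", usemap_size(0, 262144UL));
	return 0;
}

A 262144-page zone starting at an aligned pfn needs 512 pageblocks x 4 bits = 2048 bits, i.e. 256 bytes of pageblock flags.
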
4436 4436
4437 static void __init setup_usemap(struct pglist_data *pgdat, 4437 static void __init setup_usemap(struct pglist_data *pgdat,
4438 struct zone *zone, 4438 struct zone *zone,
4439 unsigned long zone_start_pfn, 4439 unsigned long zone_start_pfn,
4440 unsigned long zonesize) 4440 unsigned long zonesize)
4441 { 4441 {
4442 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4442 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4443 zone->pageblock_flags = NULL; 4443 zone->pageblock_flags = NULL;
4444 if (usemapsize) 4444 if (usemapsize)
4445 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4445 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4446 usemapsize); 4446 usemapsize);
4447 } 4447 }
4448 #else 4448 #else
4449 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4449 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4450 unsigned long zone_start_pfn, unsigned long zonesize) {} 4450 unsigned long zone_start_pfn, unsigned long zonesize) {}
4451 #endif /* CONFIG_SPARSEMEM */ 4451 #endif /* CONFIG_SPARSEMEM */
4452 4452
4453 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4453 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4454 4454
4455 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4455 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4456 void __init set_pageblock_order(void) 4456 void __init set_pageblock_order(void)
4457 { 4457 {
4458 unsigned int order; 4458 unsigned int order;
4459 4459
4460 /* Check that pageblock_nr_pages has not already been setup */ 4460 /* Check that pageblock_nr_pages has not already been setup */
4461 if (pageblock_order) 4461 if (pageblock_order)
4462 return; 4462 return;
4463 4463
4464 if (HPAGE_SHIFT > PAGE_SHIFT) 4464 if (HPAGE_SHIFT > PAGE_SHIFT)
4465 order = HUGETLB_PAGE_ORDER; 4465 order = HUGETLB_PAGE_ORDER;
4466 else 4466 else
4467 order = MAX_ORDER - 1; 4467 order = MAX_ORDER - 1;
4468 4468
4469 /* 4469 /*
4470 * Assume the largest contiguous order of interest is a huge page. 4470 * Assume the largest contiguous order of interest is a huge page.
4471 * This value may be variable depending on boot parameters on IA64 and 4471 * This value may be variable depending on boot parameters on IA64 and
4472 * powerpc. 4472 * powerpc.
4473 */ 4473 */
4474 pageblock_order = order; 4474 pageblock_order = order;
4475 } 4475 }
4476 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4476 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4477 4477
4478 /* 4478 /*
4479 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4479 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4480 * is unused as pageblock_order is set at compile-time. See 4480 * is unused as pageblock_order is set at compile-time. See
4481 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4481 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4482 * the kernel config 4482 * the kernel config
4483 */ 4483 */
4484 void __init set_pageblock_order(void) 4484 void __init set_pageblock_order(void)
4485 { 4485 {
4486 } 4486 }
4487 4487
4488 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4488 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4489 4489
4490 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4490 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4491 unsigned long present_pages) 4491 unsigned long present_pages)
4492 { 4492 {
4493 unsigned long pages = spanned_pages; 4493 unsigned long pages = spanned_pages;
4494 4494
4495 /* 4495 /*
4496 * Provide a more accurate estimation if there are holes within 4496 * Provide a more accurate estimation if there are holes within
4497 * the zone and SPARSEMEM is in use. If there are holes within the 4497 * the zone and SPARSEMEM is in use. If there are holes within the
4498 * zone, each populated memory region may cost us one or two extra 4498 * zone, each populated memory region may cost us one or two extra
4499 * memmap pages due to alignment because memmap pages for each 4499 * memmap pages due to alignment because memmap pages for each
4500 * populated region may not be naturally aligned on page boundaries. 4500 * populated region may not be naturally aligned on page boundaries.
4501 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4501 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4502 */ 4502 */
4503 if (spanned_pages > present_pages + (present_pages >> 4) && 4503 if (spanned_pages > present_pages + (present_pages >> 4) &&
4504 IS_ENABLED(CONFIG_SPARSEMEM)) 4504 IS_ENABLED(CONFIG_SPARSEMEM))
4505 pages = present_pages; 4505 pages = present_pages;
4506 4506
4507 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4507 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4508 } 4508 }
4509 4509
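As a rough userspace sketch of the calc_memmap_size() heuristic just above: the page size (4 KiB) and the size of struct page (64 bytes) are assumptions chosen for illustration, and the IS_ENABLED(CONFIG_SPARSEMEM) check is dropped. For a zone spanning 262144 pages with only 196608 present, the hole exceeds present_pages/16, so the estimate falls back to present_pages.

/* Sketch of the calc_memmap_size() arithmetic; sizes are assumptions. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define STRUCT_PAGE_SIZE 64UL   /* illustrative only */

static unsigned long memmap_size(unsigned long spanned, unsigned long present)
{
        unsigned long pages = spanned;

        /* Fall back to present_pages when the zone is very sparse. */
        if (spanned > present + (present >> 4))
                pages = present;

        return PAGE_ALIGN(pages * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;
}

int main(void)
{
        /* 1 GiB zone (262144 pages) with a 256 MiB hole. */
        printf("memmap pages: %lu\n", memmap_size(262144, 196608));
        return 0;
}
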
4510 /* 4510 /*
4511 * Set up the zone data structures: 4511 * Set up the zone data structures:
4512 * - mark all pages reserved 4512 * - mark all pages reserved
4513 * - mark all memory queues empty 4513 * - mark all memory queues empty
4514 * - clear the memory bitmaps 4514 * - clear the memory bitmaps
4515 * 4515 *
4516 * NOTE: pgdat should get zeroed by caller. 4516 * NOTE: pgdat should get zeroed by caller.
4517 */ 4517 */
4518 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4518 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4519 unsigned long *zones_size, unsigned long *zholes_size) 4519 unsigned long *zones_size, unsigned long *zholes_size)
4520 { 4520 {
4521 enum zone_type j; 4521 enum zone_type j;
4522 int nid = pgdat->node_id; 4522 int nid = pgdat->node_id;
4523 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4523 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4524 int ret; 4524 int ret;
4525 4525
4526 pgdat_resize_init(pgdat); 4526 pgdat_resize_init(pgdat);
4527 #ifdef CONFIG_NUMA_BALANCING 4527 #ifdef CONFIG_NUMA_BALANCING
4528 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4528 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4529 pgdat->numabalancing_migrate_nr_pages = 0; 4529 pgdat->numabalancing_migrate_nr_pages = 0;
4530 pgdat->numabalancing_migrate_next_window = jiffies; 4530 pgdat->numabalancing_migrate_next_window = jiffies;
4531 #endif 4531 #endif
4532 init_waitqueue_head(&pgdat->kswapd_wait); 4532 init_waitqueue_head(&pgdat->kswapd_wait);
4533 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4533 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4534 pgdat_page_cgroup_init(pgdat); 4534 pgdat_page_cgroup_init(pgdat);
4535 4535
4536 for (j = 0; j < MAX_NR_ZONES; j++) { 4536 for (j = 0; j < MAX_NR_ZONES; j++) {
4537 struct zone *zone = pgdat->node_zones + j; 4537 struct zone *zone = pgdat->node_zones + j;
4538 unsigned long size, realsize, freesize, memmap_pages; 4538 unsigned long size, realsize, freesize, memmap_pages;
4539 4539
4540 size = zone_spanned_pages_in_node(nid, j, zones_size); 4540 size = zone_spanned_pages_in_node(nid, j, zones_size);
4541 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4541 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4542 zholes_size); 4542 zholes_size);
4543 4543
4544 /* 4544 /*
4545 * Adjust freesize so that it accounts for how much memory 4545 * Adjust freesize so that it accounts for how much memory
4546 * is used by this zone for memmap. This affects the watermark 4546 * is used by this zone for memmap. This affects the watermark
4547 * and per-cpu initialisations 4547 * and per-cpu initialisations
4548 */ 4548 */
4549 memmap_pages = calc_memmap_size(size, realsize); 4549 memmap_pages = calc_memmap_size(size, realsize);
4550 if (freesize >= memmap_pages) { 4550 if (freesize >= memmap_pages) {
4551 freesize -= memmap_pages; 4551 freesize -= memmap_pages;
4552 if (memmap_pages) 4552 if (memmap_pages)
4553 printk(KERN_DEBUG 4553 printk(KERN_DEBUG
4554 " %s zone: %lu pages used for memmap\n", 4554 " %s zone: %lu pages used for memmap\n",
4555 zone_names[j], memmap_pages); 4555 zone_names[j], memmap_pages);
4556 } else 4556 } else
4557 printk(KERN_WARNING 4557 printk(KERN_WARNING
4558 " %s zone: %lu pages exceeds freesize %lu\n", 4558 " %s zone: %lu pages exceeds freesize %lu\n",
4559 zone_names[j], memmap_pages, freesize); 4559 zone_names[j], memmap_pages, freesize);
4560 4560
4561 /* Account for reserved pages */ 4561 /* Account for reserved pages */
4562 if (j == 0 && freesize > dma_reserve) { 4562 if (j == 0 && freesize > dma_reserve) {
4563 freesize -= dma_reserve; 4563 freesize -= dma_reserve;
4564 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4564 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4565 zone_names[0], dma_reserve); 4565 zone_names[0], dma_reserve);
4566 } 4566 }
4567 4567
4568 if (!is_highmem_idx(j)) 4568 if (!is_highmem_idx(j))
4569 nr_kernel_pages += freesize; 4569 nr_kernel_pages += freesize;
4570 /* Charge for highmem memmap if there are enough kernel pages */ 4570 /* Charge for highmem memmap if there are enough kernel pages */
4571 else if (nr_kernel_pages > memmap_pages * 2) 4571 else if (nr_kernel_pages > memmap_pages * 2)
4572 nr_kernel_pages -= memmap_pages; 4572 nr_kernel_pages -= memmap_pages;
4573 nr_all_pages += freesize; 4573 nr_all_pages += freesize;
4574 4574
4575 zone->spanned_pages = size; 4575 zone->spanned_pages = size;
4576 zone->present_pages = freesize; 4576 zone->present_pages = freesize;
4577 /* 4577 /*
4578 * Set an approximate value for lowmem here, it will be adjusted 4578 * Set an approximate value for lowmem here, it will be adjusted
4579 * when the bootmem allocator frees pages into the buddy system. 4579 * when the bootmem allocator frees pages into the buddy system.
4580 * And all highmem pages will be managed by the buddy system. 4580 * And all highmem pages will be managed by the buddy system.
4581 */ 4581 */
4582 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4582 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4583 #ifdef CONFIG_NUMA 4583 #ifdef CONFIG_NUMA
4584 zone->node = nid; 4584 zone->node = nid;
4585 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4585 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4586 / 100; 4586 / 100;
4587 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4587 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4588 #endif 4588 #endif
4589 zone->name = zone_names[j]; 4589 zone->name = zone_names[j];
4590 spin_lock_init(&zone->lock); 4590 spin_lock_init(&zone->lock);
4591 spin_lock_init(&zone->lru_lock); 4591 spin_lock_init(&zone->lru_lock);
4592 zone_seqlock_init(zone); 4592 zone_seqlock_init(zone);
4593 zone->zone_pgdat = pgdat; 4593 zone->zone_pgdat = pgdat;
4594 4594
4595 zone_pcp_init(zone); 4595 zone_pcp_init(zone);
4596 lruvec_init(&zone->lruvec); 4596 lruvec_init(&zone->lruvec);
4597 if (!size) 4597 if (!size)
4598 continue; 4598 continue;
4599 4599
4600 set_pageblock_order(); 4600 set_pageblock_order();
4601 setup_usemap(pgdat, zone, zone_start_pfn, size); 4601 setup_usemap(pgdat, zone, zone_start_pfn, size);
4602 ret = init_currently_empty_zone(zone, zone_start_pfn, 4602 ret = init_currently_empty_zone(zone, zone_start_pfn,
4603 size, MEMMAP_EARLY); 4603 size, MEMMAP_EARLY);
4604 BUG_ON(ret); 4604 BUG_ON(ret);
4605 memmap_init(size, nid, j, zone_start_pfn); 4605 memmap_init(size, nid, j, zone_start_pfn);
4606 zone_start_pfn += size; 4606 zone_start_pfn += size;
4607 } 4607 }
4608 } 4608 }
4609 4609
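The freesize bookkeeping in the zone loop above can be followed with invented numbers: spanned pages minus holes gives realsize, the memmap estimate is charged against it, and dma_reserve is additionally charged to zone 0 only. A minimal sketch, assuming those made-up counts:

/* Worked example of the freesize accounting in free_area_init_core();
 * all numbers are invented for illustration. */
#include <stdio.h>

int main(void)
{
        unsigned long size = 262144;        /* spanned pages in the zone */
        unsigned long holes = 4096;         /* absent pages              */
        unsigned long memmap_pages = 4032;  /* from calc_memmap_size()   */
        unsigned long dma_reserve = 1024;   /* only charged to zone 0    */
        unsigned long realsize = size - holes;
        unsigned long freesize = realsize;

        if (freesize >= memmap_pages)
                freesize -= memmap_pages;
        if (freesize > dma_reserve)         /* the j == 0 case */
                freesize -= dma_reserve;

        printf("realsize=%lu freesize=%lu\n", realsize, freesize);
        return 0;
}
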
4610 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4610 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4611 { 4611 {
4612 /* Skip empty nodes */ 4612 /* Skip empty nodes */
4613 if (!pgdat->node_spanned_pages) 4613 if (!pgdat->node_spanned_pages)
4614 return; 4614 return;
4615 4615
4616 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4616 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4617 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4617 /* ia64 gets its own node_mem_map, before this, without bootmem */
4618 if (!pgdat->node_mem_map) { 4618 if (!pgdat->node_mem_map) {
4619 unsigned long size, start, end; 4619 unsigned long size, start, end;
4620 struct page *map; 4620 struct page *map;
4621 4621
4622 /* 4622 /*
4623 * The zone's endpoints aren't required to be MAX_ORDER 4623 * The zone's endpoints aren't required to be MAX_ORDER
4624 * aligned but the node_mem_map endpoints must be in order 4624 * aligned but the node_mem_map endpoints must be in order
4625 * for the buddy allocator to function correctly. 4625 * for the buddy allocator to function correctly.
4626 */ 4626 */
4627 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4627 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4628 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 4628 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
4629 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4629 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4630 size = (end - start) * sizeof(struct page); 4630 size = (end - start) * sizeof(struct page);
4631 map = alloc_remap(pgdat->node_id, size); 4631 map = alloc_remap(pgdat->node_id, size);
4632 if (!map) 4632 if (!map)
4633 map = alloc_bootmem_node_nopanic(pgdat, size); 4633 map = alloc_bootmem_node_nopanic(pgdat, size);
4634 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4634 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4635 } 4635 }
4636 #ifndef CONFIG_NEED_MULTIPLE_NODES 4636 #ifndef CONFIG_NEED_MULTIPLE_NODES
4637 /* 4637 /*
4638 * With no DISCONTIG, the global mem_map is just set as node 0's 4638 * With no DISCONTIG, the global mem_map is just set as node 0's
4639 */ 4639 */
4640 if (pgdat == NODE_DATA(0)) { 4640 if (pgdat == NODE_DATA(0)) {
4641 mem_map = NODE_DATA(0)->node_mem_map; 4641 mem_map = NODE_DATA(0)->node_mem_map;
4642 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4642 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4643 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4643 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4644 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4644 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4645 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4645 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4646 } 4646 }
4647 #endif 4647 #endif
4648 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4648 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4649 } 4649 }
4650 4650
4651 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4651 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4652 unsigned long node_start_pfn, unsigned long *zholes_size) 4652 unsigned long node_start_pfn, unsigned long *zholes_size)
4653 { 4653 {
4654 pg_data_t *pgdat = NODE_DATA(nid); 4654 pg_data_t *pgdat = NODE_DATA(nid);
4655 4655
4656 /* pg_data_t should be reset to zero when it's allocated */ 4656 /* pg_data_t should be reset to zero when it's allocated */
4657 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4657 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4658 4658
4659 pgdat->node_id = nid; 4659 pgdat->node_id = nid;
4660 pgdat->node_start_pfn = node_start_pfn; 4660 pgdat->node_start_pfn = node_start_pfn;
4661 init_zone_allows_reclaim(nid); 4661 init_zone_allows_reclaim(nid);
4662 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4662 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4663 4663
4664 alloc_node_mem_map(pgdat); 4664 alloc_node_mem_map(pgdat);
4665 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4665 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4666 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4666 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4667 nid, (unsigned long)pgdat, 4667 nid, (unsigned long)pgdat,
4668 (unsigned long)pgdat->node_mem_map); 4668 (unsigned long)pgdat->node_mem_map);
4669 #endif 4669 #endif
4670 4670
4671 free_area_init_core(pgdat, zones_size, zholes_size); 4671 free_area_init_core(pgdat, zones_size, zholes_size);
4672 } 4672 }
4673 4673
4674 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4674 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4675 4675
4676 #if MAX_NUMNODES > 1 4676 #if MAX_NUMNODES > 1
4677 /* 4677 /*
4678 * Figure out the number of possible node ids. 4678 * Figure out the number of possible node ids.
4679 */ 4679 */
4680 static void __init setup_nr_node_ids(void) 4680 static void __init setup_nr_node_ids(void)
4681 { 4681 {
4682 unsigned int node; 4682 unsigned int node;
4683 unsigned int highest = 0; 4683 unsigned int highest = 0;
4684 4684
4685 for_each_node_mask(node, node_possible_map) 4685 for_each_node_mask(node, node_possible_map)
4686 highest = node; 4686 highest = node;
4687 nr_node_ids = highest + 1; 4687 nr_node_ids = highest + 1;
4688 } 4688 }
4689 #else 4689 #else
4690 static inline void setup_nr_node_ids(void) 4690 static inline void setup_nr_node_ids(void)
4691 { 4691 {
4692 } 4692 }
4693 #endif 4693 #endif
4694 4694
4695 /** 4695 /**
4696 * node_map_pfn_alignment - determine the maximum internode alignment 4696 * node_map_pfn_alignment - determine the maximum internode alignment
4697 * 4697 *
4698 * This function should be called after node map is populated and sorted. 4698 * This function should be called after node map is populated and sorted.
4699 * It calculates the maximum power of two alignment which can distinguish 4699 * It calculates the maximum power of two alignment which can distinguish
4700 * all the nodes. 4700 * all the nodes.
4701 * 4701 *
4702 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4702 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4703 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4703 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4704 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4704 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4705 * shifted, 1GiB is enough and this function will indicate so. 4705 * shifted, 1GiB is enough and this function will indicate so.
4706 * 4706 *
4707 * This is used to test whether pfn -> nid mapping of the chosen memory 4707 * This is used to test whether pfn -> nid mapping of the chosen memory
4708 * model has fine enough granularity to avoid incorrect mapping for the 4708 * model has fine enough granularity to avoid incorrect mapping for the
4709 * populated node map. 4709 * populated node map.
4710 * 4710 *
4711 * Returns the determined alignment in pfn's. 0 if there is no alignment 4711 * Returns the determined alignment in pfn's. 0 if there is no alignment
4712 * requirement (single node). 4712 * requirement (single node).
4713 */ 4713 */
4714 unsigned long __init node_map_pfn_alignment(void) 4714 unsigned long __init node_map_pfn_alignment(void)
4715 { 4715 {
4716 unsigned long accl_mask = 0, last_end = 0; 4716 unsigned long accl_mask = 0, last_end = 0;
4717 unsigned long start, end, mask; 4717 unsigned long start, end, mask;
4718 int last_nid = -1; 4718 int last_nid = -1;
4719 int i, nid; 4719 int i, nid;
4720 4720
4721 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4721 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4722 if (!start || last_nid < 0 || last_nid == nid) { 4722 if (!start || last_nid < 0 || last_nid == nid) {
4723 last_nid = nid; 4723 last_nid = nid;
4724 last_end = end; 4724 last_end = end;
4725 continue; 4725 continue;
4726 } 4726 }
4727 4727
4728 /* 4728 /*
4729 * Start with a mask granular enough to pin-point to the 4729 * Start with a mask granular enough to pin-point to the
4730 * start pfn and tick off bits one-by-one until it becomes 4730 * start pfn and tick off bits one-by-one until it becomes
4731 * too coarse to separate the current node from the last. 4731 * too coarse to separate the current node from the last.
4732 */ 4732 */
4733 mask = ~((1 << __ffs(start)) - 1); 4733 mask = ~((1 << __ffs(start)) - 1);
4734 while (mask && last_end <= (start & (mask << 1))) 4734 while (mask && last_end <= (start & (mask << 1)))
4735 mask <<= 1; 4735 mask <<= 1;
4736 4736
4737 /* accumulate all internode masks */ 4737 /* accumulate all internode masks */
4738 accl_mask |= mask; 4738 accl_mask |= mask;
4739 } 4739 }
4740 4740
4741 /* convert mask to number of pages */ 4741 /* convert mask to number of pages */
4742 return ~accl_mask + 1; 4742 return ~accl_mask + 1;
4743 } 4743 }
4744 4744
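The mask walk in node_map_pfn_alignment() can be exercised in isolation. The sketch below hard-codes a two-node PFN layout (invented values) instead of iterating memblock, and uses the GCC/Clang __builtin_ctzl() in place of the kernel's __ffs(); it mirrors the listed loop, including the fact that last_nid/last_end are only refreshed in the first branch. For node 0 at [0x0, 0x40000) and node 1 starting at 0x48000 it reports an alignment of 0x40000 PFNs.

/* Sketch of the node_map_pfn_alignment() mask walk over a hard-coded
 * two-node layout; the real code iterates the memblock ranges. */
#include <stdio.h>

struct range { unsigned long start, end; int nid; };

int main(void)
{
        struct range map[] = {
                { 0x00000, 0x40000, 0 },
                { 0x48000, 0x88000, 1 },
        };
        unsigned long accl_mask = 0, last_end = 0;
        int last_nid = -1;
        unsigned int i;

        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                unsigned long start = map[i].start, end = map[i].end;
                unsigned long mask;
                int nid = map[i].nid;

                if (!start || last_nid < 0 || last_nid == nid) {
                        last_nid = nid;
                        last_end = end;
                        continue;
                }

                /* shrink the mask until it can no longer separate the nodes */
                mask = ~((1UL << __builtin_ctzl(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;

                accl_mask |= mask;
        }

        printf("alignment: %#lx pfns\n", ~accl_mask + 1);
        return 0;
}
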
4745 /* Find the lowest pfn for a node */ 4745 /* Find the lowest pfn for a node */
4746 static unsigned long __init find_min_pfn_for_node(int nid) 4746 static unsigned long __init find_min_pfn_for_node(int nid)
4747 { 4747 {
4748 unsigned long min_pfn = ULONG_MAX; 4748 unsigned long min_pfn = ULONG_MAX;
4749 unsigned long start_pfn; 4749 unsigned long start_pfn;
4750 int i; 4750 int i;
4751 4751
4752 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 4752 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4753 min_pfn = min(min_pfn, start_pfn); 4753 min_pfn = min(min_pfn, start_pfn);
4754 4754
4755 if (min_pfn == ULONG_MAX) { 4755 if (min_pfn == ULONG_MAX) {
4756 printk(KERN_WARNING 4756 printk(KERN_WARNING
4757 "Could not find start_pfn for node %d\n", nid); 4757 "Could not find start_pfn for node %d\n", nid);
4758 return 0; 4758 return 0;
4759 } 4759 }
4760 4760
4761 return min_pfn; 4761 return min_pfn;
4762 } 4762 }
4763 4763
4764 /** 4764 /**
4765 * find_min_pfn_with_active_regions - Find the minimum PFN registered 4765 * find_min_pfn_with_active_regions - Find the minimum PFN registered
4766 * 4766 *
4767 * It returns the minimum PFN based on information provided via 4767 * It returns the minimum PFN based on information provided via
4768 * add_active_range(). 4768 * add_active_range().
4769 */ 4769 */
4770 unsigned long __init find_min_pfn_with_active_regions(void) 4770 unsigned long __init find_min_pfn_with_active_regions(void)
4771 { 4771 {
4772 return find_min_pfn_for_node(MAX_NUMNODES); 4772 return find_min_pfn_for_node(MAX_NUMNODES);
4773 } 4773 }
4774 4774
4775 /* 4775 /*
4776 * early_calculate_totalpages() 4776 * early_calculate_totalpages()
4777 * Sum pages in active regions for movable zone. 4777 * Sum pages in active regions for movable zone.
4778 * Populate N_MEMORY for calculating usable_nodes. 4778 * Populate N_MEMORY for calculating usable_nodes.
4779 */ 4779 */
4780 static unsigned long __init early_calculate_totalpages(void) 4780 static unsigned long __init early_calculate_totalpages(void)
4781 { 4781 {
4782 unsigned long totalpages = 0; 4782 unsigned long totalpages = 0;
4783 unsigned long start_pfn, end_pfn; 4783 unsigned long start_pfn, end_pfn;
4784 int i, nid; 4784 int i, nid;
4785 4785
4786 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 4786 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4787 unsigned long pages = end_pfn - start_pfn; 4787 unsigned long pages = end_pfn - start_pfn;
4788 4788
4789 totalpages += pages; 4789 totalpages += pages;
4790 if (pages) 4790 if (pages)
4791 node_set_state(nid, N_MEMORY); 4791 node_set_state(nid, N_MEMORY);
4792 } 4792 }
4793 return totalpages; 4793 return totalpages;
4794 } 4794 }
4795 4795
4796 /* 4796 /*
4797 * Find the PFN the Movable zone begins in each node. Kernel memory 4797 * Find the PFN the Movable zone begins in each node. Kernel memory
4798 * is spread evenly between nodes as long as the nodes have enough 4798 * is spread evenly between nodes as long as the nodes have enough
4799 * memory. When they don't, some nodes will have more kernelcore than 4799 * memory. When they don't, some nodes will have more kernelcore than
4800 * others. 4800 * others.
4801 */ 4801 */
4802 static void __init find_zone_movable_pfns_for_nodes(void) 4802 static void __init find_zone_movable_pfns_for_nodes(void)
4803 { 4803 {
4804 int i, nid; 4804 int i, nid;
4805 unsigned long usable_startpfn; 4805 unsigned long usable_startpfn;
4806 unsigned long kernelcore_node, kernelcore_remaining; 4806 unsigned long kernelcore_node, kernelcore_remaining;
4807 /* save the state before borrow the nodemask */ 4807 /* save the state before borrow the nodemask */
4808 nodemask_t saved_node_state = node_states[N_MEMORY]; 4808 nodemask_t saved_node_state = node_states[N_MEMORY];
4809 unsigned long totalpages = early_calculate_totalpages(); 4809 unsigned long totalpages = early_calculate_totalpages();
4810 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 4810 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
4811 4811
4812 /* 4812 /*
4813 * If movablecore was specified, calculate what size of 4813 * If movablecore was specified, calculate what size of
4814 * kernelcore that corresponds so that memory usable for 4814 * kernelcore that corresponds so that memory usable for
4815 * any allocation type is evenly spread. If both kernelcore 4815 * any allocation type is evenly spread. If both kernelcore
4816 * and movablecore are specified, then the value of kernelcore 4816 * and movablecore are specified, then the value of kernelcore
4817 * will be used for required_kernelcore if it's greater than 4817 * will be used for required_kernelcore if it's greater than
4818 * what movablecore would have allowed. 4818 * what movablecore would have allowed.
4819 */ 4819 */
4820 if (required_movablecore) { 4820 if (required_movablecore) {
4821 unsigned long corepages; 4821 unsigned long corepages;
4822 4822
4823 /* 4823 /*
4824 * Round-up so that ZONE_MOVABLE is at least as large as what 4824 * Round-up so that ZONE_MOVABLE is at least as large as what
4825 * was requested by the user 4825 * was requested by the user
4826 */ 4826 */
4827 required_movablecore = 4827 required_movablecore =
4828 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 4828 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
4829 corepages = totalpages - required_movablecore; 4829 corepages = totalpages - required_movablecore;
4830 4830
4831 required_kernelcore = max(required_kernelcore, corepages); 4831 required_kernelcore = max(required_kernelcore, corepages);
4832 } 4832 }
4833 4833
4834 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4834 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
4835 if (!required_kernelcore) 4835 if (!required_kernelcore)
4836 goto out; 4836 goto out;
4837 4837
4838 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4838 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4839 find_usable_zone_for_movable(); 4839 find_usable_zone_for_movable();
4840 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 4840 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4841 4841
4842 restart: 4842 restart:
4843 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4843 /* Spread kernelcore memory as evenly as possible throughout nodes */
4844 kernelcore_node = required_kernelcore / usable_nodes; 4844 kernelcore_node = required_kernelcore / usable_nodes;
4845 for_each_node_state(nid, N_MEMORY) { 4845 for_each_node_state(nid, N_MEMORY) {
4846 unsigned long start_pfn, end_pfn; 4846 unsigned long start_pfn, end_pfn;
4847 4847
4848 /* 4848 /*
4849 * Recalculate kernelcore_node if the division per node 4849 * Recalculate kernelcore_node if the division per node
4850 * now exceeds what is necessary to satisfy the requested 4850 * now exceeds what is necessary to satisfy the requested
4851 * amount of memory for the kernel 4851 * amount of memory for the kernel
4852 */ 4852 */
4853 if (required_kernelcore < kernelcore_node) 4853 if (required_kernelcore < kernelcore_node)
4854 kernelcore_node = required_kernelcore / usable_nodes; 4854 kernelcore_node = required_kernelcore / usable_nodes;
4855 4855
4856 /* 4856 /*
4857 * As the map is walked, we track how much memory is usable 4857 * As the map is walked, we track how much memory is usable
4858 * by the kernel using kernelcore_remaining. When it is 4858 * by the kernel using kernelcore_remaining. When it is
4859 * 0, the rest of the node is usable by ZONE_MOVABLE 4859 * 0, the rest of the node is usable by ZONE_MOVABLE
4860 */ 4860 */
4861 kernelcore_remaining = kernelcore_node; 4861 kernelcore_remaining = kernelcore_node;
4862 4862
4863 /* Go through each range of PFNs within this node */ 4863 /* Go through each range of PFNs within this node */
4864 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4864 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4865 unsigned long size_pages; 4865 unsigned long size_pages;
4866 4866
4867 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 4867 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4868 if (start_pfn >= end_pfn) 4868 if (start_pfn >= end_pfn)
4869 continue; 4869 continue;
4870 4870
4871 /* Account for what is only usable for kernelcore */ 4871 /* Account for what is only usable for kernelcore */
4872 if (start_pfn < usable_startpfn) { 4872 if (start_pfn < usable_startpfn) {
4873 unsigned long kernel_pages; 4873 unsigned long kernel_pages;
4874 kernel_pages = min(end_pfn, usable_startpfn) 4874 kernel_pages = min(end_pfn, usable_startpfn)
4875 - start_pfn; 4875 - start_pfn;
4876 4876
4877 kernelcore_remaining -= min(kernel_pages, 4877 kernelcore_remaining -= min(kernel_pages,
4878 kernelcore_remaining); 4878 kernelcore_remaining);
4879 required_kernelcore -= min(kernel_pages, 4879 required_kernelcore -= min(kernel_pages,
4880 required_kernelcore); 4880 required_kernelcore);
4881 4881
4882 /* Continue if range is now fully accounted */ 4882 /* Continue if range is now fully accounted */
4883 if (end_pfn <= usable_startpfn) { 4883 if (end_pfn <= usable_startpfn) {
4884 4884
4885 /* 4885 /*
4886 * Push zone_movable_pfn to the end so 4886 * Push zone_movable_pfn to the end so
4887 * that if we have to rebalance 4887 * that if we have to rebalance
4888 * kernelcore across nodes, we will 4888 * kernelcore across nodes, we will
4889 * not double account here 4889 * not double account here
4890 */ 4890 */
4891 zone_movable_pfn[nid] = end_pfn; 4891 zone_movable_pfn[nid] = end_pfn;
4892 continue; 4892 continue;
4893 } 4893 }
4894 start_pfn = usable_startpfn; 4894 start_pfn = usable_startpfn;
4895 } 4895 }
4896 4896
4897 /* 4897 /*
4898 * The usable PFN range for ZONE_MOVABLE is from 4898 * The usable PFN range for ZONE_MOVABLE is from
4899 * start_pfn->end_pfn. Calculate size_pages as the 4899 * start_pfn->end_pfn. Calculate size_pages as the
4900 * number of pages used as kernelcore 4900 * number of pages used as kernelcore
4901 */ 4901 */
4902 size_pages = end_pfn - start_pfn; 4902 size_pages = end_pfn - start_pfn;
4903 if (size_pages > kernelcore_remaining) 4903 if (size_pages > kernelcore_remaining)
4904 size_pages = kernelcore_remaining; 4904 size_pages = kernelcore_remaining;
4905 zone_movable_pfn[nid] = start_pfn + size_pages; 4905 zone_movable_pfn[nid] = start_pfn + size_pages;
4906 4906
4907 /* 4907 /*
4908 * Some kernelcore has been met, update counts and 4908 * Some kernelcore has been met, update counts and
4909 * break if the kernelcore for this node has been 4909 * break if the kernelcore for this node has been
4910 * satisfied 4910 * satisfied
4911 */ 4911 */
4912 required_kernelcore -= min(required_kernelcore, 4912 required_kernelcore -= min(required_kernelcore,
4913 size_pages); 4913 size_pages);
4914 kernelcore_remaining -= size_pages; 4914 kernelcore_remaining -= size_pages;
4915 if (!kernelcore_remaining) 4915 if (!kernelcore_remaining)
4916 break; 4916 break;
4917 } 4917 }
4918 } 4918 }
4919 4919
4920 /* 4920 /*
4921 * If there is still required_kernelcore, we do another pass with one 4921 * If there is still required_kernelcore, we do another pass with one
4922 * less node in the count. This will push zone_movable_pfn[nid] further 4922 * less node in the count. This will push zone_movable_pfn[nid] further
4923 * along on the nodes that still have memory until kernelcore is 4923 * along on the nodes that still have memory until kernelcore is
4924 * satisfied 4924 * satisfied
4925 */ 4925 */
4926 usable_nodes--; 4926 usable_nodes--;
4927 if (usable_nodes && required_kernelcore > usable_nodes) 4927 if (usable_nodes && required_kernelcore > usable_nodes)
4928 goto restart; 4928 goto restart;
4929 4929
4930 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 4930 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4931 for (nid = 0; nid < MAX_NUMNODES; nid++) 4931 for (nid = 0; nid < MAX_NUMNODES; nid++)
4932 zone_movable_pfn[nid] = 4932 zone_movable_pfn[nid] =
4933 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4933 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4934 4934
4935 out: 4935 out:
4936 /* restore the node_state */ 4936 /* restore the node_state */
4937 node_states[N_MEMORY] = saved_node_state; 4937 node_states[N_MEMORY] = saved_node_state;
4938 } 4938 }
4939 4939
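A deliberately simplified model of the kernelcore spreading above, not the exact kernel loop: two invented nodes, one too small for its even share, so the leftover is pushed onto the remaining node in a second pass (the code above achieves this through the restart label and the usable_nodes-- retry).

/* Toy walk-through of spreading required_kernelcore across two nodes;
 * all page counts are invented. */
#include <stdio.h>

int main(void)
{
        unsigned long node_pages[2] = { 65536, 262144 }; /* usable per node */
        unsigned long required_kernelcore = 262144;      /* requested total */
        unsigned long kernelcore_in_node[2] = { 0, 0 };
        int usable_nodes = 2;

        while (required_kernelcore && usable_nodes) {
                unsigned long share = required_kernelcore / usable_nodes;
                int nid;

                for (nid = 0; nid < 2; nid++) {
                        unsigned long room = node_pages[nid] -
                                             kernelcore_in_node[nid];
                        unsigned long take = share < room ? share : room;

                        if (take > required_kernelcore)
                                take = required_kernelcore;
                        kernelcore_in_node[nid] += take;
                        required_kernelcore -= take;
                }
                usable_nodes--;   /* mimic "restart with one less node" */
        }

        printf("kernelcore: node0=%lu node1=%lu pages (rest is ZONE_MOVABLE)\n",
               kernelcore_in_node[0], kernelcore_in_node[1]);
        return 0;
}
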
4940 /* Any regular or high memory on that node? */ 4940 /* Any regular or high memory on that node? */
4941 static void check_for_memory(pg_data_t *pgdat, int nid) 4941 static void check_for_memory(pg_data_t *pgdat, int nid)
4942 { 4942 {
4943 enum zone_type zone_type; 4943 enum zone_type zone_type;
4944 4944
4945 if (N_MEMORY == N_NORMAL_MEMORY) 4945 if (N_MEMORY == N_NORMAL_MEMORY)
4946 return; 4946 return;
4947 4947
4948 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 4948 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
4949 struct zone *zone = &pgdat->node_zones[zone_type]; 4949 struct zone *zone = &pgdat->node_zones[zone_type];
4950 if (zone->present_pages) { 4950 if (zone->present_pages) {
4951 node_set_state(nid, N_HIGH_MEMORY); 4951 node_set_state(nid, N_HIGH_MEMORY);
4952 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 4952 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
4953 zone_type <= ZONE_NORMAL) 4953 zone_type <= ZONE_NORMAL)
4954 node_set_state(nid, N_NORMAL_MEMORY); 4954 node_set_state(nid, N_NORMAL_MEMORY);
4955 break; 4955 break;
4956 } 4956 }
4957 } 4957 }
4958 } 4958 }
4959 4959
4960 /** 4960 /**
4961 * free_area_init_nodes - Initialise all pg_data_t and zone data 4961 * free_area_init_nodes - Initialise all pg_data_t and zone data
4962 * @max_zone_pfn: an array of max PFNs for each zone 4962 * @max_zone_pfn: an array of max PFNs for each zone
4963 * 4963 *
4964 * This will call free_area_init_node() for each active node in the system. 4964 * This will call free_area_init_node() for each active node in the system.
4965 * Using the page ranges provided by add_active_range(), the size of each 4965 * Using the page ranges provided by add_active_range(), the size of each
4966 * zone in each node and their holes is calculated. If the maximum PFN 4966 * zone in each node and their holes is calculated. If the maximum PFN
4967 * between two adjacent zones match, it is assumed that the zone is empty. 4967 * between two adjacent zones match, it is assumed that the zone is empty.
4968 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 4968 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
4969 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 4969 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
4970 * starts where the previous one ended. For example, ZONE_DMA32 starts 4970 * starts where the previous one ended. For example, ZONE_DMA32 starts
4971 * at arch_max_dma_pfn. 4971 * at arch_max_dma_pfn.
4972 */ 4972 */
4973 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 4973 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4974 { 4974 {
4975 unsigned long start_pfn, end_pfn; 4975 unsigned long start_pfn, end_pfn;
4976 int i, nid; 4976 int i, nid;
4977 4977
4978 /* Record where the zone boundaries are */ 4978 /* Record where the zone boundaries are */
4979 memset(arch_zone_lowest_possible_pfn, 0, 4979 memset(arch_zone_lowest_possible_pfn, 0,
4980 sizeof(arch_zone_lowest_possible_pfn)); 4980 sizeof(arch_zone_lowest_possible_pfn));
4981 memset(arch_zone_highest_possible_pfn, 0, 4981 memset(arch_zone_highest_possible_pfn, 0,
4982 sizeof(arch_zone_highest_possible_pfn)); 4982 sizeof(arch_zone_highest_possible_pfn));
4983 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 4983 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
4984 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 4984 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
4985 for (i = 1; i < MAX_NR_ZONES; i++) { 4985 for (i = 1; i < MAX_NR_ZONES; i++) {
4986 if (i == ZONE_MOVABLE) 4986 if (i == ZONE_MOVABLE)
4987 continue; 4987 continue;
4988 arch_zone_lowest_possible_pfn[i] = 4988 arch_zone_lowest_possible_pfn[i] =
4989 arch_zone_highest_possible_pfn[i-1]; 4989 arch_zone_highest_possible_pfn[i-1];
4990 arch_zone_highest_possible_pfn[i] = 4990 arch_zone_highest_possible_pfn[i] =
4991 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 4991 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
4992 } 4992 }
4993 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 4993 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
4994 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 4994 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
4995 4995
4996 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 4996 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4997 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 4997 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4998 find_zone_movable_pfns_for_nodes(); 4998 find_zone_movable_pfns_for_nodes();
4999 4999
5000 /* Print out the zone ranges */ 5000 /* Print out the zone ranges */
5001 printk("Zone ranges:\n"); 5001 printk("Zone ranges:\n");
5002 for (i = 0; i < MAX_NR_ZONES; i++) { 5002 for (i = 0; i < MAX_NR_ZONES; i++) {
5003 if (i == ZONE_MOVABLE) 5003 if (i == ZONE_MOVABLE)
5004 continue; 5004 continue;
5005 printk(KERN_CONT " %-8s ", zone_names[i]); 5005 printk(KERN_CONT " %-8s ", zone_names[i]);
5006 if (arch_zone_lowest_possible_pfn[i] == 5006 if (arch_zone_lowest_possible_pfn[i] ==
5007 arch_zone_highest_possible_pfn[i]) 5007 arch_zone_highest_possible_pfn[i])
5008 printk(KERN_CONT "empty\n"); 5008 printk(KERN_CONT "empty\n");
5009 else 5009 else
5010 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5010 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5011 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5011 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5012 (arch_zone_highest_possible_pfn[i] 5012 (arch_zone_highest_possible_pfn[i]
5013 << PAGE_SHIFT) - 1); 5013 << PAGE_SHIFT) - 1);
5014 } 5014 }
5015 5015
5016 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5016 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5017 printk("Movable zone start for each node\n"); 5017 printk("Movable zone start for each node\n");
5018 for (i = 0; i < MAX_NUMNODES; i++) { 5018 for (i = 0; i < MAX_NUMNODES; i++) {
5019 if (zone_movable_pfn[i]) 5019 if (zone_movable_pfn[i])
5020 printk(" Node %d: %#010lx\n", i, 5020 printk(" Node %d: %#010lx\n", i,
5021 zone_movable_pfn[i] << PAGE_SHIFT); 5021 zone_movable_pfn[i] << PAGE_SHIFT);
5022 } 5022 }
5023 5023
5024 /* Print out the early node map */ 5024 /* Print out the early node map */
5025 printk("Early memory node ranges\n"); 5025 printk("Early memory node ranges\n");
5026 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5026 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5027 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5027 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5028 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5028 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5029 5029
5030 /* Initialise every node */ 5030 /* Initialise every node */
5031 mminit_verify_pageflags_layout(); 5031 mminit_verify_pageflags_layout();
5032 setup_nr_node_ids(); 5032 setup_nr_node_ids();
5033 for_each_online_node(nid) { 5033 for_each_online_node(nid) {
5034 pg_data_t *pgdat = NODE_DATA(nid); 5034 pg_data_t *pgdat = NODE_DATA(nid);
5035 free_area_init_node(nid, NULL, 5035 free_area_init_node(nid, NULL,
5036 find_min_pfn_for_node(nid), NULL); 5036 find_min_pfn_for_node(nid), NULL);
5037 5037
5038 /* Any memory on that node */ 5038 /* Any memory on that node */
5039 if (pgdat->node_present_pages) 5039 if (pgdat->node_present_pages)
5040 node_set_state(nid, N_MEMORY); 5040 node_set_state(nid, N_MEMORY);
5041 check_for_memory(pgdat, nid); 5041 check_for_memory(pgdat, nid);
5042 } 5042 }
5043 } 5043 }
5044 5044
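The zone-boundary stitching at the top of free_area_init_nodes() amounts to: zone 0 starts at the lowest registered PFN, and every later zone starts where the previous one ended, clamped so its end never moves backwards. A sketch with three invented max_zone_pfn values (the ZONE_MOVABLE special-casing is omitted):

/* Sketch of the zone-boundary derivation from max_zone_pfn[];
 * the three values below are invented. */
#include <stdio.h>

#define NR 3

int main(void)
{
        unsigned long max_zone_pfn[NR] = { 0x1000, 0x100000, 0x440000 };
        unsigned long lo[NR], hi[NR];
        int i;

        lo[0] = 0;                   /* find_min_pfn_with_active_regions() */
        hi[0] = max_zone_pfn[0];
        for (i = 1; i < NR; i++) {
                lo[i] = hi[i - 1];   /* each zone starts where the last ended */
                hi[i] = max_zone_pfn[i] > lo[i] ? max_zone_pfn[i] : lo[i];
        }

        for (i = 0; i < NR; i++)
                printf("zone %d: [%#lx, %#lx)\n", i, lo[i], hi[i]);
        return 0;
}
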
5045 static int __init cmdline_parse_core(char *p, unsigned long *core) 5045 static int __init cmdline_parse_core(char *p, unsigned long *core)
5046 { 5046 {
5047 unsigned long long coremem; 5047 unsigned long long coremem;
5048 if (!p) 5048 if (!p)
5049 return -EINVAL; 5049 return -EINVAL;
5050 5050
5051 coremem = memparse(p, &p); 5051 coremem = memparse(p, &p);
5052 *core = coremem >> PAGE_SHIFT; 5052 *core = coremem >> PAGE_SHIFT;
5053 5053
5054 /* Paranoid check that UL is enough for the coremem value */ 5054 /* Paranoid check that UL is enough for the coremem value */
5055 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5055 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5056 5056
5057 return 0; 5057 return 0;
5058 } 5058 }
5059 5059
5060 /* 5060 /*
5061 * kernelcore=size sets the amount of memory for use for allocations that 5061 * kernelcore=size sets the amount of memory for use for allocations that
5062 * cannot be reclaimed or migrated. 5062 * cannot be reclaimed or migrated.
5063 */ 5063 */
5064 static int __init cmdline_parse_kernelcore(char *p) 5064 static int __init cmdline_parse_kernelcore(char *p)
5065 { 5065 {
5066 return cmdline_parse_core(p, &required_kernelcore); 5066 return cmdline_parse_core(p, &required_kernelcore);
5067 } 5067 }
5068 5068
5069 /* 5069 /*
5070 * movablecore=size sets the amount of memory for use for allocations that 5070 * movablecore=size sets the amount of memory for use for allocations that
5071 * can be reclaimed or migrated. 5071 * can be reclaimed or migrated.
5072 */ 5072 */
5073 static int __init cmdline_parse_movablecore(char *p) 5073 static int __init cmdline_parse_movablecore(char *p)
5074 { 5074 {
5075 return cmdline_parse_core(p, &required_movablecore); 5075 return cmdline_parse_core(p, &required_movablecore);
5076 } 5076 }
5077 5077
5078 early_param("kernelcore", cmdline_parse_kernelcore); 5078 early_param("kernelcore", cmdline_parse_kernelcore);
5079 early_param("movablecore", cmdline_parse_movablecore); 5079 early_param("movablecore", cmdline_parse_movablecore);
5080 5080
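For a concrete feel of what cmdline_parse_core() stores: a boot option such as kernelcore=512M is turned into bytes by memparse() and shifted down by PAGE_SHIFT. The sketch below hard-codes the byte value (memparse() is kernel-only) and assumes 4 KiB pages, giving 131072 pages.

/* What cmdline_parse_core() ends up storing for "kernelcore=512M",
 * assuming 4 KiB pages. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long long coremem = 512ULL << 20;   /* "512M" in bytes */
        unsigned long required_kernelcore =
                (unsigned long)(coremem >> PAGE_SHIFT);

        printf("required_kernelcore = %lu pages\n", required_kernelcore);
        return 0;
}
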
5081 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5081 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5082 5082
5083 /** 5083 /**
5084 * set_dma_reserve - set the specified number of pages reserved in the first zone 5084 * set_dma_reserve - set the specified number of pages reserved in the first zone
5085 * @new_dma_reserve: The number of pages to mark reserved 5085 * @new_dma_reserve: The number of pages to mark reserved
5086 * 5086 *
5087 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5087 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5088 * In the DMA zone, a significant percentage may be consumed by kernel image 5088 * In the DMA zone, a significant percentage may be consumed by kernel image
5089 * and other unfreeable allocations which can skew the watermarks badly. This 5089 * and other unfreeable allocations which can skew the watermarks badly. This
5090 * function may optionally be used to account for unfreeable pages in the 5090 * function may optionally be used to account for unfreeable pages in the
5091 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5091 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5092 * smaller per-cpu batchsize. 5092 * smaller per-cpu batchsize.
5093 */ 5093 */
5094 void __init set_dma_reserve(unsigned long new_dma_reserve) 5094 void __init set_dma_reserve(unsigned long new_dma_reserve)
5095 { 5095 {
5096 dma_reserve = new_dma_reserve; 5096 dma_reserve = new_dma_reserve;
5097 } 5097 }
5098 5098
5099 void __init free_area_init(unsigned long *zones_size) 5099 void __init free_area_init(unsigned long *zones_size)
5100 { 5100 {
5101 free_area_init_node(0, zones_size, 5101 free_area_init_node(0, zones_size,
5102 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5102 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5103 } 5103 }
5104 5104
5105 static int page_alloc_cpu_notify(struct notifier_block *self, 5105 static int page_alloc_cpu_notify(struct notifier_block *self,
5106 unsigned long action, void *hcpu) 5106 unsigned long action, void *hcpu)
5107 { 5107 {
5108 int cpu = (unsigned long)hcpu; 5108 int cpu = (unsigned long)hcpu;
5109 5109
5110 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5110 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5111 lru_add_drain_cpu(cpu); 5111 lru_add_drain_cpu(cpu);
5112 drain_pages(cpu); 5112 drain_pages(cpu);
5113 5113
5114 /* 5114 /*
5115 * Spill the event counters of the dead processor 5115 * Spill the event counters of the dead processor
5116 * into the current processors event counters. 5116 * into the current processors event counters.
5117 * This artificially elevates the count of the current 5117 * This artificially elevates the count of the current
5118 * processor. 5118 * processor.
5119 */ 5119 */
5120 vm_events_fold_cpu(cpu); 5120 vm_events_fold_cpu(cpu);
5121 5121
5122 /* 5122 /*
5123 * Zero the differential counters of the dead processor 5123 * Zero the differential counters of the dead processor
5124 * so that the vm statistics are consistent. 5124 * so that the vm statistics are consistent.
5125 * 5125 *
5126 * This is only okay since the processor is dead and cannot 5126 * This is only okay since the processor is dead and cannot
5127 * race with what we are doing. 5127 * race with what we are doing.
5128 */ 5128 */
5129 refresh_cpu_vm_stats(cpu); 5129 refresh_cpu_vm_stats(cpu);
5130 } 5130 }
5131 return NOTIFY_OK; 5131 return NOTIFY_OK;
5132 } 5132 }
5133 5133
5134 void __init page_alloc_init(void) 5134 void __init page_alloc_init(void)
5135 { 5135 {
5136 hotcpu_notifier(page_alloc_cpu_notify, 0); 5136 hotcpu_notifier(page_alloc_cpu_notify, 0);
5137 } 5137 }
5138 5138
5139 /* 5139 /*
5140 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 5140 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
5141 * or min_free_kbytes changes. 5141 * or min_free_kbytes changes.
5142 */ 5142 */
5143 static void calculate_totalreserve_pages(void) 5143 static void calculate_totalreserve_pages(void)
5144 { 5144 {
5145 struct pglist_data *pgdat; 5145 struct pglist_data *pgdat;
5146 unsigned long reserve_pages = 0; 5146 unsigned long reserve_pages = 0;
5147 enum zone_type i, j; 5147 enum zone_type i, j;
5148 5148
5149 for_each_online_pgdat(pgdat) { 5149 for_each_online_pgdat(pgdat) {
5150 for (i = 0; i < MAX_NR_ZONES; i++) { 5150 for (i = 0; i < MAX_NR_ZONES; i++) {
5151 struct zone *zone = pgdat->node_zones + i; 5151 struct zone *zone = pgdat->node_zones + i;
5152 unsigned long max = 0; 5152 unsigned long max = 0;
5153 5153
5154 /* Find valid and maximum lowmem_reserve in the zone */ 5154 /* Find valid and maximum lowmem_reserve in the zone */
5155 for (j = i; j < MAX_NR_ZONES; j++) { 5155 for (j = i; j < MAX_NR_ZONES; j++) {
5156 if (zone->lowmem_reserve[j] > max) 5156 if (zone->lowmem_reserve[j] > max)
5157 max = zone->lowmem_reserve[j]; 5157 max = zone->lowmem_reserve[j];
5158 } 5158 }
5159 5159
5160 /* we treat the high watermark as reserved pages. */ 5160 /* we treat the high watermark as reserved pages. */
5161 max += high_wmark_pages(zone); 5161 max += high_wmark_pages(zone);
5162 5162
5163 if (max > zone->present_pages) 5163 if (max > zone->present_pages)
5164 max = zone->present_pages; 5164 max = zone->present_pages;
5165 reserve_pages += max; 5165 reserve_pages += max;
5166 /* 5166 /*
5167 * Lowmem reserves are not available to 5167 * Lowmem reserves are not available to
5168 * GFP_HIGHUSER page cache allocations and 5168 * GFP_HIGHUSER page cache allocations and
5169 * kswapd tries to balance zones to their high 5169 * kswapd tries to balance zones to their high
5170 * watermark. As a result, neither should be 5170 * watermark. As a result, neither should be
5171 * regarded as dirtyable memory, to prevent a 5171 * regarded as dirtyable memory, to prevent a
5172 * situation where reclaim has to clean pages 5172 * situation where reclaim has to clean pages
5173 * in order to balance the zones. 5173 * in order to balance the zones.
5174 */ 5174 */
5175 zone->dirty_balance_reserve = max; 5175 zone->dirty_balance_reserve = max;
5176 } 5176 }
5177 } 5177 }
5178 dirty_balance_reserve = reserve_pages; 5178 dirty_balance_reserve = reserve_pages;
5179 totalreserve_pages = reserve_pages; 5179 totalreserve_pages = reserve_pages;
5180 } 5180 }
5181 5181
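Per zone, the reserve counted above is the largest lowmem_reserve[] entry plus the high watermark, clamped to present_pages; these contributions are then summed over all zones. A worked example with invented numbers:

/* One zone's contribution to totalreserve_pages; numbers are invented. */
#include <stdio.h>

int main(void)
{
        unsigned long lowmem_reserve[3] = { 0, 784, 12544 };
        unsigned long high_wmark = 3072;
        unsigned long present_pages = 200000;
        unsigned long max = 0;
        int j;

        for (j = 0; j < 3; j++)
                if (lowmem_reserve[j] > max)
                        max = lowmem_reserve[j];

        max += high_wmark;
        if (max > present_pages)
                max = present_pages;

        printf("this zone contributes %lu reserved pages\n", max);
        return 0;
}
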
5182 /* 5182 /*
5183 * setup_per_zone_lowmem_reserve - called whenever 5183 * setup_per_zone_lowmem_reserve - called whenever
5184 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 5184 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
5185 * has a correct pages reserved value, so an adequate number of 5185 * has a correct pages reserved value, so an adequate number of
5186 * pages are left in the zone after a successful __alloc_pages(). 5186 * pages are left in the zone after a successful __alloc_pages().
5187 */ 5187 */
5188 static void setup_per_zone_lowmem_reserve(void) 5188 static void setup_per_zone_lowmem_reserve(void)
5189 { 5189 {
5190 struct pglist_data *pgdat; 5190 struct pglist_data *pgdat;
5191 enum zone_type j, idx; 5191 enum zone_type j, idx;
5192 5192
5193 for_each_online_pgdat(pgdat) { 5193 for_each_online_pgdat(pgdat) {
5194 for (j = 0; j < MAX_NR_ZONES; j++) { 5194 for (j = 0; j < MAX_NR_ZONES; j++) {
5195 struct zone *zone = pgdat->node_zones + j; 5195 struct zone *zone = pgdat->node_zones + j;
5196 unsigned long present_pages = zone->present_pages; 5196 unsigned long present_pages = zone->present_pages;
5197 5197
5198 zone->lowmem_reserve[j] = 0; 5198 zone->lowmem_reserve[j] = 0;
5199 5199
5200 idx = j; 5200 idx = j;
5201 while (idx) { 5201 while (idx) {
5202 struct zone *lower_zone; 5202 struct zone *lower_zone;
5203 5203
5204 idx--; 5204 idx--;
5205 5205
5206 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5206 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5207 sysctl_lowmem_reserve_ratio[idx] = 1; 5207 sysctl_lowmem_reserve_ratio[idx] = 1;
5208 5208
5209 lower_zone = pgdat->node_zones + idx; 5209 lower_zone = pgdat->node_zones + idx;
5210 lower_zone->lowmem_reserve[j] = present_pages / 5210 lower_zone->lowmem_reserve[j] = present_pages /
5211 sysctl_lowmem_reserve_ratio[idx]; 5211 sysctl_lowmem_reserve_ratio[idx];
5212 present_pages += lower_zone->present_pages; 5212 present_pages += lower_zone->present_pages;
5213 } 5213 }
5214 } 5214 }
5215 } 5215 }
5216 5216
5217 /* update totalreserve_pages */ 5217 /* update totalreserve_pages */
5218 calculate_totalreserve_pages(); 5218 calculate_totalreserve_pages();
5219 } 5219 }
5220 5220
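The inner while loop above fills lowmem_reserve[] so that each lower zone protects present_pages/ratio pages from allocations aimed at a higher zone, with present_pages accumulating as the walk descends. The sketch below uses invented zone sizes and assumes the usual 256/256/32 default ratios:

/* Sketch of the lowmem_reserve[] calculation for three zones
 * (DMA, DMA32, NORMAL); sizes are invented, ratios are assumed defaults. */
#include <stdio.h>

#define NR 3

int main(void)
{
        unsigned long present[NR] = { 3976, 486074, 3604465 };
        int ratio[NR] = { 256, 256, 32 };
        unsigned long reserve[NR][NR] = { { 0 } };
        int j, idx;

        for (j = 0; j < NR; j++) {
                unsigned long pages = present[j];

                for (idx = j - 1; idx >= 0; idx--) {
                        reserve[idx][j] = pages / ratio[idx];
                        pages += present[idx];
                }
        }

        printf("DMA keeps %lu pages out of reach of NORMAL allocations\n",
               reserve[0][NR - 1]);
        return 0;
}
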
5221 static void __setup_per_zone_wmarks(void) 5221 static void __setup_per_zone_wmarks(void)
5222 { 5222 {
5223 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5223 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5224 unsigned long lowmem_pages = 0; 5224 unsigned long lowmem_pages = 0;
5225 struct zone *zone; 5225 struct zone *zone;
5226 unsigned long flags; 5226 unsigned long flags;
5227 5227
5228 /* Calculate total number of !ZONE_HIGHMEM pages */ 5228 /* Calculate total number of !ZONE_HIGHMEM pages */
5229 for_each_zone(zone) { 5229 for_each_zone(zone) {
5230 if (!is_highmem(zone)) 5230 if (!is_highmem(zone))
5231 lowmem_pages += zone->present_pages; 5231 lowmem_pages += zone->present_pages;
5232 } 5232 }
5233 5233
5234 for_each_zone(zone) { 5234 for_each_zone(zone) {
5235 u64 tmp; 5235 u64 tmp;
5236 5236
5237 spin_lock_irqsave(&zone->lock, flags); 5237 spin_lock_irqsave(&zone->lock, flags);
5238 tmp = (u64)pages_min * zone->present_pages; 5238 tmp = (u64)pages_min * zone->present_pages;
5239 do_div(tmp, lowmem_pages); 5239 do_div(tmp, lowmem_pages);
5240 if (is_highmem(zone)) { 5240 if (is_highmem(zone)) {
5241 /* 5241 /*
5242 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5242 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5243 * need highmem pages, so cap pages_min to a small 5243 * need highmem pages, so cap pages_min to a small
5244 * value here. 5244 * value here.
5245 * 5245 *
5246 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5246 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5247 * deltas control async page reclaim, and so should 5247 * deltas control async page reclaim, and so should
5248 * not be capped for highmem. 5248 * not be capped for highmem.
5249 */ 5249 */
5250 int min_pages; 5250 int min_pages;
5251 5251
5252 min_pages = zone->present_pages / 1024; 5252 min_pages = zone->present_pages / 1024;
5253 if (min_pages < SWAP_CLUSTER_MAX) 5253 if (min_pages < SWAP_CLUSTER_MAX)
5254 min_pages = SWAP_CLUSTER_MAX; 5254 min_pages = SWAP_CLUSTER_MAX;
5255 if (min_pages > 128) 5255 if (min_pages > 128)
5256 min_pages = 128; 5256 min_pages = 128;
5257 zone->watermark[WMARK_MIN] = min_pages; 5257 zone->watermark[WMARK_MIN] = min_pages;
5258 } else { 5258 } else {
5259 /* 5259 /*
5260 * If it's a lowmem zone, reserve a number of pages 5260 * If it's a lowmem zone, reserve a number of pages
5261 * proportionate to the zone's size. 5261 * proportionate to the zone's size.
5262 */ 5262 */
5263 zone->watermark[WMARK_MIN] = tmp; 5263 zone->watermark[WMARK_MIN] = tmp;
5264 } 5264 }
5265 5265
5266 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5266 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5267 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5267 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5268 5268
5269 setup_zone_migrate_reserve(zone); 5269 setup_zone_migrate_reserve(zone);
5270 spin_unlock_irqrestore(&zone->lock, flags); 5270 spin_unlock_irqrestore(&zone->lock, flags);
5271 } 5271 }
5272 5272
5273 /* update totalreserve_pages */ 5273 /* update totalreserve_pages */
5274 calculate_totalreserve_pages(); 5274 calculate_totalreserve_pages();
5275 } 5275 }
5276 5276
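For a lowmem zone the arithmetic above reduces to: pages_min = min_free_kbytes >> (PAGE_SHIFT - 10), tmp is that value scaled by the zone's share of total lowmem, WMARK_MIN = tmp, WMARK_LOW = tmp + tmp/4 and WMARK_HIGH = tmp + tmp/2. A worked example assuming 4 KiB pages, min_free_kbytes = 4096 and a single 4 GiB lowmem zone:

/* Worked example of the lowmem watermark arithmetic; inputs are assumed. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long min_free_kbytes = 4096;
        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
        unsigned long zone_present = 1048576;   /* this zone, in pages  */
        unsigned long lowmem_pages = 1048576;   /* all !highmem zones   */
        unsigned long long tmp;

        tmp = (unsigned long long)pages_min * zone_present / lowmem_pages;

        printf("WMARK_MIN  = %llu pages\n", tmp);
        printf("WMARK_LOW  = %llu pages\n", tmp + (tmp >> 2));
        printf("WMARK_HIGH = %llu pages\n", tmp + (tmp >> 1));
        return 0;
}
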
5277 /** 5277 /**
5278 * setup_per_zone_wmarks - called when min_free_kbytes changes 5278 * setup_per_zone_wmarks - called when min_free_kbytes changes
5279 * or when memory is hot-{added|removed} 5279 * or when memory is hot-{added|removed}
5280 * 5280 *
5281 * Ensures that the watermark[min,low,high] values for each zone are set 5281 * Ensures that the watermark[min,low,high] values for each zone are set
5282 * correctly with respect to min_free_kbytes. 5282 * correctly with respect to min_free_kbytes.
5283 */ 5283 */
5284 void setup_per_zone_wmarks(void) 5284 void setup_per_zone_wmarks(void)
5285 { 5285 {
5286 mutex_lock(&zonelists_mutex); 5286 mutex_lock(&zonelists_mutex);
5287 __setup_per_zone_wmarks(); 5287 __setup_per_zone_wmarks();
5288 mutex_unlock(&zonelists_mutex); 5288 mutex_unlock(&zonelists_mutex);
5289 } 5289 }
5290 5290
5291 /* 5291 /*
5292 * The inactive anon list should be small enough that the VM never has to 5292 * The inactive anon list should be small enough that the VM never has to
5293 * do too much work, but large enough that each inactive page has a chance 5293 * do too much work, but large enough that each inactive page has a chance
5294 * to be referenced again before it is swapped out. 5294 * to be referenced again before it is swapped out.
5295 * 5295 *
5296 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5296 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5297 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5297 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5298 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5298 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5299 * the anonymous pages are kept on the inactive list. 5299 * the anonymous pages are kept on the inactive list.
5300 * 5300 *
5301 * total target max 5301 * total target max
5302 * memory ratio inactive anon 5302 * memory ratio inactive anon
5303 * ------------------------------------- 5303 * -------------------------------------
5304 * 10MB 1 5MB 5304 * 10MB 1 5MB
5305 * 100MB 1 50MB 5305 * 100MB 1 50MB
5306 * 1GB 3 250MB 5306 * 1GB 3 250MB
5307 * 10GB 10 0.9GB 5307 * 10GB 10 0.9GB
5308 * 100GB 31 3GB 5308 * 100GB 31 3GB
5309 * 1TB 101 10GB 5309 * 1TB 101 10GB
5310 * 10TB 320 32GB 5310 * 10TB 320 32GB
5311 */ 5311 */
5312 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5312 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5313 { 5313 {
5314 unsigned int gb, ratio; 5314 unsigned int gb, ratio;
5315 5315
5316 /* Zone size in gigabytes */ 5316 /* Zone size in gigabytes */
5317 gb = zone->present_pages >> (30 - PAGE_SHIFT); 5317 gb = zone->present_pages >> (30 - PAGE_SHIFT);
5318 if (gb) 5318 if (gb)
5319 ratio = int_sqrt(10 * gb); 5319 ratio = int_sqrt(10 * gb);
5320 else 5320 else
5321 ratio = 1; 5321 ratio = 1;
5322 5322
5323 zone->inactive_ratio = ratio; 5323 zone->inactive_ratio = ratio;
5324 } 5324 }
5325 5325
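The table in the comment above can be reproduced directly from ratio = int_sqrt(10 * gb), falling back to 1 for sub-gigabyte zones. The sketch below uses a naive integer square root in place of the kernel's int_sqrt(); the zone sizes are simply the table rows.

/* Reproduces the inactive_ratio column of the table above. */
#include <stdio.h>

static unsigned long int_sqrt(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

int main(void)
{
        unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
                unsigned long ratio = int_sqrt(10 * sizes_gb[i]);

                if (!ratio)
                        ratio = 1;
                printf("%6lu GB -> inactive_ratio %lu\n", sizes_gb[i], ratio);
        }
        return 0;
}
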
5326 static void __meminit setup_per_zone_inactive_ratio(void) 5326 static void __meminit setup_per_zone_inactive_ratio(void)
5327 { 5327 {
5328 struct zone *zone; 5328 struct zone *zone;
5329 5329
5330 for_each_zone(zone) 5330 for_each_zone(zone)
5331 calculate_zone_inactive_ratio(zone); 5331 calculate_zone_inactive_ratio(zone);
5332 } 5332 }
5333 5333
5334 /* 5334 /*
5335 * Initialise min_free_kbytes. 5335 * Initialise min_free_kbytes.
5336 * 5336 *
5337 * For small machines we want it small (128k min). For large machines 5337 * For small machines we want it small (128k min). For large machines
5338 * we want it large (64MB max). But it is not linear, because network 5338 * we want it large (64MB max). But it is not linear, because network
5339 * bandwidth does not increase linearly with machine size. We use 5339 * bandwidth does not increase linearly with machine size. We use
5340 * 5340 *
5341 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5341 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5342 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5342 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5343 * 5343 *
5344 * which yields 5344 * which yields
5345 * 5345 *
5346 * 16MB: 512k 5346 * 16MB: 512k
5347 * 32MB: 724k 5347 * 32MB: 724k
5348 * 64MB: 1024k 5348 * 64MB: 1024k
5349 * 128MB: 1448k 5349 * 128MB: 1448k
5350 * 256MB: 2048k 5350 * 256MB: 2048k
5351 * 512MB: 2896k 5351 * 512MB: 2896k
5352 * 1024MB: 4096k 5352 * 1024MB: 4096k
5353 * 2048MB: 5792k 5353 * 2048MB: 5792k
5354 * 4096MB: 8192k 5354 * 4096MB: 8192k
5355 * 8192MB: 11584k 5355 * 8192MB: 11584k
5356 * 16384MB: 16384k 5356 * 16384MB: 16384k
5357 */ 5357 */
5358 int __meminit init_per_zone_wmark_min(void) 5358 int __meminit init_per_zone_wmark_min(void)
5359 { 5359 {
5360 unsigned long lowmem_kbytes; 5360 unsigned long lowmem_kbytes;
5361 5361
5362 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5362 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5363 5363
5364 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5364 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5365 if (min_free_kbytes < 128) 5365 if (min_free_kbytes < 128)
5366 min_free_kbytes = 128; 5366 min_free_kbytes = 128;
5367 if (min_free_kbytes > 65536) 5367 if (min_free_kbytes > 65536)
5368 min_free_kbytes = 65536; 5368 min_free_kbytes = 65536;
5369 setup_per_zone_wmarks(); 5369 setup_per_zone_wmarks();
5370 refresh_zone_stat_thresholds(); 5370 refresh_zone_stat_thresholds();
5371 setup_per_zone_lowmem_reserve(); 5371 setup_per_zone_lowmem_reserve();
5372 setup_per_zone_inactive_ratio(); 5372 setup_per_zone_inactive_ratio();
5373 return 0; 5373 return 0;
5374 } 5374 }
5375 module_init(init_per_zone_wmark_min) 5375 module_init(init_per_zone_wmark_min)
5376 5376
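The min_free_kbytes table in the comment above follows from int_sqrt(lowmem_kbytes * 16) clamped to [128, 65536]. The sketch below recomputes a few rows, again with a naive integer square root standing in for the kernel helper.

/* Reproduces part of the min_free_kbytes table above. */
#include <stdio.h>

static unsigned long int_sqrt(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

int main(void)
{
        unsigned long lowmem_mb[] = { 16, 32, 1024, 4096, 16384 };
        unsigned int i;

        for (i = 0; i < sizeof(lowmem_mb) / sizeof(lowmem_mb[0]); i++) {
                unsigned long lowmem_kbytes = lowmem_mb[i] * 1024;
                unsigned long min_free = int_sqrt(lowmem_kbytes * 16);

                if (min_free < 128)
                        min_free = 128;
                if (min_free > 65536)
                        min_free = 65536;
                printf("%6lu MB lowmem -> min_free_kbytes = %lu\n",
                       lowmem_mb[i], min_free);
        }
        return 0;
}
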
5377 /* 5377 /*
5378 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5378 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5379 * that we can call two helper functions whenever min_free_kbytes 5379 * that we can call two helper functions whenever min_free_kbytes
5380 * changes. 5380 * changes.
5381 */ 5381 */
5382 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5382 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5383 void __user *buffer, size_t *length, loff_t *ppos) 5383 void __user *buffer, size_t *length, loff_t *ppos)
5384 { 5384 {
5385 proc_dointvec(table, write, buffer, length, ppos); 5385 proc_dointvec(table, write, buffer, length, ppos);
5386 if (write) 5386 if (write)
5387 setup_per_zone_wmarks(); 5387 setup_per_zone_wmarks();
5388 return 0; 5388 return 0;
5389 } 5389 }
5390 5390
5391 #ifdef CONFIG_NUMA 5391 #ifdef CONFIG_NUMA
5392 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5392 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5393 void __user *buffer, size_t *length, loff_t *ppos) 5393 void __user *buffer, size_t *length, loff_t *ppos)
5394 { 5394 {
5395 struct zone *zone; 5395 struct zone *zone;
5396 int rc; 5396 int rc;
5397 5397
5398 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5398 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5399 if (rc) 5399 if (rc)
5400 return rc; 5400 return rc;
5401 5401
5402 for_each_zone(zone) 5402 for_each_zone(zone)
5403 zone->min_unmapped_pages = (zone->present_pages * 5403 zone->min_unmapped_pages = (zone->present_pages *
5404 sysctl_min_unmapped_ratio) / 100; 5404 sysctl_min_unmapped_ratio) / 100;
5405 return 0; 5405 return 0;
5406 } 5406 }
5407 5407
5408 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5408 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5409 void __user *buffer, size_t *length, loff_t *ppos) 5409 void __user *buffer, size_t *length, loff_t *ppos)
5410 { 5410 {
5411 struct zone *zone; 5411 struct zone *zone;
5412 int rc; 5412 int rc;
5413 5413
5414 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5414 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5415 if (rc) 5415 if (rc)
5416 return rc; 5416 return rc;
5417 5417
5418 for_each_zone(zone) 5418 for_each_zone(zone)
5419 zone->min_slab_pages = (zone->present_pages * 5419 zone->min_slab_pages = (zone->present_pages *
5420 sysctl_min_slab_ratio) / 100; 5420 sysctl_min_slab_ratio) / 100;
5421 return 0; 5421 return 0;
5422 } 5422 }
5423 #endif 5423 #endif
5424 5424
5425 /* 5425 /*
5426 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5426 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5427 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5427 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5428 * whenever sysctl_lowmem_reserve_ratio changes. 5428 * whenever sysctl_lowmem_reserve_ratio changes.
5429 * 5429 *
5430 * The reserve ratio obviously has absolutely no relation with the 5430 * The reserve ratio obviously has absolutely no relation with the
5431 * minimum watermarks. The lowmem reserve ratio can only make sense 5431 * minimum watermarks. The lowmem reserve ratio can only make sense
5432 * as a function of the boot time zone sizes. 5432 * as a function of the boot time zone sizes.
5433 */ 5433 */
5434 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5434 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5435 void __user *buffer, size_t *length, loff_t *ppos) 5435 void __user *buffer, size_t *length, loff_t *ppos)
5436 { 5436 {
5437 proc_dointvec_minmax(table, write, buffer, length, ppos); 5437 proc_dointvec_minmax(table, write, buffer, length, ppos);
5438 setup_per_zone_lowmem_reserve(); 5438 setup_per_zone_lowmem_reserve();
5439 return 0; 5439 return 0;
5440 } 5440 }
5441 5441
5442 /* 5442 /*
5443 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5443 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5444 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist 5444 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist
5445 * can have before it gets flushed back to the buddy allocator. 5445 * can have before it gets flushed back to the buddy allocator.
5446 */ 5446 */
5447 5447
5448 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5448 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5449 void __user *buffer, size_t *length, loff_t *ppos) 5449 void __user *buffer, size_t *length, loff_t *ppos)
5450 { 5450 {
5451 struct zone *zone; 5451 struct zone *zone;
5452 unsigned int cpu; 5452 unsigned int cpu;
5453 int ret; 5453 int ret;
5454 5454
5455 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5455 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5456 if (!write || (ret < 0)) 5456 if (!write || (ret < 0))
5457 return ret; 5457 return ret;
5458 for_each_populated_zone(zone) { 5458 for_each_populated_zone(zone) {
5459 for_each_possible_cpu(cpu) { 5459 for_each_possible_cpu(cpu) {
5460 unsigned long high; 5460 unsigned long high;
5461 high = zone->present_pages / percpu_pagelist_fraction; 5461 high = zone->present_pages / percpu_pagelist_fraction;
5462 setup_pagelist_highmark( 5462 setup_pagelist_highmark(
5463 per_cpu_ptr(zone->pageset, cpu), high); 5463 per_cpu_ptr(zone->pageset, cpu), high);
5464 } 5464 }
5465 } 5465 }
5466 return 0; 5466 return 0;
5467 } 5467 }
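To make the handler above concrete: with percpu_pagelist_fraction set to, say, 8 and a zone of 262144 present pages (1 GB with 4 KB pages), each CPU's pcp->high becomes 262144 / 8 = 32768 pages. The fraction and zone size here are arbitrary illustration values, not defaults.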
5468 5468
5469 int hashdist = HASHDIST_DEFAULT; 5469 int hashdist = HASHDIST_DEFAULT;
5470 5470
5471 #ifdef CONFIG_NUMA 5471 #ifdef CONFIG_NUMA
5472 static int __init set_hashdist(char *str) 5472 static int __init set_hashdist(char *str)
5473 { 5473 {
5474 if (!str) 5474 if (!str)
5475 return 0; 5475 return 0;
5476 hashdist = simple_strtoul(str, &str, 0); 5476 hashdist = simple_strtoul(str, &str, 0);
5477 return 1; 5477 return 1;
5478 } 5478 }
5479 __setup("hashdist=", set_hashdist); 5479 __setup("hashdist=", set_hashdist);
5480 #endif 5480 #endif
5481 5481
5482 /* 5482 /*
5483 * allocate a large system hash table from bootmem 5483 * allocate a large system hash table from bootmem
5484 * - it is assumed that the hash table must contain an exact power-of-2 5484 * - it is assumed that the hash table must contain an exact power-of-2
5485 * quantity of entries 5485 * quantity of entries
5486 * - limit is the number of hash buckets, not the total allocation size 5486 * - limit is the number of hash buckets, not the total allocation size
5487 */ 5487 */
5488 void *__init alloc_large_system_hash(const char *tablename, 5488 void *__init alloc_large_system_hash(const char *tablename,
5489 unsigned long bucketsize, 5489 unsigned long bucketsize,
5490 unsigned long numentries, 5490 unsigned long numentries,
5491 int scale, 5491 int scale,
5492 int flags, 5492 int flags,
5493 unsigned int *_hash_shift, 5493 unsigned int *_hash_shift,
5494 unsigned int *_hash_mask, 5494 unsigned int *_hash_mask,
5495 unsigned long low_limit, 5495 unsigned long low_limit,
5496 unsigned long high_limit) 5496 unsigned long high_limit)
5497 { 5497 {
5498 unsigned long long max = high_limit; 5498 unsigned long long max = high_limit;
5499 unsigned long log2qty, size; 5499 unsigned long log2qty, size;
5500 void *table = NULL; 5500 void *table = NULL;
5501 5501
5502 /* allow the kernel cmdline to have a say */ 5502 /* allow the kernel cmdline to have a say */
5503 if (!numentries) { 5503 if (!numentries) {
5504 /* round applicable memory size up to nearest megabyte */ 5504 /* round applicable memory size up to nearest megabyte */
5505 numentries = nr_kernel_pages; 5505 numentries = nr_kernel_pages;
5506 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 5506 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
5507 numentries >>= 20 - PAGE_SHIFT; 5507 numentries >>= 20 - PAGE_SHIFT;
5508 numentries <<= 20 - PAGE_SHIFT; 5508 numentries <<= 20 - PAGE_SHIFT;
5509 5509
5510 /* limit to 1 bucket per 2^scale bytes of low memory */ 5510 /* limit to 1 bucket per 2^scale bytes of low memory */
5511 if (scale > PAGE_SHIFT) 5511 if (scale > PAGE_SHIFT)
5512 numentries >>= (scale - PAGE_SHIFT); 5512 numentries >>= (scale - PAGE_SHIFT);
5513 else 5513 else
5514 numentries <<= (PAGE_SHIFT - scale); 5514 numentries <<= (PAGE_SHIFT - scale);
5515 5515
5516 /* Make sure we've got at least a 0-order allocation.. */ 5516 /* Make sure we've got at least a 0-order allocation.. */
5517 if (unlikely(flags & HASH_SMALL)) { 5517 if (unlikely(flags & HASH_SMALL)) {
5518 /* Makes no sense without HASH_EARLY */ 5518 /* Makes no sense without HASH_EARLY */
5519 WARN_ON(!(flags & HASH_EARLY)); 5519 WARN_ON(!(flags & HASH_EARLY));
5520 if (!(numentries >> *_hash_shift)) { 5520 if (!(numentries >> *_hash_shift)) {
5521 numentries = 1UL << *_hash_shift; 5521 numentries = 1UL << *_hash_shift;
5522 BUG_ON(!numentries); 5522 BUG_ON(!numentries);
5523 } 5523 }
5524 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5524 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5525 numentries = PAGE_SIZE / bucketsize; 5525 numentries = PAGE_SIZE / bucketsize;
5526 } 5526 }
5527 numentries = roundup_pow_of_two(numentries); 5527 numentries = roundup_pow_of_two(numentries);
5528 5528
5529 /* limit allocation size to 1/16 total memory by default */ 5529 /* limit allocation size to 1/16 total memory by default */
5530 if (max == 0) { 5530 if (max == 0) {
5531 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5531 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5532 do_div(max, bucketsize); 5532 do_div(max, bucketsize);
5533 } 5533 }
5534 max = min(max, 0x80000000ULL); 5534 max = min(max, 0x80000000ULL);
5535 5535
5536 if (numentries < low_limit) 5536 if (numentries < low_limit)
5537 numentries = low_limit; 5537 numentries = low_limit;
5538 if (numentries > max) 5538 if (numentries > max)
5539 numentries = max; 5539 numentries = max;
5540 5540
5541 log2qty = ilog2(numentries); 5541 log2qty = ilog2(numentries);
5542 5542
5543 do { 5543 do {
5544 size = bucketsize << log2qty; 5544 size = bucketsize << log2qty;
5545 if (flags & HASH_EARLY) 5545 if (flags & HASH_EARLY)
5546 table = alloc_bootmem_nopanic(size); 5546 table = alloc_bootmem_nopanic(size);
5547 else if (hashdist) 5547 else if (hashdist)
5548 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5548 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5549 else { 5549 else {
5550 /* 5550 /*
5551 * If bucketsize is not a power-of-two, we may free 5551 * If bucketsize is not a power-of-two, we may free
5552 * some pages at the end of the hash table, which 5552 * some pages at the end of the hash table, which
5553 * alloc_pages_exact() does automatically 5553 * alloc_pages_exact() does automatically
5554 */ 5554 */
5555 if (get_order(size) < MAX_ORDER) { 5555 if (get_order(size) < MAX_ORDER) {
5556 table = alloc_pages_exact(size, GFP_ATOMIC); 5556 table = alloc_pages_exact(size, GFP_ATOMIC);
5557 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5557 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5558 } 5558 }
5559 } 5559 }
5560 } while (!table && size > PAGE_SIZE && --log2qty); 5560 } while (!table && size > PAGE_SIZE && --log2qty);
5561 5561
5562 if (!table) 5562 if (!table)
5563 panic("Failed to allocate %s hash table\n", tablename); 5563 panic("Failed to allocate %s hash table\n", tablename);
5564 5564
5565 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5565 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5566 tablename, 5566 tablename,
5567 (1UL << log2qty), 5567 (1UL << log2qty),
5568 ilog2(size) - PAGE_SHIFT, 5568 ilog2(size) - PAGE_SHIFT,
5569 size); 5569 size);
5570 5570
5571 if (_hash_shift) 5571 if (_hash_shift)
5572 *_hash_shift = log2qty; 5572 *_hash_shift = log2qty;
5573 if (_hash_mask) 5573 if (_hash_mask)
5574 *_hash_mask = (1 << log2qty) - 1; 5574 *_hash_mask = (1 << log2qty) - 1;
5575 5575
5576 return table; 5576 return table;
5577 } 5577 }
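For context, a hypothetical caller sketch showing how the returned table and the _hash_shift/_hash_mask outputs are typically consumed; the names my_table, my_shift, my_mask and the chosen scale are illustrative assumptions, not taken from this file, and the snippet only builds inside a kernel tree:

#include <linux/bootmem.h>
#include <linux/hash.h>
#include <linux/list.h>

static struct hlist_head *my_table;
static unsigned int my_shift;
static unsigned long my_mask;

static void __init my_table_init(void)
{
        my_table = alloc_large_system_hash("my-cache",
                                           sizeof(struct hlist_head),
                                           0,           /* size table from nr_kernel_pages */
                                           14,          /* one bucket per 16 KB of low memory */
                                           0,           /* no HASH_EARLY / HASH_SMALL flags */
                                           &my_shift,
                                           &my_mask,
                                           0, 0);       /* no explicit low/high limits */
}

static inline struct hlist_head *my_bucket(unsigned long key)
{
        /* my_mask is (1 << my_shift) - 1, so either form selects a bucket */
        return &my_table[hash_long(key, my_shift) & my_mask];
}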
5578 5578
5579 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5579 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5580 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5580 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5581 unsigned long pfn) 5581 unsigned long pfn)
5582 { 5582 {
5583 #ifdef CONFIG_SPARSEMEM 5583 #ifdef CONFIG_SPARSEMEM
5584 return __pfn_to_section(pfn)->pageblock_flags; 5584 return __pfn_to_section(pfn)->pageblock_flags;
5585 #else 5585 #else
5586 return zone->pageblock_flags; 5586 return zone->pageblock_flags;
5587 #endif /* CONFIG_SPARSEMEM */ 5587 #endif /* CONFIG_SPARSEMEM */
5588 } 5588 }
5589 5589
5590 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5590 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5591 { 5591 {
5592 #ifdef CONFIG_SPARSEMEM 5592 #ifdef CONFIG_SPARSEMEM
5593 pfn &= (PAGES_PER_SECTION-1); 5593 pfn &= (PAGES_PER_SECTION-1);
5594 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5594 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5595 #else 5595 #else
5596 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 5596 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5597 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5597 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5598 #endif /* CONFIG_SPARSEMEM */ 5598 #endif /* CONFIG_SPARSEMEM */
5599 } 5599 }
5600 5600
5601 /** 5601 /**
5602 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 5602 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
5603 * @page: The page within the block of interest 5603 * @page: The page within the block of interest
5604 * @start_bitidx: The first bit of interest to retrieve 5604 * @start_bitidx: The first bit of interest to retrieve
5605 * @end_bitidx: The last bit of interest 5605 * @end_bitidx: The last bit of interest
5606 * returns pageblock_bits flags 5606 * returns pageblock_bits flags
5607 */ 5607 */
5608 unsigned long get_pageblock_flags_group(struct page *page, 5608 unsigned long get_pageblock_flags_group(struct page *page,
5609 int start_bitidx, int end_bitidx) 5609 int start_bitidx, int end_bitidx)
5610 { 5610 {
5611 struct zone *zone; 5611 struct zone *zone;
5612 unsigned long *bitmap; 5612 unsigned long *bitmap;
5613 unsigned long pfn, bitidx; 5613 unsigned long pfn, bitidx;
5614 unsigned long flags = 0; 5614 unsigned long flags = 0;
5615 unsigned long value = 1; 5615 unsigned long value = 1;
5616 5616
5617 zone = page_zone(page); 5617 zone = page_zone(page);
5618 pfn = page_to_pfn(page); 5618 pfn = page_to_pfn(page);
5619 bitmap = get_pageblock_bitmap(zone, pfn); 5619 bitmap = get_pageblock_bitmap(zone, pfn);
5620 bitidx = pfn_to_bitidx(zone, pfn); 5620 bitidx = pfn_to_bitidx(zone, pfn);
5621 5621
5622 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5622 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5623 if (test_bit(bitidx + start_bitidx, bitmap)) 5623 if (test_bit(bitidx + start_bitidx, bitmap))
5624 flags |= value; 5624 flags |= value;
5625 5625
5626 return flags; 5626 return flags;
5627 } 5627 }
5628 5628
5629 /** 5629 /**
5630 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 5630 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
5631 * @page: The page within the block of interest 5631 * @page: The page within the block of interest
5632 * @start_bitidx: The first bit of interest 5632 * @start_bitidx: The first bit of interest
5633 * @end_bitidx: The last bit of interest 5633 * @end_bitidx: The last bit of interest
5634 * @flags: The flags to set 5634 * @flags: The flags to set
5635 */ 5635 */
5636 void set_pageblock_flags_group(struct page *page, unsigned long flags, 5636 void set_pageblock_flags_group(struct page *page, unsigned long flags,
5637 int start_bitidx, int end_bitidx) 5637 int start_bitidx, int end_bitidx)
5638 { 5638 {
5639 struct zone *zone; 5639 struct zone *zone;
5640 unsigned long *bitmap; 5640 unsigned long *bitmap;
5641 unsigned long pfn, bitidx; 5641 unsigned long pfn, bitidx;
5642 unsigned long value = 1; 5642 unsigned long value = 1;
5643 5643
5644 zone = page_zone(page); 5644 zone = page_zone(page);
5645 pfn = page_to_pfn(page); 5645 pfn = page_to_pfn(page);
5646 bitmap = get_pageblock_bitmap(zone, pfn); 5646 bitmap = get_pageblock_bitmap(zone, pfn);
5647 bitidx = pfn_to_bitidx(zone, pfn); 5647 bitidx = pfn_to_bitidx(zone, pfn);
5648 VM_BUG_ON(pfn < zone->zone_start_pfn); 5648 VM_BUG_ON(pfn < zone->zone_start_pfn);
5649 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); 5649 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5650 5650
5651 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5651 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5652 if (flags & value) 5652 if (flags & value)
5653 __set_bit(bitidx + start_bitidx, bitmap); 5653 __set_bit(bitidx + start_bitidx, bitmap);
5654 else 5654 else
5655 __clear_bit(bitidx + start_bitidx, bitmap); 5655 __clear_bit(bitidx + start_bitidx, bitmap);
5656 } 5656 }
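The two helpers above pack a small group of flag bits per pageblock into a shared bitmap. A standalone user-space sketch of the same get/set loops follows; the 4-bit-per-pageblock budget (NR_PB_BITS) is assumed purely for illustration:

#include <stdio.h>

#define NR_PB_BITS 4                    /* assumed bits reserved per pageblock */

static unsigned long bits;              /* stand-in for a pageblock bitmap word */

/* mirrors the loop in set_pageblock_flags_group() */
static void set_group(unsigned long bitidx, unsigned long flags, int start, int end)
{
        unsigned long value = 1;

        for (; start <= end; start++, value <<= 1) {
                if (flags & value)
                        bits |= 1UL << (bitidx + start);
                else
                        bits &= ~(1UL << (bitidx + start));
        }
}

/* mirrors the loop in get_pageblock_flags_group() */
static unsigned long get_group(unsigned long bitidx, int start, int end)
{
        unsigned long flags = 0, value = 1;

        for (; start <= end; start++, value <<= 1)
                if (bits & (1UL << (bitidx + start)))
                        flags |= value;
        return flags;
}

int main(void)
{
        set_group(2 * NR_PB_BITS, 5, 0, 2);     /* store 5 in bits 0..2 of the third group */
        printf("%lu\n", get_group(2 * NR_PB_BITS, 0, 2));       /* prints 5 */
        return 0;
}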
5657 5657
5658 /* 5658 /*
5659 * This function checks whether the pageblock includes unmovable pages or not. 5659 * This function checks whether the pageblock includes unmovable pages or not.
5660 * If @count is not zero, it is okay for the range to include up to @count unmovable pages. 5660 * If @count is not zero, it is okay for the range to include up to @count unmovable pages.
5661 * 5661 *
5662 * A PageLRU check without isolation or lru_lock could race, so a 5662 * A PageLRU check without isolation or lru_lock could race, so a
5663 * MIGRATE_MOVABLE block might include unmovable pages. This means you can't 5663 * MIGRATE_MOVABLE block might include unmovable pages. This means you can't
5664 * expect this function to be exact. 5664 * expect this function to be exact.
5665 */ 5665 */
5666 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 5666 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5667 bool skip_hwpoisoned_pages) 5667 bool skip_hwpoisoned_pages)
5668 { 5668 {
5669 unsigned long pfn, iter, found; 5669 unsigned long pfn, iter, found;
5670 int mt; 5670 int mt;
5671 5671
5672 /* 5672 /*
5673 * To avoid noisy data, lru_add_drain_all() should be called first. 5673 * To avoid noisy data, lru_add_drain_all() should be called first.
5674 * A ZONE_MOVABLE zone never contains unmovable pages. 5674 * A ZONE_MOVABLE zone never contains unmovable pages.
5675 */ 5675 */
5676 if (zone_idx(zone) == ZONE_MOVABLE) 5676 if (zone_idx(zone) == ZONE_MOVABLE)
5677 return false; 5677 return false;
5678 mt = get_pageblock_migratetype(page); 5678 mt = get_pageblock_migratetype(page);
5679 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5679 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5680 return false; 5680 return false;
5681 5681
5682 pfn = page_to_pfn(page); 5682 pfn = page_to_pfn(page);
5683 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5683 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5684 unsigned long check = pfn + iter; 5684 unsigned long check = pfn + iter;
5685 5685
5686 if (!pfn_valid_within(check)) 5686 if (!pfn_valid_within(check))
5687 continue; 5687 continue;
5688 5688
5689 page = pfn_to_page(check); 5689 page = pfn_to_page(check);
5690 /* 5690 /*
5691 * We can't use page_count without pinning the page 5691 * We can't use page_count without pinning the page
5692 * because another CPU can free the compound page. 5692 * because another CPU can free the compound page.
5693 * This check already skips compound tails of THP 5693 * This check already skips compound tails of THP
5694 * because their page->_count is zero at all times. 5694 * because their page->_count is zero at all times.
5695 */ 5695 */
5696 if (!atomic_read(&page->_count)) { 5696 if (!atomic_read(&page->_count)) {
5697 if (PageBuddy(page)) 5697 if (PageBuddy(page))
5698 iter += (1 << page_order(page)) - 1; 5698 iter += (1 << page_order(page)) - 1;
5699 continue; 5699 continue;
5700 } 5700 }
5701 5701
5702 /* 5702 /*
5703 * The HWPoisoned page may not be in the buddy system, and 5703 * The HWPoisoned page may not be in the buddy system, and
5704 * page_count() is not 0. 5704 * page_count() is not 0.
5705 */ 5705 */
5706 if (skip_hwpoisoned_pages && PageHWPoison(page)) 5706 if (skip_hwpoisoned_pages && PageHWPoison(page))
5707 continue; 5707 continue;
5708 5708
5709 if (!PageLRU(page)) 5709 if (!PageLRU(page))
5710 found++; 5710 found++;
5711 /* 5711 /*
5712 * If there are RECLAIMABLE pages, we need to check them. 5712 * If there are RECLAIMABLE pages, we need to check them.
5713 * But for now, memory offline itself doesn't call shrink_slab() 5713 * But for now, memory offline itself doesn't call shrink_slab()
5714 * and this still needs to be fixed. 5714 * and this still needs to be fixed.
5715 */ 5715 */
5716 /* 5716 /*
5717 * If the page is not RAM, page_count() should be 0, 5717 * If the page is not RAM, page_count() should be 0,
5718 * and we don't need further checks: this is a _used_, non-movable page. 5718 * and we don't need further checks: this is a _used_, non-movable page.
5719 * 5719 *
5720 * The problematic thing here is PG_reserved pages. PG_reserved 5720 * The problematic thing here is PG_reserved pages. PG_reserved
5721 * is set on both memory hole pages and _used_ kernel 5721 * is set on both memory hole pages and _used_ kernel
5722 * pages at boot. 5722 * pages at boot.
5723 */ 5723 */
5724 if (found > count) 5724 if (found > count)
5725 return true; 5725 return true;
5726 } 5726 }
5727 return false; 5727 return false;
5728 } 5728 }
5729 5729
5730 bool is_pageblock_removable_nolock(struct page *page) 5730 bool is_pageblock_removable_nolock(struct page *page)
5731 { 5731 {
5732 struct zone *zone; 5732 struct zone *zone;
5733 unsigned long pfn; 5733 unsigned long pfn;
5734 5734
5735 /* 5735 /*
5736 * We have to be careful here because we are iterating over memory 5736 * We have to be careful here because we are iterating over memory
5737 * sections which are not zone aware so we might end up outside of 5737 * sections which are not zone aware so we might end up outside of
5738 * the zone but still within the section. 5738 * the zone but still within the section.
5739 * We also have to take care with the node: if the node is offline 5739 * We also have to take care with the node: if the node is offline
5740 * its NODE_DATA will be NULL - see page_zone. 5740 * its NODE_DATA will be NULL - see page_zone.
5741 */ 5741 */
5742 if (!node_online(page_to_nid(page))) 5742 if (!node_online(page_to_nid(page)))
5743 return false; 5743 return false;
5744 5744
5745 zone = page_zone(page); 5745 zone = page_zone(page);
5746 pfn = page_to_pfn(page); 5746 pfn = page_to_pfn(page);
5747 if (zone->zone_start_pfn > pfn || 5747 if (zone->zone_start_pfn > pfn ||
5748 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5748 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5749 return false; 5749 return false;
5750 5750
5751 return !has_unmovable_pages(zone, page, 0, true); 5751 return !has_unmovable_pages(zone, page, 0, true);
5752 } 5752 }
5753 5753
5754 #ifdef CONFIG_CMA 5754 #ifdef CONFIG_CMA
5755 5755
5756 static unsigned long pfn_max_align_down(unsigned long pfn) 5756 static unsigned long pfn_max_align_down(unsigned long pfn)
5757 { 5757 {
5758 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 5758 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
5759 pageblock_nr_pages) - 1); 5759 pageblock_nr_pages) - 1);
5760 } 5760 }
5761 5761
5762 static unsigned long pfn_max_align_up(unsigned long pfn) 5762 static unsigned long pfn_max_align_up(unsigned long pfn)
5763 { 5763 {
5764 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 5764 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
5765 pageblock_nr_pages)); 5765 pageblock_nr_pages));
5766 } 5766 }
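For concreteness, assuming MAX_ORDER_NR_PAGES = 1024 and pageblock_nr_pages = 512 (illustrative values, not guaranteed by this file), the mask used by the two helpers above is 1024 - 1, so pfn_max_align_down(0x12345) yields 0x12000 and pfn_max_align_up(0x12345) yields 0x12400.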
5767 5767
5768 /* [start, end) must belong to a single zone. */ 5768 /* [start, end) must belong to a single zone. */
5769 static int __alloc_contig_migrate_range(struct compact_control *cc, 5769 static int __alloc_contig_migrate_range(struct compact_control *cc,
5770 unsigned long start, unsigned long end) 5770 unsigned long start, unsigned long end)
5771 { 5771 {
5772 /* This function is based on compact_zone() from compaction.c. */ 5772 /* This function is based on compact_zone() from compaction.c. */
5773 unsigned long nr_reclaimed; 5773 unsigned long nr_reclaimed;
5774 unsigned long pfn = start; 5774 unsigned long pfn = start;
5775 unsigned int tries = 0; 5775 unsigned int tries = 0;
5776 int ret = 0; 5776 int ret = 0;
5777 5777
5778 migrate_prep(); 5778 migrate_prep();
5779 5779
5780 while (pfn < end || !list_empty(&cc->migratepages)) { 5780 while (pfn < end || !list_empty(&cc->migratepages)) {
5781 if (fatal_signal_pending(current)) { 5781 if (fatal_signal_pending(current)) {
5782 ret = -EINTR; 5782 ret = -EINTR;
5783 break; 5783 break;
5784 } 5784 }
5785 5785
5786 if (list_empty(&cc->migratepages)) { 5786 if (list_empty(&cc->migratepages)) {
5787 cc->nr_migratepages = 0; 5787 cc->nr_migratepages = 0;
5788 pfn = isolate_migratepages_range(cc->zone, cc, 5788 pfn = isolate_migratepages_range(cc->zone, cc,
5789 pfn, end, true); 5789 pfn, end, true);
5790 if (!pfn) { 5790 if (!pfn) {
5791 ret = -EINTR; 5791 ret = -EINTR;
5792 break; 5792 break;
5793 } 5793 }
5794 tries = 0; 5794 tries = 0;
5795 } else if (++tries == 5) { 5795 } else if (++tries == 5) {
5796 ret = ret < 0 ? ret : -EBUSY; 5796 ret = ret < 0 ? ret : -EBUSY;
5797 break; 5797 break;
5798 } 5798 }
5799 5799
5800 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 5800 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
5801 &cc->migratepages); 5801 &cc->migratepages);
5802 cc->nr_migratepages -= nr_reclaimed; 5802 cc->nr_migratepages -= nr_reclaimed;
5803 5803
5804 ret = migrate_pages(&cc->migratepages, 5804 ret = migrate_pages(&cc->migratepages,
5805 alloc_migrate_target, 5805 alloc_migrate_target,
5806 0, false, MIGRATE_SYNC, 5806 0, false, MIGRATE_SYNC,
5807 MR_CMA); 5807 MR_CMA);
5808 } 5808 }
5809 5809 if (ret < 0) {
5810 putback_movable_pages(&cc->migratepages); 5810 putback_movable_pages(&cc->migratepages);
5811 return ret > 0 ? 0 : ret; 5811 return ret;
5812 }
5813 return 0;
5812 } 5814 }
5813 5815
5814 /** 5816 /**
5815 * alloc_contig_range() -- tries to allocate given range of pages 5817 * alloc_contig_range() -- tries to allocate given range of pages
5816 * @start: start PFN to allocate 5818 * @start: start PFN to allocate
5817 * @end: one-past-the-last PFN to allocate 5819 * @end: one-past-the-last PFN to allocate
5818 * @migratetype: migratetype of the underlying pageblocks (either 5820 * @migratetype: migratetype of the underlying pageblocks (either
5819 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 5821 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
5820 * in range must have the same migratetype and it must 5822 * in range must have the same migratetype and it must
5821 * be either of the two. 5823 * be either of the two.
5822 * 5824 *
5823 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 5825 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
5824 * aligned, however it's the caller's responsibility to guarantee that 5826 * aligned, however it's the caller's responsibility to guarantee that
5825 * we are the only thread that changes migrate type of pageblocks the 5827 * we are the only thread that changes migrate type of pageblocks the
5826 * pages fall in. 5828 * pages fall in.
5827 * 5829 *
5828 * The PFN range must belong to a single zone. 5830 * The PFN range must belong to a single zone.
5829 * 5831 *
5830 * Returns zero on success or negative error code. On success all 5832 * Returns zero on success or negative error code. On success all
5831 * pages which PFN is in [start, end) are allocated for the caller and 5833 * pages which PFN is in [start, end) are allocated for the caller and
5832 * need to be freed with free_contig_range(). 5834 * need to be freed with free_contig_range().
5833 */ 5835 */
5834 int alloc_contig_range(unsigned long start, unsigned long end, 5836 int alloc_contig_range(unsigned long start, unsigned long end,
5835 unsigned migratetype) 5837 unsigned migratetype)
5836 { 5838 {
5837 unsigned long outer_start, outer_end; 5839 unsigned long outer_start, outer_end;
5838 int ret = 0, order; 5840 int ret = 0, order;
5839 5841
5840 struct compact_control cc = { 5842 struct compact_control cc = {
5841 .nr_migratepages = 0, 5843 .nr_migratepages = 0,
5842 .order = -1, 5844 .order = -1,
5843 .zone = page_zone(pfn_to_page(start)), 5845 .zone = page_zone(pfn_to_page(start)),
5844 .sync = true, 5846 .sync = true,
5845 .ignore_skip_hint = true, 5847 .ignore_skip_hint = true,
5846 }; 5848 };
5847 INIT_LIST_HEAD(&cc.migratepages); 5849 INIT_LIST_HEAD(&cc.migratepages);
5848 5850
5849 /* 5851 /*
5850 * What we do here is we mark all pageblocks in range as 5852 * What we do here is we mark all pageblocks in range as
5851 * MIGRATE_ISOLATE. Because pageblock and max order pages may 5853 * MIGRATE_ISOLATE. Because pageblock and max order pages may
5852 * have different sizes, and due to the way the page allocator 5854 * have different sizes, and due to the way the page allocator
5853 * works, we align the range to the bigger of the two sizes so 5855 * works, we align the range to the bigger of the two sizes so
5854 * that the page allocator won't try to merge buddies from 5856 * that the page allocator won't try to merge buddies from
5855 * different pageblocks and change MIGRATE_ISOLATE to some 5857 * different pageblocks and change MIGRATE_ISOLATE to some
5856 * other migration type. 5858 * other migration type.
5857 * 5859 *
5858 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 5860 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
5859 * migrate the pages from an unaligned range (i.e. the pages that 5861 * migrate the pages from an unaligned range (i.e. the pages that
5860 * we are interested in). This will put all the pages in the 5862 * we are interested in). This will put all the pages in the
5861 * range back to the page allocator as MIGRATE_ISOLATE. 5863 * range back to the page allocator as MIGRATE_ISOLATE.
5862 * 5864 *
5863 * When this is done, we take the pages in the range from the page 5865 * When this is done, we take the pages in the range from the page
5864 * allocator, removing them from the buddy system. This way the 5866 * allocator, removing them from the buddy system. This way the
5865 * page allocator will never consider using them. 5867 * page allocator will never consider using them.
5866 * 5868 *
5867 * This lets us mark the pageblocks back as 5869 * This lets us mark the pageblocks back as
5868 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 5870 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
5869 * aligned range but not in the unaligned, original range are 5871 * aligned range but not in the unaligned, original range are
5870 * put back to the page allocator so that the buddy allocator can use them. 5872 * put back to the page allocator so that the buddy allocator can use them.
5871 */ 5873 */
5872 5874
5873 ret = start_isolate_page_range(pfn_max_align_down(start), 5875 ret = start_isolate_page_range(pfn_max_align_down(start),
5874 pfn_max_align_up(end), migratetype, 5876 pfn_max_align_up(end), migratetype,
5875 false); 5877 false);
5876 if (ret) 5878 if (ret)
5877 return ret; 5879 return ret;
5878 5880
5879 ret = __alloc_contig_migrate_range(&cc, start, end); 5881 ret = __alloc_contig_migrate_range(&cc, start, end);
5880 if (ret) 5882 if (ret)
5881 goto done; 5883 goto done;
5882 5884
5883 /* 5885 /*
5884 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 5886 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
5885 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 5887 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
5886 * more, all pages in [start, end) are free in page allocator. 5888 * more, all pages in [start, end) are free in page allocator.
5887 * What we are going to do is to allocate all pages from 5889 * What we are going to do is to allocate all pages from
5888 * [start, end) (that is remove them from page allocator). 5890 * [start, end) (that is remove them from page allocator).
5889 * 5891 *
5890 * The only problem is that pages at the beginning and at the 5892 * The only problem is that pages at the beginning and at the
5891 * end of the interesting range may not be aligned with pages that 5893 * end of the interesting range may not be aligned with pages that
5892 * the page allocator holds, i.e. they can be part of higher order 5894 * the page allocator holds, i.e. they can be part of higher order
5893 * pages. Because of this, we reserve the bigger range and 5895 * pages. Because of this, we reserve the bigger range and
5894 * once this is done free the pages we are not interested in. 5896 * once this is done free the pages we are not interested in.
5895 * 5897 *
5896 * We don't have to hold zone->lock here because the pages are 5898 * We don't have to hold zone->lock here because the pages are
5897 * isolated thus they won't get removed from buddy. 5899 * isolated thus they won't get removed from buddy.
5898 */ 5900 */
5899 5901
5900 lru_add_drain_all(); 5902 lru_add_drain_all();
5901 drain_all_pages(); 5903 drain_all_pages();
5902 5904
5903 order = 0; 5905 order = 0;
5904 outer_start = start; 5906 outer_start = start;
5905 while (!PageBuddy(pfn_to_page(outer_start))) { 5907 while (!PageBuddy(pfn_to_page(outer_start))) {
5906 if (++order >= MAX_ORDER) { 5908 if (++order >= MAX_ORDER) {
5907 ret = -EBUSY; 5909 ret = -EBUSY;
5908 goto done; 5910 goto done;
5909 } 5911 }
5910 outer_start &= ~0UL << order; 5912 outer_start &= ~0UL << order;
5911 } 5913 }
5912 5914
5913 /* Make sure the range is really isolated. */ 5915 /* Make sure the range is really isolated. */
5914 if (test_pages_isolated(outer_start, end, false)) { 5916 if (test_pages_isolated(outer_start, end, false)) {
5915 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5917 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5916 outer_start, end); 5918 outer_start, end);
5917 ret = -EBUSY; 5919 ret = -EBUSY;
5918 goto done; 5920 goto done;
5919 } 5921 }
5920 5922
5921 5923
5922 /* Grab isolated pages from freelists. */ 5924 /* Grab isolated pages from freelists. */
5923 outer_end = isolate_freepages_range(&cc, outer_start, end); 5925 outer_end = isolate_freepages_range(&cc, outer_start, end);
5924 if (!outer_end) { 5926 if (!outer_end) {
5925 ret = -EBUSY; 5927 ret = -EBUSY;
5926 goto done; 5928 goto done;
5927 } 5929 }
5928 5930
5929 /* Free head and tail (if any) */ 5931 /* Free head and tail (if any) */
5930 if (start != outer_start) 5932 if (start != outer_start)
5931 free_contig_range(outer_start, start - outer_start); 5933 free_contig_range(outer_start, start - outer_start);
5932 if (end != outer_end) 5934 if (end != outer_end)
5933 free_contig_range(end, outer_end - end); 5935 free_contig_range(end, outer_end - end);
5934 5936
5935 done: 5937 done:
5936 undo_isolate_page_range(pfn_max_align_down(start), 5938 undo_isolate_page_range(pfn_max_align_down(start),
5937 pfn_max_align_up(end), migratetype); 5939 pfn_max_align_up(end), migratetype);
5938 return ret; 5940 return ret;
5939 } 5941 }
5940 5942
5941 void free_contig_range(unsigned long pfn, unsigned nr_pages) 5943 void free_contig_range(unsigned long pfn, unsigned nr_pages)
5942 { 5944 {
5943 unsigned int count = 0; 5945 unsigned int count = 0;
5944 5946
5945 for (; nr_pages--; pfn++) { 5947 for (; nr_pages--; pfn++) {
5946 struct page *page = pfn_to_page(pfn); 5948 struct page *page = pfn_to_page(pfn);
5947 5949
5948 count += page_count(page) != 1; 5950 count += page_count(page) != 1;
5949 __free_page(page); 5951 __free_page(page);
5950 } 5952 }
5951 WARN(count != 0, "%d pages are still in use!\n", count); 5953 WARN(count != 0, "%d pages are still in use!\n", count);
5952 } 5954 }
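A hypothetical caller sketch for the pair of interfaces above; base_pfn, the 16-page size and the use of MIGRATE_CMA are assumptions about how a CMA region might have been set up, not something this file establishes:

#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *grab_contig_block(unsigned long base_pfn)
{
        unsigned long nr_pages = 1UL << 4;      /* 16 pages, arbitrary example size */
        int ret;

        ret = alloc_contig_range(base_pfn, base_pfn + nr_pages, MIGRATE_CMA);
        if (ret)
                return NULL;                    /* range busy or migration failed */
        return pfn_to_page(base_pfn);
}

static void release_contig_block(unsigned long base_pfn)
{
        free_contig_range(base_pfn, 1UL << 4);
}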
5953 #endif 5955 #endif
5954 5956
5955 #ifdef CONFIG_MEMORY_HOTPLUG 5957 #ifdef CONFIG_MEMORY_HOTPLUG
5956 static int __meminit __zone_pcp_update(void *data) 5958 static int __meminit __zone_pcp_update(void *data)
5957 { 5959 {
5958 struct zone *zone = data; 5960 struct zone *zone = data;
5959 int cpu; 5961 int cpu;
5960 unsigned long batch = zone_batchsize(zone), flags; 5962 unsigned long batch = zone_batchsize(zone), flags;
5961 5963
5962 for_each_possible_cpu(cpu) { 5964 for_each_possible_cpu(cpu) {
5963 struct per_cpu_pageset *pset; 5965 struct per_cpu_pageset *pset;
5964 struct per_cpu_pages *pcp; 5966 struct per_cpu_pages *pcp;
5965 5967
5966 pset = per_cpu_ptr(zone->pageset, cpu); 5968 pset = per_cpu_ptr(zone->pageset, cpu);
5967 pcp = &pset->pcp; 5969 pcp = &pset->pcp;
5968 5970
5969 local_irq_save(flags); 5971 local_irq_save(flags);
5970 if (pcp->count > 0) 5972 if (pcp->count > 0)
5971 free_pcppages_bulk(zone, pcp->count, pcp); 5973 free_pcppages_bulk(zone, pcp->count, pcp);
5972 drain_zonestat(zone, pset); 5974 drain_zonestat(zone, pset);
5973 setup_pageset(pset, batch); 5975 setup_pageset(pset, batch);
5974 local_irq_restore(flags); 5976 local_irq_restore(flags);
5975 } 5977 }
5976 return 0; 5978 return 0;
5977 } 5979 }
5978 5980
5979 void __meminit zone_pcp_update(struct zone *zone) 5981 void __meminit zone_pcp_update(struct zone *zone)
5980 { 5982 {
5981 stop_machine(__zone_pcp_update, zone, NULL); 5983 stop_machine(__zone_pcp_update, zone, NULL);
5982 } 5984 }
5983 #endif 5985 #endif
5984 5986
5985 void zone_pcp_reset(struct zone *zone) 5987 void zone_pcp_reset(struct zone *zone)
5986 { 5988 {
5987 unsigned long flags; 5989 unsigned long flags;
5988 int cpu; 5990 int cpu;
5989 struct per_cpu_pageset *pset; 5991 struct per_cpu_pageset *pset;
5990 5992
5991 /* avoid races with drain_pages() */ 5993 /* avoid races with drain_pages() */
5992 local_irq_save(flags); 5994 local_irq_save(flags);
5993 if (zone->pageset != &boot_pageset) { 5995 if (zone->pageset != &boot_pageset) {
5994 for_each_online_cpu(cpu) { 5996 for_each_online_cpu(cpu) {
5995 pset = per_cpu_ptr(zone->pageset, cpu); 5997 pset = per_cpu_ptr(zone->pageset, cpu);
5996 drain_zonestat(zone, pset); 5998 drain_zonestat(zone, pset);
5997 } 5999 }
5998 free_percpu(zone->pageset); 6000 free_percpu(zone->pageset);
5999 zone->pageset = &boot_pageset; 6001 zone->pageset = &boot_pageset;
6000 } 6002 }
6001 local_irq_restore(flags); 6003 local_irq_restore(flags);
6002 } 6004 }
6003 6005
6004 #ifdef CONFIG_MEMORY_HOTREMOVE 6006 #ifdef CONFIG_MEMORY_HOTREMOVE
6005 /* 6007 /*
6006 * All pages in the range must be isolated before calling this. 6008 * All pages in the range must be isolated before calling this.
6007 */ 6009 */
6008 void 6010 void
6009 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6011 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6010 { 6012 {
6011 struct page *page; 6013 struct page *page;
6012 struct zone *zone; 6014 struct zone *zone;
6013 int order, i; 6015 int order, i;
6014 unsigned long pfn; 6016 unsigned long pfn;
6015 unsigned long flags; 6017 unsigned long flags;
6016 /* find the first valid pfn */ 6018 /* find the first valid pfn */
6017 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6019 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6018 if (pfn_valid(pfn)) 6020 if (pfn_valid(pfn))
6019 break; 6021 break;
6020 if (pfn == end_pfn) 6022 if (pfn == end_pfn)
6021 return; 6023 return;
6022 zone = page_zone(pfn_to_page(pfn)); 6024 zone = page_zone(pfn_to_page(pfn));
6023 spin_lock_irqsave(&zone->lock, flags); 6025 spin_lock_irqsave(&zone->lock, flags);
6024 pfn = start_pfn; 6026 pfn = start_pfn;
6025 while (pfn < end_pfn) { 6027 while (pfn < end_pfn) {
6026 if (!pfn_valid(pfn)) { 6028 if (!pfn_valid(pfn)) {
6027 pfn++; 6029 pfn++;
6028 continue; 6030 continue;
6029 } 6031 }
6030 page = pfn_to_page(pfn); 6032 page = pfn_to_page(pfn);
6031 /* 6033 /*
6032 * The HWPoisoned page may not be in the buddy system, and 6034 * The HWPoisoned page may not be in the buddy system, and
6033 * page_count() is not 0. 6035 * page_count() is not 0.
6034 */ 6036 */
6035 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6037 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6036 pfn++; 6038 pfn++;
6037 SetPageReserved(page); 6039 SetPageReserved(page);
6038 continue; 6040 continue;
6039 } 6041 }
6040 6042
6041 BUG_ON(page_count(page)); 6043 BUG_ON(page_count(page));
6042 BUG_ON(!PageBuddy(page)); 6044 BUG_ON(!PageBuddy(page));
6043 order = page_order(page); 6045 order = page_order(page);
6044 #ifdef CONFIG_DEBUG_VM 6046 #ifdef CONFIG_DEBUG_VM
6045 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6047 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6046 pfn, 1 << order, end_pfn); 6048 pfn, 1 << order, end_pfn);
6047 #endif 6049 #endif
6048 list_del(&page->lru); 6050 list_del(&page->lru);
6049 rmv_page_order(page); 6051 rmv_page_order(page);
6050 zone->free_area[order].nr_free--; 6052 zone->free_area[order].nr_free--;
6051 for (i = 0; i < (1 << order); i++) 6053 for (i = 0; i < (1 << order); i++)
6052 SetPageReserved((page+i)); 6054 SetPageReserved((page+i));
6053 pfn += (1 << order); 6055 pfn += (1 << order);
6054 } 6056 }
6055 spin_unlock_irqrestore(&zone->lock, flags); 6057 spin_unlock_irqrestore(&zone->lock, flags);
6056 } 6058 }
6057 #endif 6059 #endif
6058 6060
6059 #ifdef CONFIG_MEMORY_FAILURE 6061 #ifdef CONFIG_MEMORY_FAILURE
6060 bool is_free_buddy_page(struct page *page) 6062 bool is_free_buddy_page(struct page *page)
6061 { 6063 {
6062 struct zone *zone = page_zone(page); 6064 struct zone *zone = page_zone(page);
6063 unsigned long pfn = page_to_pfn(page); 6065 unsigned long pfn = page_to_pfn(page);
6064 unsigned long flags; 6066 unsigned long flags;
6065 int order; 6067 int order;
6066 6068
6067 spin_lock_irqsave(&zone->lock, flags); 6069 spin_lock_irqsave(&zone->lock, flags);
6068 for (order = 0; order < MAX_ORDER; order++) { 6070 for (order = 0; order < MAX_ORDER; order++) {
6069 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6071 struct page *page_head = page - (pfn & ((1 << order) - 1));
6070 6072
6071 if (PageBuddy(page_head) && page_order(page_head) >= order) 6073 if (PageBuddy(page_head) && page_order(page_head) >= order)
6072 break; 6074 break;
6073 } 6075 }
6074 spin_unlock_irqrestore(&zone->lock, flags); 6076 spin_unlock_irqrestore(&zone->lock, flags);
6075 6077
6076 return order < MAX_ORDER; 6078 return order < MAX_ORDER;
6077 } 6079 }
6078 #endif 6080 #endif
6079 6081
6080 static const struct trace_print_flags pageflag_names[] = { 6082 static const struct trace_print_flags pageflag_names[] = {
6081 {1UL << PG_locked, "locked" }, 6083 {1UL << PG_locked, "locked" },
6082 {1UL << PG_error, "error" }, 6084 {1UL << PG_error, "error" },
6083 {1UL << PG_referenced, "referenced" }, 6085 {1UL << PG_referenced, "referenced" },
6084 {1UL << PG_uptodate, "uptodate" }, 6086 {1UL << PG_uptodate, "uptodate" },
6085 {1UL << PG_dirty, "dirty" }, 6087 {1UL << PG_dirty, "dirty" },
6086 {1UL << PG_lru, "lru" }, 6088 {1UL << PG_lru, "lru" },
6087 {1UL << PG_active, "active" }, 6089 {1UL << PG_active, "active" },
6088 {1UL << PG_slab, "slab" }, 6090 {1UL << PG_slab, "slab" },
6089 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6091 {1UL << PG_owner_priv_1, "owner_priv_1" },
6090 {1UL << PG_arch_1, "arch_1" }, 6092 {1UL << PG_arch_1, "arch_1" },
6091 {1UL << PG_reserved, "reserved" }, 6093 {1UL << PG_reserved, "reserved" },
6092 {1UL << PG_private, "private" }, 6094 {1UL << PG_private, "private" },
6093 {1UL << PG_private_2, "private_2" }, 6095 {1UL << PG_private_2, "private_2" },
6094 {1UL << PG_writeback, "writeback" }, 6096 {1UL << PG_writeback, "writeback" },
6095 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6097 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6096 {1UL << PG_head, "head" }, 6098 {1UL << PG_head, "head" },
6097 {1UL << PG_tail, "tail" }, 6099 {1UL << PG_tail, "tail" },
6098 #else 6100 #else
6099 {1UL << PG_compound, "compound" }, 6101 {1UL << PG_compound, "compound" },
6100 #endif 6102 #endif
6101 {1UL << PG_swapcache, "swapcache" }, 6103 {1UL << PG_swapcache, "swapcache" },
6102 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6104 {1UL << PG_mappedtodisk, "mappedtodisk" },
6103 {1UL << PG_reclaim, "reclaim" }, 6105 {1UL << PG_reclaim, "reclaim" },
6104 {1UL << PG_swapbacked, "swapbacked" }, 6106 {1UL << PG_swapbacked, "swapbacked" },
6105 {1UL << PG_unevictable, "unevictable" }, 6107 {1UL << PG_unevictable, "unevictable" },
6106 #ifdef CONFIG_MMU 6108 #ifdef CONFIG_MMU
6107 {1UL << PG_mlocked, "mlocked" }, 6109 {1UL << PG_mlocked, "mlocked" },
6108 #endif 6110 #endif
6109 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6111 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6110 {1UL << PG_uncached, "uncached" }, 6112 {1UL << PG_uncached, "uncached" },
6111 #endif 6113 #endif
6112 #ifdef CONFIG_MEMORY_FAILURE 6114 #ifdef CONFIG_MEMORY_FAILURE
6113 {1UL << PG_hwpoison, "hwpoison" }, 6115 {1UL << PG_hwpoison, "hwpoison" },
6114 #endif 6116 #endif
6115 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6117 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6116 {1UL << PG_compound_lock, "compound_lock" }, 6118 {1UL << PG_compound_lock, "compound_lock" },
6117 #endif 6119 #endif
6118 }; 6120 };
6119 6121
6120 static void dump_page_flags(unsigned long flags) 6122 static void dump_page_flags(unsigned long flags)
6121 { 6123 {
6122 const char *delim = ""; 6124 const char *delim = "";
6123 unsigned long mask; 6125 unsigned long mask;
6124 int i; 6126 int i;
6125 6127
6126 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6128 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6127 6129
6128 printk(KERN_ALERT "page flags: %#lx(", flags); 6130 printk(KERN_ALERT "page flags: %#lx(", flags);
6129 6131
6130 /* remove zone id */ 6132 /* remove zone id */
6131 flags &= (1UL << NR_PAGEFLAGS) - 1; 6133 flags &= (1UL << NR_PAGEFLAGS) - 1;
6132 6134
6133 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6135 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6134 6136
6135 mask = pageflag_names[i].mask; 6137 mask = pageflag_names[i].mask;
6136 if ((flags & mask) != mask) 6138 if ((flags & mask) != mask)
6137 continue; 6139 continue;
6138 6140
6139 flags &= ~mask; 6141 flags &= ~mask;
6140 printk("%s%s", delim, pageflag_names[i].name); 6142 printk("%s%s", delim, pageflag_names[i].name);
6141 delim = "|"; 6143 delim = "|";
6142 } 6144 }
6143 6145
6144 /* check for left over flags */ 6146 /* check for left over flags */
6145 if (flags) 6147 if (flags)
6146 printk("%s%#lx", delim, flags); 6148 printk("%s%#lx", delim, flags);
6147 6149
6148 printk(")\n"); 6150 printk(")\n");
6149 } 6151 }
6150 6152
6151 void dump_page(struct page *page) 6153 void dump_page(struct page *page)
6152 { 6154 {
6153 printk(KERN_ALERT 6155 printk(KERN_ALERT
6154 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6156 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6155 page, atomic_read(&page->_count), page_mapcount(page), 6157 page, atomic_read(&page->_count), page_mapcount(page),
6156 page->mapping, page->index); 6158 page->mapping, page->index);
6157 dump_page_flags(page->flags); 6159 dump_page_flags(page->flags);
6158 mem_cgroup_print_bad_page(page); 6160 mem_cgroup_print_bad_page(page);
6159 } 6161 }
6160 6162