Commit 0cbef29a782162a3896487901eca4550bfa397ef

Authored by KOSAKI Motohiro
Committed by Linus Torvalds
1 parent 52c8f6a5ae

mm: __rmqueue_fallback() should respect pageblock type

When __rmqueue_fallback() doesn't find a free block of the required size,
it splits a larger page and puts the rest of the page back onto the free
list.

But it has one serious mistake.  When putting the remainder back,
__rmqueue_fallback() always uses start_migratetype if the type is not CMA.
However, __rmqueue_fallback() is only called when all of the
start_migratetype free lists are empty.  In other words,
__rmqueue_fallback() always puts the remainder back on the wrong list
unless try_to_steal_freepages() has changed the pageblock type, i.e.
whenever the requested size is smaller than half of a pageblock.  The end
result is that the anti-fragmentation framework increases fragmentation
instead of decreasing it.

Mel's original anti-fragmentation code did the right thing, but commit
47118af076f6 ("mm: mmzone: MIGRATE_CMA migration type added") broke it.

This patch restores the sane, old behavior.  It also removes an incorrect
comment which was introduced by commit fef903efcf0c ("mm/page_alloc.c:
restructure free-page stealing code and fix a bug").

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

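To make the description above concrete, here is a minimal, self-contained toy model of the remainder placement.  This is an illustration only, not kernel code: the file name toy_fallback.c, the rmqueue_fallback()/dump() helpers, the three migratetypes, the 1024-page "pageblock" and the half-a-block threshold are all invented stand-ins; the real logic lives in __rmqueue_fallback() and try_to_steal_freepages() in mm/page_alloc.c.

/*
 * toy_fallback.c - toy model of the free-list placement described above.
 * NOT kernel code.  Build and run:  cc toy_fallback.c && ./a.out
 */
#include <stdio.h>
#include <string.h>

enum mt { UNMOVABLE, RECLAIMABLE, MOVABLE, NR_TYPES };

static const char * const mt_name[NR_TYPES] = {
	"UNMOVABLE", "RECLAIMABLE", "MOVABLE"
};

/* pages sitting on each migratetype's free list (stand-in for free_area) */
static long freelist[NR_TYPES];

static void dump(const char *tag)
{
	printf("%-7s UNMOVABLE=%-5ld RECLAIMABLE=%-5ld MOVABLE=%-5ld\n",
	       tag, freelist[UNMOVABLE], freelist[RECLAIMABLE],
	       freelist[MOVABLE]);
}

/*
 * A fallback allocation: start_type's own lists are empty, so 'block' pages
 * are taken from a pageblock of fallback_type, 'want' pages go to the
 * caller, and the remainder goes back on some free list.  'buggy' selects
 * the pre-fix placement (always start_type unless the block is CMA).
 */
static void rmqueue_fallback(int start_type, int fallback_type,
			     long block, long want, int buggy)
{
	long remainder = block - want;
	int owner;

	freelist[fallback_type] -= block;

	if (want > block / 2) {
		/* big request: the whole pageblock is converted, so
		 * start_type really does own the remainder */
		owner = start_type;
	} else if (buggy) {
		/* pre-fix: remainder dumped on start_type's list even
		 * though the pageblock keeps fallback_type */
		owner = start_type;
	} else {
		/* fixed: remainder stays with the pageblock's type */
		owner = fallback_type;
	}
	freelist[owner] += remainder;
	printf("%s asks for %ld pages; remainder %ld goes to the %s list\n",
	       mt_name[start_type], want, remainder, mt_name[owner]);
}

int main(void)
{
	int buggy;

	for (buggy = 1; buggy >= 0; buggy--) {
		memset(freelist, 0, sizeof(freelist));
		freelist[MOVABLE] = 1024;	/* one free MOVABLE pageblock */

		printf("%s behaviour:\n", buggy ? "pre-fix" : "fixed");
		dump("before");
		/* a small UNMOVABLE request falls back into the MOVABLE block */
		rmqueue_fallback(UNMOVABLE, MOVABLE, 1024, 8, buggy);
		dump("after");
		printf("\n");
	}
	return 0;
}

In the pre-fix pass, 1016 pages of a still-MOVABLE pageblock end up on the UNMOVABLE free list, so later unmovable allocations get sprinkled into that movable block; in the fixed pass the remainder stays on the MOVABLE list and only the 8 pages actually handed out come from the movable pageblock.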
Showing 1 changed file (mm/page_alloc.c) with 5 additions and 10 deletions

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 72
73 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 73 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
74 DEFINE_PER_CPU(int, numa_node); 74 DEFINE_PER_CPU(int, numa_node);
75 EXPORT_PER_CPU_SYMBOL(numa_node); 75 EXPORT_PER_CPU_SYMBOL(numa_node);
76 #endif 76 #endif
77 77
78 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 78 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
79 /* 79 /*
80 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 80 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
81 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 81 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
82 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 82 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
83 * defined in <linux/topology.h>. 83 * defined in <linux/topology.h>.
84 */ 84 */
85 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 85 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
86 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 86 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
87 #endif 87 #endif
88 88
89 /* 89 /*
90 * Array of node states. 90 * Array of node states.
91 */ 91 */
92 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 92 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
93 [N_POSSIBLE] = NODE_MASK_ALL, 93 [N_POSSIBLE] = NODE_MASK_ALL,
94 [N_ONLINE] = { { [0] = 1UL } }, 94 [N_ONLINE] = { { [0] = 1UL } },
95 #ifndef CONFIG_NUMA 95 #ifndef CONFIG_NUMA
96 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 96 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
97 #ifdef CONFIG_HIGHMEM 97 #ifdef CONFIG_HIGHMEM
98 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 98 [N_HIGH_MEMORY] = { { [0] = 1UL } },
99 #endif 99 #endif
100 #ifdef CONFIG_MOVABLE_NODE 100 #ifdef CONFIG_MOVABLE_NODE
101 [N_MEMORY] = { { [0] = 1UL } }, 101 [N_MEMORY] = { { [0] = 1UL } },
102 #endif 102 #endif
103 [N_CPU] = { { [0] = 1UL } }, 103 [N_CPU] = { { [0] = 1UL } },
104 #endif /* NUMA */ 104 #endif /* NUMA */
105 }; 105 };
106 EXPORT_SYMBOL(node_states); 106 EXPORT_SYMBOL(node_states);
107 107
108 /* Protect totalram_pages and zone->managed_pages */ 108 /* Protect totalram_pages and zone->managed_pages */
109 static DEFINE_SPINLOCK(managed_page_count_lock); 109 static DEFINE_SPINLOCK(managed_page_count_lock);
110 110
111 unsigned long totalram_pages __read_mostly; 111 unsigned long totalram_pages __read_mostly;
112 unsigned long totalreserve_pages __read_mostly; 112 unsigned long totalreserve_pages __read_mostly;
113 /* 113 /*
114 * When calculating the number of globally allowed dirty pages, there 114 * When calculating the number of globally allowed dirty pages, there
115 * is a certain number of per-zone reserves that should not be 115 * is a certain number of per-zone reserves that should not be
116 * considered dirtyable memory. This is the sum of those reserves 116 * considered dirtyable memory. This is the sum of those reserves
117 * over all existing zones that contribute dirtyable memory. 117 * over all existing zones that contribute dirtyable memory.
118 */ 118 */
119 unsigned long dirty_balance_reserve __read_mostly; 119 unsigned long dirty_balance_reserve __read_mostly;
120 120
121 int percpu_pagelist_fraction; 121 int percpu_pagelist_fraction;
122 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 122 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
123 123
124 #ifdef CONFIG_PM_SLEEP 124 #ifdef CONFIG_PM_SLEEP
125 /* 125 /*
126 * The following functions are used by the suspend/hibernate code to temporarily 126 * The following functions are used by the suspend/hibernate code to temporarily
127 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 127 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
128 * while devices are suspended. To avoid races with the suspend/hibernate code, 128 * while devices are suspended. To avoid races with the suspend/hibernate code,
129 * they should always be called with pm_mutex held (gfp_allowed_mask also should 129 * they should always be called with pm_mutex held (gfp_allowed_mask also should
130 * only be modified with pm_mutex held, unless the suspend/hibernate code is 130 * only be modified with pm_mutex held, unless the suspend/hibernate code is
131 * guaranteed not to run in parallel with that modification). 131 * guaranteed not to run in parallel with that modification).
132 */ 132 */
133 133
134 static gfp_t saved_gfp_mask; 134 static gfp_t saved_gfp_mask;
135 135
136 void pm_restore_gfp_mask(void) 136 void pm_restore_gfp_mask(void)
137 { 137 {
138 WARN_ON(!mutex_is_locked(&pm_mutex)); 138 WARN_ON(!mutex_is_locked(&pm_mutex));
139 if (saved_gfp_mask) { 139 if (saved_gfp_mask) {
140 gfp_allowed_mask = saved_gfp_mask; 140 gfp_allowed_mask = saved_gfp_mask;
141 saved_gfp_mask = 0; 141 saved_gfp_mask = 0;
142 } 142 }
143 } 143 }
144 144
145 void pm_restrict_gfp_mask(void) 145 void pm_restrict_gfp_mask(void)
146 { 146 {
147 WARN_ON(!mutex_is_locked(&pm_mutex)); 147 WARN_ON(!mutex_is_locked(&pm_mutex));
148 WARN_ON(saved_gfp_mask); 148 WARN_ON(saved_gfp_mask);
149 saved_gfp_mask = gfp_allowed_mask; 149 saved_gfp_mask = gfp_allowed_mask;
150 gfp_allowed_mask &= ~GFP_IOFS; 150 gfp_allowed_mask &= ~GFP_IOFS;
151 } 151 }
152 152
153 bool pm_suspended_storage(void) 153 bool pm_suspended_storage(void)
154 { 154 {
155 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 155 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
156 return false; 156 return false;
157 return true; 157 return true;
158 } 158 }
159 #endif /* CONFIG_PM_SLEEP */ 159 #endif /* CONFIG_PM_SLEEP */
160 160
161 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 161 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
162 int pageblock_order __read_mostly; 162 int pageblock_order __read_mostly;
163 #endif 163 #endif
164 164
165 static void __free_pages_ok(struct page *page, unsigned int order); 165 static void __free_pages_ok(struct page *page, unsigned int order);
166 166
167 /* 167 /*
168 * results with 256, 32 in the lowmem_reserve sysctl: 168 * results with 256, 32 in the lowmem_reserve sysctl:
169 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 169 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
170 * 1G machine -> (16M dma, 784M normal, 224M high) 170 * 1G machine -> (16M dma, 784M normal, 224M high)
171 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 171 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
172 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 172 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
173 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 173 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
174 * 174 *
175 * TBD: should special case ZONE_DMA32 machines here - in those we normally 175 * TBD: should special case ZONE_DMA32 machines here - in those we normally
176 * don't need any ZONE_NORMAL reservation 176 * don't need any ZONE_NORMAL reservation
177 */ 177 */
178 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 178 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
179 #ifdef CONFIG_ZONE_DMA 179 #ifdef CONFIG_ZONE_DMA
180 256, 180 256,
181 #endif 181 #endif
182 #ifdef CONFIG_ZONE_DMA32 182 #ifdef CONFIG_ZONE_DMA32
183 256, 183 256,
184 #endif 184 #endif
185 #ifdef CONFIG_HIGHMEM 185 #ifdef CONFIG_HIGHMEM
186 32, 186 32,
187 #endif 187 #endif
188 32, 188 32,
189 }; 189 };
190 190
191 EXPORT_SYMBOL(totalram_pages); 191 EXPORT_SYMBOL(totalram_pages);
192 192
193 static char * const zone_names[MAX_NR_ZONES] = { 193 static char * const zone_names[MAX_NR_ZONES] = {
194 #ifdef CONFIG_ZONE_DMA 194 #ifdef CONFIG_ZONE_DMA
195 "DMA", 195 "DMA",
196 #endif 196 #endif
197 #ifdef CONFIG_ZONE_DMA32 197 #ifdef CONFIG_ZONE_DMA32
198 "DMA32", 198 "DMA32",
199 #endif 199 #endif
200 "Normal", 200 "Normal",
201 #ifdef CONFIG_HIGHMEM 201 #ifdef CONFIG_HIGHMEM
202 "HighMem", 202 "HighMem",
203 #endif 203 #endif
204 "Movable", 204 "Movable",
205 }; 205 };
206 206
207 int min_free_kbytes = 1024; 207 int min_free_kbytes = 1024;
208 int user_min_free_kbytes; 208 int user_min_free_kbytes;
209 209
210 static unsigned long __meminitdata nr_kernel_pages; 210 static unsigned long __meminitdata nr_kernel_pages;
211 static unsigned long __meminitdata nr_all_pages; 211 static unsigned long __meminitdata nr_all_pages;
212 static unsigned long __meminitdata dma_reserve; 212 static unsigned long __meminitdata dma_reserve;
213 213
214 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 214 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
215 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 215 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
216 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __initdata required_kernelcore; 217 static unsigned long __initdata required_kernelcore;
218 static unsigned long __initdata required_movablecore; 218 static unsigned long __initdata required_movablecore;
219 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 219 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
220 220
221 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 221 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
222 int movable_zone; 222 int movable_zone;
223 EXPORT_SYMBOL(movable_zone); 223 EXPORT_SYMBOL(movable_zone);
224 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 224 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
225 225
226 #if MAX_NUMNODES > 1 226 #if MAX_NUMNODES > 1
227 int nr_node_ids __read_mostly = MAX_NUMNODES; 227 int nr_node_ids __read_mostly = MAX_NUMNODES;
228 int nr_online_nodes __read_mostly = 1; 228 int nr_online_nodes __read_mostly = 1;
229 EXPORT_SYMBOL(nr_node_ids); 229 EXPORT_SYMBOL(nr_node_ids);
230 EXPORT_SYMBOL(nr_online_nodes); 230 EXPORT_SYMBOL(nr_online_nodes);
231 #endif 231 #endif
232 232
233 int page_group_by_mobility_disabled __read_mostly; 233 int page_group_by_mobility_disabled __read_mostly;
234 234
235 void set_pageblock_migratetype(struct page *page, int migratetype) 235 void set_pageblock_migratetype(struct page *page, int migratetype)
236 { 236 {
237 if (unlikely(page_group_by_mobility_disabled && 237 if (unlikely(page_group_by_mobility_disabled &&
238 migratetype < MIGRATE_PCPTYPES)) 238 migratetype < MIGRATE_PCPTYPES))
239 migratetype = MIGRATE_UNMOVABLE; 239 migratetype = MIGRATE_UNMOVABLE;
240 240
241 set_pageblock_flags_group(page, (unsigned long)migratetype, 241 set_pageblock_flags_group(page, (unsigned long)migratetype,
242 PB_migrate, PB_migrate_end); 242 PB_migrate, PB_migrate_end);
243 } 243 }
244 244
245 bool oom_killer_disabled __read_mostly; 245 bool oom_killer_disabled __read_mostly;
246 246
247 #ifdef CONFIG_DEBUG_VM 247 #ifdef CONFIG_DEBUG_VM
248 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 248 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
249 { 249 {
250 int ret = 0; 250 int ret = 0;
251 unsigned seq; 251 unsigned seq;
252 unsigned long pfn = page_to_pfn(page); 252 unsigned long pfn = page_to_pfn(page);
253 unsigned long sp, start_pfn; 253 unsigned long sp, start_pfn;
254 254
255 do { 255 do {
256 seq = zone_span_seqbegin(zone); 256 seq = zone_span_seqbegin(zone);
257 start_pfn = zone->zone_start_pfn; 257 start_pfn = zone->zone_start_pfn;
258 sp = zone->spanned_pages; 258 sp = zone->spanned_pages;
259 if (!zone_spans_pfn(zone, pfn)) 259 if (!zone_spans_pfn(zone, pfn))
260 ret = 1; 260 ret = 1;
261 } while (zone_span_seqretry(zone, seq)); 261 } while (zone_span_seqretry(zone, seq));
262 262
263 if (ret) 263 if (ret)
264 pr_err("page %lu outside zone [ %lu - %lu ]\n", 264 pr_err("page %lu outside zone [ %lu - %lu ]\n",
265 pfn, start_pfn, start_pfn + sp); 265 pfn, start_pfn, start_pfn + sp);
266 266
267 return ret; 267 return ret;
268 } 268 }
269 269
270 static int page_is_consistent(struct zone *zone, struct page *page) 270 static int page_is_consistent(struct zone *zone, struct page *page)
271 { 271 {
272 if (!pfn_valid_within(page_to_pfn(page))) 272 if (!pfn_valid_within(page_to_pfn(page)))
273 return 0; 273 return 0;
274 if (zone != page_zone(page)) 274 if (zone != page_zone(page))
275 return 0; 275 return 0;
276 276
277 return 1; 277 return 1;
278 } 278 }
279 /* 279 /*
280 * Temporary debugging check for pages not lying within a given zone. 280 * Temporary debugging check for pages not lying within a given zone.
281 */ 281 */
282 static int bad_range(struct zone *zone, struct page *page) 282 static int bad_range(struct zone *zone, struct page *page)
283 { 283 {
284 if (page_outside_zone_boundaries(zone, page)) 284 if (page_outside_zone_boundaries(zone, page))
285 return 1; 285 return 1;
286 if (!page_is_consistent(zone, page)) 286 if (!page_is_consistent(zone, page))
287 return 1; 287 return 1;
288 288
289 return 0; 289 return 0;
290 } 290 }
291 #else 291 #else
292 static inline int bad_range(struct zone *zone, struct page *page) 292 static inline int bad_range(struct zone *zone, struct page *page)
293 { 293 {
294 return 0; 294 return 0;
295 } 295 }
296 #endif 296 #endif
297 297
298 static void bad_page(struct page *page) 298 static void bad_page(struct page *page)
299 { 299 {
300 static unsigned long resume; 300 static unsigned long resume;
301 static unsigned long nr_shown; 301 static unsigned long nr_shown;
302 static unsigned long nr_unshown; 302 static unsigned long nr_unshown;
303 303
304 /* Don't complain about poisoned pages */ 304 /* Don't complain about poisoned pages */
305 if (PageHWPoison(page)) { 305 if (PageHWPoison(page)) {
306 page_mapcount_reset(page); /* remove PageBuddy */ 306 page_mapcount_reset(page); /* remove PageBuddy */
307 return; 307 return;
308 } 308 }
309 309
310 /* 310 /*
311 * Allow a burst of 60 reports, then keep quiet for that minute; 311 * Allow a burst of 60 reports, then keep quiet for that minute;
312 * or allow a steady drip of one report per second. 312 * or allow a steady drip of one report per second.
313 */ 313 */
314 if (nr_shown == 60) { 314 if (nr_shown == 60) {
315 if (time_before(jiffies, resume)) { 315 if (time_before(jiffies, resume)) {
316 nr_unshown++; 316 nr_unshown++;
317 goto out; 317 goto out;
318 } 318 }
319 if (nr_unshown) { 319 if (nr_unshown) {
320 printk(KERN_ALERT 320 printk(KERN_ALERT
321 "BUG: Bad page state: %lu messages suppressed\n", 321 "BUG: Bad page state: %lu messages suppressed\n",
322 nr_unshown); 322 nr_unshown);
323 nr_unshown = 0; 323 nr_unshown = 0;
324 } 324 }
325 nr_shown = 0; 325 nr_shown = 0;
326 } 326 }
327 if (nr_shown++ == 0) 327 if (nr_shown++ == 0)
328 resume = jiffies + 60 * HZ; 328 resume = jiffies + 60 * HZ;
329 329
330 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 330 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
331 current->comm, page_to_pfn(page)); 331 current->comm, page_to_pfn(page));
332 dump_page(page); 332 dump_page(page);
333 333
334 print_modules(); 334 print_modules();
335 dump_stack(); 335 dump_stack();
336 out: 336 out:
337 /* Leave bad fields for debug, except PageBuddy could make trouble */ 337 /* Leave bad fields for debug, except PageBuddy could make trouble */
338 page_mapcount_reset(page); /* remove PageBuddy */ 338 page_mapcount_reset(page); /* remove PageBuddy */
339 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 339 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
340 } 340 }
341 341
342 /* 342 /*
343 * Higher-order pages are called "compound pages". They are structured thusly: 343 * Higher-order pages are called "compound pages". They are structured thusly:
344 * 344 *
345 * The first PAGE_SIZE page is called the "head page". 345 * The first PAGE_SIZE page is called the "head page".
346 * 346 *
347 * The remaining PAGE_SIZE pages are called "tail pages". 347 * The remaining PAGE_SIZE pages are called "tail pages".
348 * 348 *
349 * All pages have PG_compound set. All tail pages have their ->first_page 349 * All pages have PG_compound set. All tail pages have their ->first_page
350 * pointing at the head page. 350 * pointing at the head page.
351 * 351 *
352 * The first tail page's ->lru.next holds the address of the compound page's 352 * The first tail page's ->lru.next holds the address of the compound page's
353 * put_page() function. Its ->lru.prev holds the order of allocation. 353 * put_page() function. Its ->lru.prev holds the order of allocation.
354 * This usage means that zero-order pages may not be compound. 354 * This usage means that zero-order pages may not be compound.
355 */ 355 */
356 356
357 static void free_compound_page(struct page *page) 357 static void free_compound_page(struct page *page)
358 { 358 {
359 __free_pages_ok(page, compound_order(page)); 359 __free_pages_ok(page, compound_order(page));
360 } 360 }
361 361
362 void prep_compound_page(struct page *page, unsigned long order) 362 void prep_compound_page(struct page *page, unsigned long order)
363 { 363 {
364 int i; 364 int i;
365 int nr_pages = 1 << order; 365 int nr_pages = 1 << order;
366 366
367 set_compound_page_dtor(page, free_compound_page); 367 set_compound_page_dtor(page, free_compound_page);
368 set_compound_order(page, order); 368 set_compound_order(page, order);
369 __SetPageHead(page); 369 __SetPageHead(page);
370 for (i = 1; i < nr_pages; i++) { 370 for (i = 1; i < nr_pages; i++) {
371 struct page *p = page + i; 371 struct page *p = page + i;
372 __SetPageTail(p); 372 __SetPageTail(p);
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 } 375 }
376 } 376 }
377 377
378 /* update __split_huge_page_refcount if you change this function */ 378 /* update __split_huge_page_refcount if you change this function */
379 static int destroy_compound_page(struct page *page, unsigned long order) 379 static int destroy_compound_page(struct page *page, unsigned long order)
380 { 380 {
381 int i; 381 int i;
382 int nr_pages = 1 << order; 382 int nr_pages = 1 << order;
383 int bad = 0; 383 int bad = 0;
384 384
385 if (unlikely(compound_order(page) != order)) { 385 if (unlikely(compound_order(page) != order)) {
386 bad_page(page); 386 bad_page(page);
387 bad++; 387 bad++;
388 } 388 }
389 389
390 __ClearPageHead(page); 390 __ClearPageHead(page);
391 391
392 for (i = 1; i < nr_pages; i++) { 392 for (i = 1; i < nr_pages; i++) {
393 struct page *p = page + i; 393 struct page *p = page + i;
394 394
395 if (unlikely(!PageTail(p) || (p->first_page != page))) { 395 if (unlikely(!PageTail(p) || (p->first_page != page))) {
396 bad_page(page); 396 bad_page(page);
397 bad++; 397 bad++;
398 } 398 }
399 __ClearPageTail(p); 399 __ClearPageTail(p);
400 } 400 }
401 401
402 return bad; 402 return bad;
403 } 403 }
404 404
405 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 405 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
406 { 406 {
407 int i; 407 int i;
408 408
409 /* 409 /*
410 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 410 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
411 * and __GFP_HIGHMEM from hard or soft interrupt context. 411 * and __GFP_HIGHMEM from hard or soft interrupt context.
412 */ 412 */
413 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 413 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
414 for (i = 0; i < (1 << order); i++) 414 for (i = 0; i < (1 << order); i++)
415 clear_highpage(page + i); 415 clear_highpage(page + i);
416 } 416 }
417 417
418 #ifdef CONFIG_DEBUG_PAGEALLOC 418 #ifdef CONFIG_DEBUG_PAGEALLOC
419 unsigned int _debug_guardpage_minorder; 419 unsigned int _debug_guardpage_minorder;
420 420
421 static int __init debug_guardpage_minorder_setup(char *buf) 421 static int __init debug_guardpage_minorder_setup(char *buf)
422 { 422 {
423 unsigned long res; 423 unsigned long res;
424 424
425 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 425 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
426 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 426 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
427 return 0; 427 return 0;
428 } 428 }
429 _debug_guardpage_minorder = res; 429 _debug_guardpage_minorder = res;
430 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 430 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
431 return 0; 431 return 0;
432 } 432 }
433 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 433 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
434 434
435 static inline void set_page_guard_flag(struct page *page) 435 static inline void set_page_guard_flag(struct page *page)
436 { 436 {
437 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 437 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
438 } 438 }
439 439
440 static inline void clear_page_guard_flag(struct page *page) 440 static inline void clear_page_guard_flag(struct page *page)
441 { 441 {
442 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 442 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
443 } 443 }
444 #else 444 #else
445 static inline void set_page_guard_flag(struct page *page) { } 445 static inline void set_page_guard_flag(struct page *page) { }
446 static inline void clear_page_guard_flag(struct page *page) { } 446 static inline void clear_page_guard_flag(struct page *page) { }
447 #endif 447 #endif
448 448
449 static inline void set_page_order(struct page *page, int order) 449 static inline void set_page_order(struct page *page, int order)
450 { 450 {
451 set_page_private(page, order); 451 set_page_private(page, order);
452 __SetPageBuddy(page); 452 __SetPageBuddy(page);
453 } 453 }
454 454
455 static inline void rmv_page_order(struct page *page) 455 static inline void rmv_page_order(struct page *page)
456 { 456 {
457 __ClearPageBuddy(page); 457 __ClearPageBuddy(page);
458 set_page_private(page, 0); 458 set_page_private(page, 0);
459 } 459 }
460 460
461 /* 461 /*
462 * Locate the struct page for both the matching buddy in our 462 * Locate the struct page for both the matching buddy in our
463 * pair (buddy1) and the combined O(n+1) page they form (page). 463 * pair (buddy1) and the combined O(n+1) page they form (page).
464 * 464 *
465 * 1) Any buddy B1 will have an order O twin B2 which satisfies 465 * 1) Any buddy B1 will have an order O twin B2 which satisfies
466 * the following equation: 466 * the following equation:
467 * B2 = B1 ^ (1 << O) 467 * B2 = B1 ^ (1 << O)
468 * For example, if the starting buddy (buddy2) is #8 its order 468 * For example, if the starting buddy (buddy2) is #8 its order
469 * 1 buddy is #10: 469 * 1 buddy is #10:
470 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 470 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
471 * 471 *
472 * 2) Any buddy B will have an order O+1 parent P which 472 * 2) Any buddy B will have an order O+1 parent P which
473 * satisfies the following equation: 473 * satisfies the following equation:
474 * P = B & ~(1 << O) 474 * P = B & ~(1 << O)
475 * 475 *
476 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 476 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
477 */ 477 */
478 static inline unsigned long 478 static inline unsigned long
479 __find_buddy_index(unsigned long page_idx, unsigned int order) 479 __find_buddy_index(unsigned long page_idx, unsigned int order)
480 { 480 {
481 return page_idx ^ (1 << order); 481 return page_idx ^ (1 << order);
482 } 482 }
483 483
484 /* 484 /*
485 * This function checks whether a page is free && is the buddy 485 * This function checks whether a page is free && is the buddy
486 * we can do coalesce a page and its buddy if 486 * we can do coalesce a page and its buddy if
487 * (a) the buddy is not in a hole && 487 * (a) the buddy is not in a hole &&
488 * (b) the buddy is in the buddy system && 488 * (b) the buddy is in the buddy system &&
489 * (c) a page and its buddy have the same order && 489 * (c) a page and its buddy have the same order &&
490 * (d) a page and its buddy are in the same zone. 490 * (d) a page and its buddy are in the same zone.
491 * 491 *
492 * For recording whether a page is in the buddy system, we set ->_mapcount 492 * For recording whether a page is in the buddy system, we set ->_mapcount
493 * PAGE_BUDDY_MAPCOUNT_VALUE. 493 * PAGE_BUDDY_MAPCOUNT_VALUE.
494 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 494 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
495 * serialized by zone->lock. 495 * serialized by zone->lock.
496 * 496 *
497 * For recording page's order, we use page_private(page). 497 * For recording page's order, we use page_private(page).
498 */ 498 */
499 static inline int page_is_buddy(struct page *page, struct page *buddy, 499 static inline int page_is_buddy(struct page *page, struct page *buddy,
500 int order) 500 int order)
501 { 501 {
502 if (!pfn_valid_within(page_to_pfn(buddy))) 502 if (!pfn_valid_within(page_to_pfn(buddy)))
503 return 0; 503 return 0;
504 504
505 if (page_zone_id(page) != page_zone_id(buddy)) 505 if (page_zone_id(page) != page_zone_id(buddy))
506 return 0; 506 return 0;
507 507
508 if (page_is_guard(buddy) && page_order(buddy) == order) { 508 if (page_is_guard(buddy) && page_order(buddy) == order) {
509 VM_BUG_ON(page_count(buddy) != 0); 509 VM_BUG_ON(page_count(buddy) != 0);
510 return 1; 510 return 1;
511 } 511 }
512 512
513 if (PageBuddy(buddy) && page_order(buddy) == order) { 513 if (PageBuddy(buddy) && page_order(buddy) == order) {
514 VM_BUG_ON(page_count(buddy) != 0); 514 VM_BUG_ON(page_count(buddy) != 0);
515 return 1; 515 return 1;
516 } 516 }
517 return 0; 517 return 0;
518 } 518 }
519 519
520 /* 520 /*
521 * Freeing function for a buddy system allocator. 521 * Freeing function for a buddy system allocator.
522 * 522 *
523 * The concept of a buddy system is to maintain direct-mapped table 523 * The concept of a buddy system is to maintain direct-mapped table
524 * (containing bit values) for memory blocks of various "orders". 524 * (containing bit values) for memory blocks of various "orders".
525 * The bottom level table contains the map for the smallest allocatable 525 * The bottom level table contains the map for the smallest allocatable
526 * units of memory (here, pages), and each level above it describes 526 * units of memory (here, pages), and each level above it describes
527 * pairs of units from the levels below, hence, "buddies". 527 * pairs of units from the levels below, hence, "buddies".
528 * At a high level, all that happens here is marking the table entry 528 * At a high level, all that happens here is marking the table entry
529 * at the bottom level available, and propagating the changes upward 529 * at the bottom level available, and propagating the changes upward
530 * as necessary, plus some accounting needed to play nicely with other 530 * as necessary, plus some accounting needed to play nicely with other
531 * parts of the VM system. 531 * parts of the VM system.
532 * At each level, we keep a list of pages, which are heads of continuous 532 * At each level, we keep a list of pages, which are heads of continuous
533 * free pages of length of (1 << order) and marked with _mapcount 533 * free pages of length of (1 << order) and marked with _mapcount
534 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 534 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
535 * field. 535 * field.
536 * So when we are allocating or freeing one, we can derive the state of the 536 * So when we are allocating or freeing one, we can derive the state of the
537 * other. That is, if we allocate a small block, and both were 537 * other. That is, if we allocate a small block, and both were
538 * free, the remainder of the region must be split into blocks. 538 * free, the remainder of the region must be split into blocks.
539 * If a block is freed, and its buddy is also free, then this 539 * If a block is freed, and its buddy is also free, then this
540 * triggers coalescing into a block of larger size. 540 * triggers coalescing into a block of larger size.
541 * 541 *
542 * -- nyc 542 * -- nyc
543 */ 543 */
544 544
545 static inline void __free_one_page(struct page *page, 545 static inline void __free_one_page(struct page *page,
546 struct zone *zone, unsigned int order, 546 struct zone *zone, unsigned int order,
547 int migratetype) 547 int migratetype)
548 { 548 {
549 unsigned long page_idx; 549 unsigned long page_idx;
550 unsigned long combined_idx; 550 unsigned long combined_idx;
551 unsigned long uninitialized_var(buddy_idx); 551 unsigned long uninitialized_var(buddy_idx);
552 struct page *buddy; 552 struct page *buddy;
553 553
554 VM_BUG_ON(!zone_is_initialized(zone)); 554 VM_BUG_ON(!zone_is_initialized(zone));
555 555
556 if (unlikely(PageCompound(page))) 556 if (unlikely(PageCompound(page)))
557 if (unlikely(destroy_compound_page(page, order))) 557 if (unlikely(destroy_compound_page(page, order)))
558 return; 558 return;
559 559
560 VM_BUG_ON(migratetype == -1); 560 VM_BUG_ON(migratetype == -1);
561 561
562 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 562 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
563 563
564 VM_BUG_ON(page_idx & ((1 << order) - 1)); 564 VM_BUG_ON(page_idx & ((1 << order) - 1));
565 VM_BUG_ON(bad_range(zone, page)); 565 VM_BUG_ON(bad_range(zone, page));
566 566
567 while (order < MAX_ORDER-1) { 567 while (order < MAX_ORDER-1) {
568 buddy_idx = __find_buddy_index(page_idx, order); 568 buddy_idx = __find_buddy_index(page_idx, order);
569 buddy = page + (buddy_idx - page_idx); 569 buddy = page + (buddy_idx - page_idx);
570 if (!page_is_buddy(page, buddy, order)) 570 if (!page_is_buddy(page, buddy, order))
571 break; 571 break;
572 /* 572 /*
573 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 573 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
574 * merge with it and move up one order. 574 * merge with it and move up one order.
575 */ 575 */
576 if (page_is_guard(buddy)) { 576 if (page_is_guard(buddy)) {
577 clear_page_guard_flag(buddy); 577 clear_page_guard_flag(buddy);
578 set_page_private(page, 0); 578 set_page_private(page, 0);
579 __mod_zone_freepage_state(zone, 1 << order, 579 __mod_zone_freepage_state(zone, 1 << order,
580 migratetype); 580 migratetype);
581 } else { 581 } else {
582 list_del(&buddy->lru); 582 list_del(&buddy->lru);
583 zone->free_area[order].nr_free--; 583 zone->free_area[order].nr_free--;
584 rmv_page_order(buddy); 584 rmv_page_order(buddy);
585 } 585 }
586 combined_idx = buddy_idx & page_idx; 586 combined_idx = buddy_idx & page_idx;
587 page = page + (combined_idx - page_idx); 587 page = page + (combined_idx - page_idx);
588 page_idx = combined_idx; 588 page_idx = combined_idx;
589 order++; 589 order++;
590 } 590 }
591 set_page_order(page, order); 591 set_page_order(page, order);
592 592
593 /* 593 /*
594 * If this is not the largest possible page, check if the buddy 594 * If this is not the largest possible page, check if the buddy
595 * of the next-highest order is free. If it is, it's possible 595 * of the next-highest order is free. If it is, it's possible
596 * that pages are being freed that will coalesce soon. In case, 596 * that pages are being freed that will coalesce soon. In case,
597 * that is happening, add the free page to the tail of the list 597 * that is happening, add the free page to the tail of the list
598 * so it's less likely to be used soon and more likely to be merged 598 * so it's less likely to be used soon and more likely to be merged
599 * as a higher order page 599 * as a higher order page
600 */ 600 */
601 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 601 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
602 struct page *higher_page, *higher_buddy; 602 struct page *higher_page, *higher_buddy;
603 combined_idx = buddy_idx & page_idx; 603 combined_idx = buddy_idx & page_idx;
604 higher_page = page + (combined_idx - page_idx); 604 higher_page = page + (combined_idx - page_idx);
605 buddy_idx = __find_buddy_index(combined_idx, order + 1); 605 buddy_idx = __find_buddy_index(combined_idx, order + 1);
606 higher_buddy = higher_page + (buddy_idx - combined_idx); 606 higher_buddy = higher_page + (buddy_idx - combined_idx);
607 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 607 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
608 list_add_tail(&page->lru, 608 list_add_tail(&page->lru,
609 &zone->free_area[order].free_list[migratetype]); 609 &zone->free_area[order].free_list[migratetype]);
610 goto out; 610 goto out;
611 } 611 }
612 } 612 }
613 613
614 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 614 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
615 out: 615 out:
616 zone->free_area[order].nr_free++; 616 zone->free_area[order].nr_free++;
617 } 617 }
618 618
619 static inline int free_pages_check(struct page *page) 619 static inline int free_pages_check(struct page *page)
620 { 620 {
621 if (unlikely(page_mapcount(page) | 621 if (unlikely(page_mapcount(page) |
622 (page->mapping != NULL) | 622 (page->mapping != NULL) |
623 (atomic_read(&page->_count) != 0) | 623 (atomic_read(&page->_count) != 0) |
624 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 624 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
625 (mem_cgroup_bad_page_check(page)))) { 625 (mem_cgroup_bad_page_check(page)))) {
626 bad_page(page); 626 bad_page(page);
627 return 1; 627 return 1;
628 } 628 }
629 page_cpupid_reset_last(page); 629 page_cpupid_reset_last(page);
630 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 630 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
631 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 631 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
632 return 0; 632 return 0;
633 } 633 }
634 634
635 /* 635 /*
636 * Frees a number of pages from the PCP lists 636 * Frees a number of pages from the PCP lists
637 * Assumes all pages on list are in same zone, and of same order. 637 * Assumes all pages on list are in same zone, and of same order.
638 * count is the number of pages to free. 638 * count is the number of pages to free.
639 * 639 *
640 * If the zone was previously in an "all pages pinned" state then look to 640 * If the zone was previously in an "all pages pinned" state then look to
641 * see if this freeing clears that state. 641 * see if this freeing clears that state.
642 * 642 *
643 * And clear the zone's pages_scanned counter, to hold off the "all pages are 643 * And clear the zone's pages_scanned counter, to hold off the "all pages are
644 * pinned" detection logic. 644 * pinned" detection logic.
645 */ 645 */
646 static void free_pcppages_bulk(struct zone *zone, int count, 646 static void free_pcppages_bulk(struct zone *zone, int count,
647 struct per_cpu_pages *pcp) 647 struct per_cpu_pages *pcp)
648 { 648 {
649 int migratetype = 0; 649 int migratetype = 0;
650 int batch_free = 0; 650 int batch_free = 0;
651 int to_free = count; 651 int to_free = count;
652 652
653 spin_lock(&zone->lock); 653 spin_lock(&zone->lock);
654 zone->pages_scanned = 0; 654 zone->pages_scanned = 0;
655 655
656 while (to_free) { 656 while (to_free) {
657 struct page *page; 657 struct page *page;
658 struct list_head *list; 658 struct list_head *list;
659 659
660 /* 660 /*
661 * Remove pages from lists in a round-robin fashion. A 661 * Remove pages from lists in a round-robin fashion. A
662 * batch_free count is maintained that is incremented when an 662 * batch_free count is maintained that is incremented when an
663 * empty list is encountered. This is so more pages are freed 663 * empty list is encountered. This is so more pages are freed
664 * off fuller lists instead of spinning excessively around empty 664 * off fuller lists instead of spinning excessively around empty
665 * lists 665 * lists
666 */ 666 */
667 do { 667 do {
668 batch_free++; 668 batch_free++;
669 if (++migratetype == MIGRATE_PCPTYPES) 669 if (++migratetype == MIGRATE_PCPTYPES)
670 migratetype = 0; 670 migratetype = 0;
671 list = &pcp->lists[migratetype]; 671 list = &pcp->lists[migratetype];
672 } while (list_empty(list)); 672 } while (list_empty(list));
673 673
674 /* This is the only non-empty list. Free them all. */ 674 /* This is the only non-empty list. Free them all. */
675 if (batch_free == MIGRATE_PCPTYPES) 675 if (batch_free == MIGRATE_PCPTYPES)
676 batch_free = to_free; 676 batch_free = to_free;
677 677
678 do { 678 do {
679 int mt; /* migratetype of the to-be-freed page */ 679 int mt; /* migratetype of the to-be-freed page */
680 680
681 page = list_entry(list->prev, struct page, lru); 681 page = list_entry(list->prev, struct page, lru);
682 /* must delete as __free_one_page list manipulates */ 682 /* must delete as __free_one_page list manipulates */
683 list_del(&page->lru); 683 list_del(&page->lru);
684 mt = get_freepage_migratetype(page); 684 mt = get_freepage_migratetype(page);
685 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 685 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
686 __free_one_page(page, zone, 0, mt); 686 __free_one_page(page, zone, 0, mt);
687 trace_mm_page_pcpu_drain(page, 0, mt); 687 trace_mm_page_pcpu_drain(page, 0, mt);
688 if (likely(!is_migrate_isolate_page(page))) { 688 if (likely(!is_migrate_isolate_page(page))) {
689 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 689 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
690 if (is_migrate_cma(mt)) 690 if (is_migrate_cma(mt))
691 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 691 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
692 } 692 }
693 } while (--to_free && --batch_free && !list_empty(list)); 693 } while (--to_free && --batch_free && !list_empty(list));
694 } 694 }
695 spin_unlock(&zone->lock); 695 spin_unlock(&zone->lock);
696 } 696 }
697 697
698 static void free_one_page(struct zone *zone, struct page *page, int order, 698 static void free_one_page(struct zone *zone, struct page *page, int order,
699 int migratetype) 699 int migratetype)
700 { 700 {
701 spin_lock(&zone->lock); 701 spin_lock(&zone->lock);
702 zone->pages_scanned = 0; 702 zone->pages_scanned = 0;
703 703
704 __free_one_page(page, zone, order, migratetype); 704 __free_one_page(page, zone, order, migratetype);
705 if (unlikely(!is_migrate_isolate(migratetype))) 705 if (unlikely(!is_migrate_isolate(migratetype)))
706 __mod_zone_freepage_state(zone, 1 << order, migratetype); 706 __mod_zone_freepage_state(zone, 1 << order, migratetype);
707 spin_unlock(&zone->lock); 707 spin_unlock(&zone->lock);
708 } 708 }
709 709
710 static bool free_pages_prepare(struct page *page, unsigned int order) 710 static bool free_pages_prepare(struct page *page, unsigned int order)
711 { 711 {
712 int i; 712 int i;
713 int bad = 0; 713 int bad = 0;
714 714
715 trace_mm_page_free(page, order); 715 trace_mm_page_free(page, order);
716 kmemcheck_free_shadow(page, order); 716 kmemcheck_free_shadow(page, order);
717 717
718 if (PageAnon(page)) 718 if (PageAnon(page))
719 page->mapping = NULL; 719 page->mapping = NULL;
720 for (i = 0; i < (1 << order); i++) 720 for (i = 0; i < (1 << order); i++)
721 bad += free_pages_check(page + i); 721 bad += free_pages_check(page + i);
722 if (bad) 722 if (bad)
723 return false; 723 return false;
724 724
725 if (!PageHighMem(page)) { 725 if (!PageHighMem(page)) {
726 debug_check_no_locks_freed(page_address(page), 726 debug_check_no_locks_freed(page_address(page),
727 PAGE_SIZE << order); 727 PAGE_SIZE << order);
728 debug_check_no_obj_freed(page_address(page), 728 debug_check_no_obj_freed(page_address(page),
729 PAGE_SIZE << order); 729 PAGE_SIZE << order);
730 } 730 }
731 arch_free_page(page, order); 731 arch_free_page(page, order);
732 kernel_map_pages(page, 1 << order, 0); 732 kernel_map_pages(page, 1 << order, 0);
733 733
734 return true; 734 return true;
735 } 735 }
736 736
737 static void __free_pages_ok(struct page *page, unsigned int order) 737 static void __free_pages_ok(struct page *page, unsigned int order)
738 { 738 {
739 unsigned long flags; 739 unsigned long flags;
740 int migratetype; 740 int migratetype;
741 741
742 if (!free_pages_prepare(page, order)) 742 if (!free_pages_prepare(page, order))
743 return; 743 return;
744 744
745 local_irq_save(flags); 745 local_irq_save(flags);
746 __count_vm_events(PGFREE, 1 << order); 746 __count_vm_events(PGFREE, 1 << order);
747 migratetype = get_pageblock_migratetype(page); 747 migratetype = get_pageblock_migratetype(page);
748 set_freepage_migratetype(page, migratetype); 748 set_freepage_migratetype(page, migratetype);
749 free_one_page(page_zone(page), page, order, migratetype); 749 free_one_page(page_zone(page), page, order, migratetype);
750 local_irq_restore(flags); 750 local_irq_restore(flags);
751 } 751 }
752 752
753 void __init __free_pages_bootmem(struct page *page, unsigned int order) 753 void __init __free_pages_bootmem(struct page *page, unsigned int order)
754 { 754 {
755 unsigned int nr_pages = 1 << order; 755 unsigned int nr_pages = 1 << order;
756 struct page *p = page; 756 struct page *p = page;
757 unsigned int loop; 757 unsigned int loop;
758 758
759 prefetchw(p); 759 prefetchw(p);
760 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 760 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
761 prefetchw(p + 1); 761 prefetchw(p + 1);
762 __ClearPageReserved(p); 762 __ClearPageReserved(p);
763 set_page_count(p, 0); 763 set_page_count(p, 0);
764 } 764 }
765 __ClearPageReserved(p); 765 __ClearPageReserved(p);
766 set_page_count(p, 0); 766 set_page_count(p, 0);
767 767
768 page_zone(page)->managed_pages += nr_pages; 768 page_zone(page)->managed_pages += nr_pages;
769 set_page_refcounted(page); 769 set_page_refcounted(page);
770 __free_pages(page, order); 770 __free_pages(page, order);
771 } 771 }
772 772
773 #ifdef CONFIG_CMA 773 #ifdef CONFIG_CMA
774 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 774 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
775 void __init init_cma_reserved_pageblock(struct page *page) 775 void __init init_cma_reserved_pageblock(struct page *page)
776 { 776 {
777 unsigned i = pageblock_nr_pages; 777 unsigned i = pageblock_nr_pages;
778 struct page *p = page; 778 struct page *p = page;
779 779
780 do { 780 do {
781 __ClearPageReserved(p); 781 __ClearPageReserved(p);
782 set_page_count(p, 0); 782 set_page_count(p, 0);
783 } while (++p, --i); 783 } while (++p, --i);
784 784
785 set_page_refcounted(page); 785 set_page_refcounted(page);
786 set_pageblock_migratetype(page, MIGRATE_CMA); 786 set_pageblock_migratetype(page, MIGRATE_CMA);
787 __free_pages(page, pageblock_order); 787 __free_pages(page, pageblock_order);
788 adjust_managed_page_count(page, pageblock_nr_pages); 788 adjust_managed_page_count(page, pageblock_nr_pages);
789 } 789 }
790 #endif 790 #endif
791 791
792 /* 792 /*
793 * The order of subdivision here is critical for the IO subsystem. 793 * The order of subdivision here is critical for the IO subsystem.
794 * Please do not alter this order without good reasons and regression 794 * Please do not alter this order without good reasons and regression
795 * testing. Specifically, as large blocks of memory are subdivided, 795 * testing. Specifically, as large blocks of memory are subdivided,
796 * the order in which smaller blocks are delivered depends on the order 796 * the order in which smaller blocks are delivered depends on the order
797 * they're subdivided in this function. This is the primary factor 797 * they're subdivided in this function. This is the primary factor
798 * influencing the order in which pages are delivered to the IO 798 * influencing the order in which pages are delivered to the IO
799 * subsystem according to empirical testing, and this is also justified 799 * subsystem according to empirical testing, and this is also justified
800 * by considering the behavior of a buddy system containing a single 800 * by considering the behavior of a buddy system containing a single
801 * large block of memory acted on by a series of small allocations. 801 * large block of memory acted on by a series of small allocations.
802 * This behavior is a critical factor in sglist merging's success. 802 * This behavior is a critical factor in sglist merging's success.
803 * 803 *
804 * -- nyc 804 * -- nyc
805 */ 805 */
806 static inline void expand(struct zone *zone, struct page *page, 806 static inline void expand(struct zone *zone, struct page *page,
807 int low, int high, struct free_area *area, 807 int low, int high, struct free_area *area,
808 int migratetype) 808 int migratetype)
809 { 809 {
810 unsigned long size = 1 << high; 810 unsigned long size = 1 << high;
811 811
812 while (high > low) { 812 while (high > low) {
813 area--; 813 area--;
814 high--; 814 high--;
815 size >>= 1; 815 size >>= 1;
816 VM_BUG_ON(bad_range(zone, &page[size])); 816 VM_BUG_ON(bad_range(zone, &page[size]));
817 817
818 #ifdef CONFIG_DEBUG_PAGEALLOC 818 #ifdef CONFIG_DEBUG_PAGEALLOC
819 if (high < debug_guardpage_minorder()) { 819 if (high < debug_guardpage_minorder()) {
820 /* 820 /*
821 * Mark as guard pages (or page), that will allow to 821 * Mark as guard pages (or page), that will allow to
822 * merge back to allocator when buddy will be freed. 822 * merge back to allocator when buddy will be freed.
823 * Corresponding page table entries will not be touched, 823 * Corresponding page table entries will not be touched,
824 * pages will stay not present in virtual address space 824 * pages will stay not present in virtual address space
825 */ 825 */
826 INIT_LIST_HEAD(&page[size].lru); 826 INIT_LIST_HEAD(&page[size].lru);
827 set_page_guard_flag(&page[size]); 827 set_page_guard_flag(&page[size]);
828 set_page_private(&page[size], high); 828 set_page_private(&page[size], high);
829 /* Guard pages are not available for any usage */ 829 /* Guard pages are not available for any usage */
830 __mod_zone_freepage_state(zone, -(1 << high), 830 __mod_zone_freepage_state(zone, -(1 << high),
831 migratetype); 831 migratetype);
832 continue; 832 continue;
833 } 833 }
834 #endif 834 #endif
835 list_add(&page[size].lru, &area->free_list[migratetype]); 835 list_add(&page[size].lru, &area->free_list[migratetype]);
836 area->nr_free++; 836 area->nr_free++;
837 set_page_order(&page[size], high); 837 set_page_order(&page[size], high);
838 } 838 }
839 } 839 }
840 840
841 /* 841 /*
842 * This page is about to be returned from the page allocator 842 * This page is about to be returned from the page allocator
843 */ 843 */
844 static inline int check_new_page(struct page *page) 844 static inline int check_new_page(struct page *page)
845 { 845 {
846 if (unlikely(page_mapcount(page) | 846 if (unlikely(page_mapcount(page) |
847 (page->mapping != NULL) | 847 (page->mapping != NULL) |
848 (atomic_read(&page->_count) != 0) | 848 (atomic_read(&page->_count) != 0) |
849 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 849 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
850 (mem_cgroup_bad_page_check(page)))) { 850 (mem_cgroup_bad_page_check(page)))) {
851 bad_page(page); 851 bad_page(page);
852 return 1; 852 return 1;
853 } 853 }
854 return 0; 854 return 0;
855 } 855 }
856 856
857 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 857 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
858 { 858 {
859 int i; 859 int i;
860 860
861 for (i = 0; i < (1 << order); i++) { 861 for (i = 0; i < (1 << order); i++) {
862 struct page *p = page + i; 862 struct page *p = page + i;
863 if (unlikely(check_new_page(p))) 863 if (unlikely(check_new_page(p)))
864 return 1; 864 return 1;
865 } 865 }
866 866
867 set_page_private(page, 0); 867 set_page_private(page, 0);
868 set_page_refcounted(page); 868 set_page_refcounted(page);
869 869
870 arch_alloc_page(page, order); 870 arch_alloc_page(page, order);
871 kernel_map_pages(page, 1 << order, 1); 871 kernel_map_pages(page, 1 << order, 1);
872 872
873 if (gfp_flags & __GFP_ZERO) 873 if (gfp_flags & __GFP_ZERO)
874 prep_zero_page(page, order, gfp_flags); 874 prep_zero_page(page, order, gfp_flags);
875 875
876 if (order && (gfp_flags & __GFP_COMP)) 876 if (order && (gfp_flags & __GFP_COMP))
877 prep_compound_page(page, order); 877 prep_compound_page(page, order);
878 878
879 return 0; 879 return 0;
880 } 880 }
881 881
882 /* 882 /*
883 * Go through the free lists for the given migratetype and remove 883 * Go through the free lists for the given migratetype and remove
884 * the smallest available page from the freelists 884 * the smallest available page from the freelists
885 */ 885 */
886 static inline 886 static inline
887 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 887 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
888 int migratetype) 888 int migratetype)
889 { 889 {
890 unsigned int current_order; 890 unsigned int current_order;
891 struct free_area *area; 891 struct free_area *area;
892 struct page *page; 892 struct page *page;
893 893
894 /* Find a page of the appropriate size in the preferred list */ 894 /* Find a page of the appropriate size in the preferred list */
895 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 895 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
896 area = &(zone->free_area[current_order]); 896 area = &(zone->free_area[current_order]);
897 if (list_empty(&area->free_list[migratetype])) 897 if (list_empty(&area->free_list[migratetype]))
898 continue; 898 continue;
899 899
900 page = list_entry(area->free_list[migratetype].next, 900 page = list_entry(area->free_list[migratetype].next,
901 struct page, lru); 901 struct page, lru);
902 list_del(&page->lru); 902 list_del(&page->lru);
903 rmv_page_order(page); 903 rmv_page_order(page);
904 area->nr_free--; 904 area->nr_free--;
905 expand(zone, page, order, current_order, area, migratetype); 905 expand(zone, page, order, current_order, area, migratetype);
906 return page; 906 return page;
907 } 907 }
908 908
909 return NULL; 909 return NULL;
910 } 910 }
911 911
912 912
913 /* 913 /*
914 * This array describes the order lists are fallen back to when 914 * This array describes the order lists are fallen back to when
915 * the free lists for the desirable migrate type are depleted 915 * the free lists for the desirable migrate type are depleted
916 */ 916 */
917 static int fallbacks[MIGRATE_TYPES][4] = { 917 static int fallbacks[MIGRATE_TYPES][4] = {
918 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 918 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
919 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 919 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
920 #ifdef CONFIG_CMA 920 #ifdef CONFIG_CMA
921 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 921 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
922 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 922 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
923 #else 923 #else
924 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 924 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
925 #endif 925 #endif
926 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 926 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
927 #ifdef CONFIG_MEMORY_ISOLATION 927 #ifdef CONFIG_MEMORY_ISOLATION
928 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 928 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
929 #endif 929 #endif
930 }; 930 };
931 931
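The table above is consumed by the for (i = 0;; i++) scan in __rmqueue_fallback() further down, which stops as soon as it sees MIGRATE_RESERVE. A minimal stand-alone illustration of that walk (the enum is a simplified stand-in without CMA or ISOLATE; the authoritative definition lives in include/linux/mmzone.h):

/* Stand-alone illustration; simplified, not kernel code. */
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
       MIGRATE_RESERVE, MIGRATE_TYPES };

static const char *names[MIGRATE_TYPES] = {
	"UNMOVABLE", "RECLAIMABLE", "MOVABLE", "RESERVE"
};

static const int fallbacks[MIGRATE_TYPES][4] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE },	/* never used */
};

int main(void)
{
	int start = MIGRATE_UNMOVABLE, i, mt;

	printf("fallback order for %s:", names[start]);
	for (i = 0; (mt = fallbacks[start][i]) != MIGRATE_RESERVE; i++)
		printf(" %s", names[mt]);
	printf(" (MIGRATE_RESERVE itself is handled separately)\n");
	return 0;
}

MIGRATE_RESERVE terminates every row, which is why the walk can simply break when it reaches it and leave reserve pages to __rmqueue().
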
932 /* 932 /*
933 * Move the free pages in a range to the free lists of the requested type. 933 * Move the free pages in a range to the free lists of the requested type.
934 * Note that start_page and end_page are not necessarily aligned on a pageblock 934 * Note that start_page and end_page are not necessarily aligned on a pageblock
935 * boundary. If alignment is required, use move_freepages_block() 935 * boundary. If alignment is required, use move_freepages_block()
936 */ 936 */
937 int move_freepages(struct zone *zone, 937 int move_freepages(struct zone *zone,
938 struct page *start_page, struct page *end_page, 938 struct page *start_page, struct page *end_page,
939 int migratetype) 939 int migratetype)
940 { 940 {
941 struct page *page; 941 struct page *page;
942 unsigned long order; 942 unsigned long order;
943 int pages_moved = 0; 943 int pages_moved = 0;
944 944
945 #ifndef CONFIG_HOLES_IN_ZONE 945 #ifndef CONFIG_HOLES_IN_ZONE
946 /* 946 /*
947 * page_zone is not safe to call in this context when 947 * page_zone is not safe to call in this context when
948 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 948 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
949 * anyway as we check zone boundaries in move_freepages_block(). 949 * anyway as we check zone boundaries in move_freepages_block().
950 * Remove at a later date when no bug reports exist related to 950 * Remove at a later date when no bug reports exist related to
951 * grouping pages by mobility 951 * grouping pages by mobility
952 */ 952 */
953 BUG_ON(page_zone(start_page) != page_zone(end_page)); 953 BUG_ON(page_zone(start_page) != page_zone(end_page));
954 #endif 954 #endif
955 955
956 for (page = start_page; page <= end_page;) { 956 for (page = start_page; page <= end_page;) {
957 /* Make sure we are not inadvertently changing nodes */ 957 /* Make sure we are not inadvertently changing nodes */
958 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 958 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
959 959
960 if (!pfn_valid_within(page_to_pfn(page))) { 960 if (!pfn_valid_within(page_to_pfn(page))) {
961 page++; 961 page++;
962 continue; 962 continue;
963 } 963 }
964 964
965 if (!PageBuddy(page)) { 965 if (!PageBuddy(page)) {
966 page++; 966 page++;
967 continue; 967 continue;
968 } 968 }
969 969
970 order = page_order(page); 970 order = page_order(page);
971 list_move(&page->lru, 971 list_move(&page->lru,
972 &zone->free_area[order].free_list[migratetype]); 972 &zone->free_area[order].free_list[migratetype]);
973 set_freepage_migratetype(page, migratetype); 973 set_freepage_migratetype(page, migratetype);
974 page += 1 << order; 974 page += 1 << order;
975 pages_moved += 1 << order; 975 pages_moved += 1 << order;
976 } 976 }
977 977
978 return pages_moved; 978 return pages_moved;
979 } 979 }
980 980
981 int move_freepages_block(struct zone *zone, struct page *page, 981 int move_freepages_block(struct zone *zone, struct page *page,
982 int migratetype) 982 int migratetype)
983 { 983 {
984 unsigned long start_pfn, end_pfn; 984 unsigned long start_pfn, end_pfn;
985 struct page *start_page, *end_page; 985 struct page *start_page, *end_page;
986 986
987 start_pfn = page_to_pfn(page); 987 start_pfn = page_to_pfn(page);
988 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 988 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
989 start_page = pfn_to_page(start_pfn); 989 start_page = pfn_to_page(start_pfn);
990 end_page = start_page + pageblock_nr_pages - 1; 990 end_page = start_page + pageblock_nr_pages - 1;
991 end_pfn = start_pfn + pageblock_nr_pages - 1; 991 end_pfn = start_pfn + pageblock_nr_pages - 1;
992 992
993 /* Do not cross zone boundaries */ 993 /* Do not cross zone boundaries */
994 if (!zone_spans_pfn(zone, start_pfn)) 994 if (!zone_spans_pfn(zone, start_pfn))
995 start_page = page; 995 start_page = page;
996 if (!zone_spans_pfn(zone, end_pfn)) 996 if (!zone_spans_pfn(zone, end_pfn))
997 return 0; 997 return 0;
998 998
999 return move_freepages(zone, start_page, end_page, migratetype); 999 return move_freepages(zone, start_page, end_page, migratetype);
1000 } 1000 }
1001 1001
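The start_pfn/end_pfn computation above is plain round-down-to-a-pageblock arithmetic. A quick stand-alone check, assuming pageblock_nr_pages = 512 (pageblock_order = 9, one common configuration) and a made-up pfn:

/* Illustrative user-space arithmetic only; not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned long pageblock_nr_pages = 512;	/* assumes pageblock_order == 9 */
	unsigned long pfn = 262733;		/* arbitrary example pfn */
	unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
	unsigned long end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* 262733 & ~511 == 262656, so the block is [262656, 263167] */
	printf("pfn %lu lies in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
	return 0;
}
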
1002 static void change_pageblock_range(struct page *pageblock_page, 1002 static void change_pageblock_range(struct page *pageblock_page,
1003 int start_order, int migratetype) 1003 int start_order, int migratetype)
1004 { 1004 {
1005 int nr_pageblocks = 1 << (start_order - pageblock_order); 1005 int nr_pageblocks = 1 << (start_order - pageblock_order);
1006 1006
1007 while (nr_pageblocks--) { 1007 while (nr_pageblocks--) {
1008 set_pageblock_migratetype(pageblock_page, migratetype); 1008 set_pageblock_migratetype(pageblock_page, migratetype);
1009 pageblock_page += pageblock_nr_pages; 1009 pageblock_page += pageblock_nr_pages;
1010 } 1010 }
1011 } 1011 }
1012 1012
1013 /* 1013 /*
1014 * If breaking a large block of pages, move all free pages to the preferred 1014 * If breaking a large block of pages, move all free pages to the preferred
1015 * allocation list. If falling back for a reclaimable kernel allocation, be 1015 * allocation list. If falling back for a reclaimable kernel allocation, be
1016 * more aggressive about taking ownership of free pages. 1016 * more aggressive about taking ownership of free pages.
1017 * 1017 *
1018 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1018 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1019 * nor move CMA pages to different free lists. We don't want unmovable pages 1019 * nor move CMA pages to different free lists. We don't want unmovable pages
1020 * to be allocated from MIGRATE_CMA areas. 1020 * to be allocated from MIGRATE_CMA areas.
1021 * 1021 *
1022 * Returns the new migratetype of the pageblock (or the same old migratetype 1022 * Returns the new migratetype of the pageblock (or the same old migratetype
1023 * if it was unchanged). 1023 * if it was unchanged).
1024 */ 1024 */
1025 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1025 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1026 int start_type, int fallback_type) 1026 int start_type, int fallback_type)
1027 { 1027 {
1028 int current_order = page_order(page); 1028 int current_order = page_order(page);
1029 1029
1030 /*
1031 * When borrowing from MIGRATE_CMA, we need to release the excess
1032 * buddy pages to CMA itself.
1033 */
1030 if (is_migrate_cma(fallback_type)) 1034 if (is_migrate_cma(fallback_type))
1031 return fallback_type; 1035 return fallback_type;
1032 1036
1033 /* Take ownership for orders >= pageblock_order */ 1037 /* Take ownership for orders >= pageblock_order */
1034 if (current_order >= pageblock_order) { 1038 if (current_order >= pageblock_order) {
1035 change_pageblock_range(page, current_order, start_type); 1039 change_pageblock_range(page, current_order, start_type);
1036 return start_type; 1040 return start_type;
1037 } 1041 }
1038 1042
1039 if (current_order >= pageblock_order / 2 || 1043 if (current_order >= pageblock_order / 2 ||
1040 start_type == MIGRATE_RECLAIMABLE || 1044 start_type == MIGRATE_RECLAIMABLE ||
1041 page_group_by_mobility_disabled) { 1045 page_group_by_mobility_disabled) {
1042 int pages; 1046 int pages;
1043 1047
1044 pages = move_freepages_block(zone, page, start_type); 1048 pages = move_freepages_block(zone, page, start_type);
1045 1049
1046 /* Claim the whole block if over half of it is free */ 1050 /* Claim the whole block if over half of it is free */
1047 if (pages >= (1 << (pageblock_order-1)) || 1051 if (pages >= (1 << (pageblock_order-1)) ||
1048 page_group_by_mobility_disabled) { 1052 page_group_by_mobility_disabled) {
1049 1053
1050 set_pageblock_migratetype(page, start_type); 1054 set_pageblock_migratetype(page, start_type);
1051 return start_type; 1055 return start_type;
1052 } 1056 }
1053 1057
1054 } 1058 }
1055 1059
1056 return fallback_type; 1060 return fallback_type;
1057 } 1061 }
1058 1062
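A hedged stand-alone restatement of the policy above may help when reading the caller below. The thresholds are copied from the code; the MIGRATE_CMA early return and the page_group_by_mobility_disabled override are omitted, pageblock_order = 9 is an assumption, and free_pages_in_block stands in for the move_freepages_block() return value.

/* Stand-alone model of the stealing policy; not kernel code. */
#include <stdio.h>

#define PAGEBLOCK_ORDER 9	/* assumption; the real value is configuration dependent */

enum { UNMOVABLE, RECLAIMABLE, MOVABLE };

/*
 * Returns the migratetype the block (and hence the leftover buddies) should
 * end up with: start_type when the pageblock is claimed, otherwise the
 * fallback type it came from.
 */
static int steal_decision(int current_order, int start_type, int fallback_type,
			  int free_pages_in_block)
{
	if (current_order >= PAGEBLOCK_ORDER)
		return start_type;	/* whole pageblock(s): take ownership */

	if (current_order >= PAGEBLOCK_ORDER / 2 || start_type == RECLAIMABLE) {
		/* free pages were moved; claim the block if over half is free */
		if (free_pages_in_block >= (1 << (PAGEBLOCK_ORDER - 1)))
			return start_type;
	}
	return fallback_type;
}

int main(void)
{
	printf("order-8 UNMOVABLE fallback into MOVABLE, 300 pages free: %s\n",
	       steal_decision(8, UNMOVABLE, MOVABLE, 300) == UNMOVABLE ?
	       "block claimed as UNMOVABLE" : "block stays MOVABLE");
	printf("order-3 UNMOVABLE fallback into MOVABLE: %s\n",
	       steal_decision(3, UNMOVABLE, MOVABLE, 0) == UNMOVABLE ?
	       "block claimed as UNMOVABLE" : "block stays MOVABLE");
	return 0;
}

The value this returns is the new_type the caller now passes to expand(), which is the substance of the fix below.
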
1059 /* Remove an element from the buddy allocator from the fallback list */ 1063 /* Remove an element from the buddy allocator from the fallback list */
1060 static inline struct page * 1064 static inline struct page *
1061 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1065 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1062 { 1066 {
1063 struct free_area *area; 1067 struct free_area *area;
1064 int current_order; 1068 int current_order;
1065 struct page *page; 1069 struct page *page;
1066 int migratetype, new_type, i; 1070 int migratetype, new_type, i;
1067 1071
1068 /* Find the largest possible block of pages in the other list */ 1072 /* Find the largest possible block of pages in the other list */
1069 for (current_order = MAX_ORDER-1; current_order >= order; 1073 for (current_order = MAX_ORDER-1; current_order >= order;
1070 --current_order) { 1074 --current_order) {
1071 for (i = 0;; i++) { 1075 for (i = 0;; i++) {
1072 migratetype = fallbacks[start_migratetype][i]; 1076 migratetype = fallbacks[start_migratetype][i];
1073 1077
1074 /* MIGRATE_RESERVE handled later if necessary */ 1078 /* MIGRATE_RESERVE handled later if necessary */
1075 if (migratetype == MIGRATE_RESERVE) 1079 if (migratetype == MIGRATE_RESERVE)
1076 break; 1080 break;
1077 1081
1078 area = &(zone->free_area[current_order]); 1082 area = &(zone->free_area[current_order]);
1079 if (list_empty(&area->free_list[migratetype])) 1083 if (list_empty(&area->free_list[migratetype]))
1080 continue; 1084 continue;
1081 1085
1082 page = list_entry(area->free_list[migratetype].next, 1086 page = list_entry(area->free_list[migratetype].next,
1083 struct page, lru); 1087 struct page, lru);
1084 area->nr_free--; 1088 area->nr_free--;
1085 1089
1086 new_type = try_to_steal_freepages(zone, page, 1090 new_type = try_to_steal_freepages(zone, page,
1087 start_migratetype, 1091 start_migratetype,
1088 migratetype); 1092 migratetype);
1089 1093
1090 /* Remove the page from the freelists */ 1094 /* Remove the page from the freelists */
1091 list_del(&page->lru); 1095 list_del(&page->lru);
1092 rmv_page_order(page); 1096 rmv_page_order(page);
1093 1097
1094 /*
1095 * Borrow the excess buddy pages as well, irrespective
1096 * of whether we stole freepages, or took ownership of
1097 * the pageblock or not.
1098 *
1099 * Exception: When borrowing from MIGRATE_CMA, release
1100 * the excess buddy pages to CMA itself.
1101 */
1102 expand(zone, page, order, current_order, area, 1098 expand(zone, page, order, current_order, area,
1103 is_migrate_cma(migratetype) 1099 new_type);
1104 ? migratetype : start_migratetype);
1105 1100
1106 trace_mm_page_alloc_extfrag(page, order, current_order, 1101 trace_mm_page_alloc_extfrag(page, order, current_order,
1107 start_migratetype, migratetype, new_type); 1102 start_migratetype, migratetype, new_type);
1108 1103
1109 return page; 1104 return page;
1110 } 1105 }
1111 } 1106 }
1112 1107
1113 return NULL; 1108 return NULL;
1114 } 1109 }
1115 1110
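To make the one functional change above concrete, consider an illustrative scenario (pageblock_order = 9, grouping by mobility enabled, numbers made up): an order-1 MIGRATE_UNMOVABLE request falls back to an order-3 MIGRATE_MOVABLE block. try_to_steal_freepages() takes neither stealing branch (3 is below pageblock_order / 2 and the request is not RECLAIMABLE), so new_type remains MIGRATE_MOVABLE.

	/*
	 * Old call: the leftover order-1 and order-2 buddies always went to
	 * start_migratetype (UNMOVABLE here) unless the fallback was CMA.
	 */
	expand(zone, page, order, current_order, area,
	       is_migrate_cma(migratetype)
	     ? migratetype : start_migratetype);

	/*
	 * New call: they go back to whichever list new_type names, i.e. the
	 * MOVABLE free lists in this scenario.
	 */
	expand(zone, page, order, current_order, area,
	       new_type);

Returning the remainder to the list it was taken from keeps free pages in a movable pageblock available for movable allocations, rather than feeding them to unmovable ones.
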
1116 /* 1111 /*
1117 * Do the hard work of removing an element from the buddy allocator. 1112 * Do the hard work of removing an element from the buddy allocator.
1118 * Call me with the zone->lock already held. 1113 * Call me with the zone->lock already held.
1119 */ 1114 */
1120 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1115 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1121 int migratetype) 1116 int migratetype)
1122 { 1117 {
1123 struct page *page; 1118 struct page *page;
1124 1119
1125 retry_reserve: 1120 retry_reserve:
1126 page = __rmqueue_smallest(zone, order, migratetype); 1121 page = __rmqueue_smallest(zone, order, migratetype);
1127 1122
1128 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1123 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1129 page = __rmqueue_fallback(zone, order, migratetype); 1124 page = __rmqueue_fallback(zone, order, migratetype);
1130 1125
1131 /* 1126 /*
1132 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1127 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1133 * is used because __rmqueue_smallest is an inline function 1128 * is used because __rmqueue_smallest is an inline function
1134 * and we want just one call site 1129 * and we want just one call site
1135 */ 1130 */
1136 if (!page) { 1131 if (!page) {
1137 migratetype = MIGRATE_RESERVE; 1132 migratetype = MIGRATE_RESERVE;
1138 goto retry_reserve; 1133 goto retry_reserve;
1139 } 1134 }
1140 } 1135 }
1141 1136
1142 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1137 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1143 return page; 1138 return page;
1144 } 1139 }
1145 1140
1146 /* 1141 /*
1147 * Obtain a specified number of elements from the buddy allocator, all under 1142 * Obtain a specified number of elements from the buddy allocator, all under
1148 * a single hold of the lock, for efficiency. Add them to the supplied list. 1143 * a single hold of the lock, for efficiency. Add them to the supplied list.
1149 * Returns the number of new pages which were placed at *list. 1144 * Returns the number of new pages which were placed at *list.
1150 */ 1145 */
1151 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1146 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1152 unsigned long count, struct list_head *list, 1147 unsigned long count, struct list_head *list,
1153 int migratetype, int cold) 1148 int migratetype, int cold)
1154 { 1149 {
1155 int mt = migratetype, i; 1150 int mt = migratetype, i;
1156 1151
1157 spin_lock(&zone->lock); 1152 spin_lock(&zone->lock);
1158 for (i = 0; i < count; ++i) { 1153 for (i = 0; i < count; ++i) {
1159 struct page *page = __rmqueue(zone, order, migratetype); 1154 struct page *page = __rmqueue(zone, order, migratetype);
1160 if (unlikely(page == NULL)) 1155 if (unlikely(page == NULL))
1161 break; 1156 break;
1162 1157
1163 /* 1158 /*
1164 * Split buddy pages returned by expand() are received here 1159 * Split buddy pages returned by expand() are received here
1165 * in physical page order. The page is added to the caller's 1160 * in physical page order. The page is added to the caller's
1166 * list and the list head then moves forward. From the caller's 1161 * list and the list head then moves forward. From the caller's
1167 * perspective, the linked list is ordered by page number in 1162 * perspective, the linked list is ordered by page number in
1168 * some conditions. This is useful for IO devices that can 1163 * some conditions. This is useful for IO devices that can
1169 * merge IO requests if the physical pages are ordered 1164 * merge IO requests if the physical pages are ordered
1170 * properly. 1165 * properly.
1171 */ 1166 */
1172 if (likely(cold == 0)) 1167 if (likely(cold == 0))
1173 list_add(&page->lru, list); 1168 list_add(&page->lru, list);
1174 else 1169 else
1175 list_add_tail(&page->lru, list); 1170 list_add_tail(&page->lru, list);
1176 if (IS_ENABLED(CONFIG_CMA)) { 1171 if (IS_ENABLED(CONFIG_CMA)) {
1177 mt = get_pageblock_migratetype(page); 1172 mt = get_pageblock_migratetype(page);
1178 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) 1173 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1179 mt = migratetype; 1174 mt = migratetype;
1180 } 1175 }
1181 set_freepage_migratetype(page, mt); 1176 set_freepage_migratetype(page, mt);
1182 list = &page->lru; 1177 list = &page->lru;
1183 if (is_migrate_cma(mt)) 1178 if (is_migrate_cma(mt))
1184 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1179 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1185 -(1 << order)); 1180 -(1 << order));
1186 } 1181 }
1187 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1182 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1188 spin_unlock(&zone->lock); 1183 spin_unlock(&zone->lock);
1189 return i; 1184 return i;
1190 } 1185 }
1191 1186
1192 #ifdef CONFIG_NUMA 1187 #ifdef CONFIG_NUMA
1193 /* 1188 /*
1194 * Called from the vmstat counter updater to drain pagesets of this 1189 * Called from the vmstat counter updater to drain pagesets of this
1195 * currently executing processor on remote nodes after they have 1190 * currently executing processor on remote nodes after they have
1196 * expired. 1191 * expired.
1197 * 1192 *
1198 * Note that this function must be called with the thread pinned to 1193 * Note that this function must be called with the thread pinned to
1199 * a single processor. 1194 * a single processor.
1200 */ 1195 */
1201 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1196 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1202 { 1197 {
1203 unsigned long flags; 1198 unsigned long flags;
1204 int to_drain; 1199 int to_drain;
1205 unsigned long batch; 1200 unsigned long batch;
1206 1201
1207 local_irq_save(flags); 1202 local_irq_save(flags);
1208 batch = ACCESS_ONCE(pcp->batch); 1203 batch = ACCESS_ONCE(pcp->batch);
1209 if (pcp->count >= batch) 1204 if (pcp->count >= batch)
1210 to_drain = batch; 1205 to_drain = batch;
1211 else 1206 else
1212 to_drain = pcp->count; 1207 to_drain = pcp->count;
1213 if (to_drain > 0) { 1208 if (to_drain > 0) {
1214 free_pcppages_bulk(zone, to_drain, pcp); 1209 free_pcppages_bulk(zone, to_drain, pcp);
1215 pcp->count -= to_drain; 1210 pcp->count -= to_drain;
1216 } 1211 }
1217 local_irq_restore(flags); 1212 local_irq_restore(flags);
1218 } 1213 }
1219 #endif 1214 #endif
1220 1215
1221 /* 1216 /*
1222 * Drain pages of the indicated processor. 1217 * Drain pages of the indicated processor.
1223 * 1218 *
1224 * The processor must either be the current processor and the 1219 * The processor must either be the current processor and the
1225 * thread pinned to the current processor or a processor that 1220 * thread pinned to the current processor or a processor that
1226 * is not online. 1221 * is not online.
1227 */ 1222 */
1228 static void drain_pages(unsigned int cpu) 1223 static void drain_pages(unsigned int cpu)
1229 { 1224 {
1230 unsigned long flags; 1225 unsigned long flags;
1231 struct zone *zone; 1226 struct zone *zone;
1232 1227
1233 for_each_populated_zone(zone) { 1228 for_each_populated_zone(zone) {
1234 struct per_cpu_pageset *pset; 1229 struct per_cpu_pageset *pset;
1235 struct per_cpu_pages *pcp; 1230 struct per_cpu_pages *pcp;
1236 1231
1237 local_irq_save(flags); 1232 local_irq_save(flags);
1238 pset = per_cpu_ptr(zone->pageset, cpu); 1233 pset = per_cpu_ptr(zone->pageset, cpu);
1239 1234
1240 pcp = &pset->pcp; 1235 pcp = &pset->pcp;
1241 if (pcp->count) { 1236 if (pcp->count) {
1242 free_pcppages_bulk(zone, pcp->count, pcp); 1237 free_pcppages_bulk(zone, pcp->count, pcp);
1243 pcp->count = 0; 1238 pcp->count = 0;
1244 } 1239 }
1245 local_irq_restore(flags); 1240 local_irq_restore(flags);
1246 } 1241 }
1247 } 1242 }
1248 1243
1249 /* 1244 /*
1250 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1245 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1251 */ 1246 */
1252 void drain_local_pages(void *arg) 1247 void drain_local_pages(void *arg)
1253 { 1248 {
1254 drain_pages(smp_processor_id()); 1249 drain_pages(smp_processor_id());
1255 } 1250 }
1256 1251
1257 /* 1252 /*
1258 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1253 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1259 * 1254 *
1260 * Note that this code is protected against sending an IPI to an offline 1255 * Note that this code is protected against sending an IPI to an offline
1261 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1256 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1262 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1257 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1263 * nothing keeps CPUs from showing up after we populated the cpumask and 1258 * nothing keeps CPUs from showing up after we populated the cpumask and
1264 * before the call to on_each_cpu_mask(). 1259 * before the call to on_each_cpu_mask().
1265 */ 1260 */
1266 void drain_all_pages(void) 1261 void drain_all_pages(void)
1267 { 1262 {
1268 int cpu; 1263 int cpu;
1269 struct per_cpu_pageset *pcp; 1264 struct per_cpu_pageset *pcp;
1270 struct zone *zone; 1265 struct zone *zone;
1271 1266
1272 /* 1267 /*
1273 * Allocate in the BSS so we won't require allocation in 1268 * Allocate in the BSS so we won't require allocation in
1274 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1269 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1275 */ 1270 */
1276 static cpumask_t cpus_with_pcps; 1271 static cpumask_t cpus_with_pcps;
1277 1272
1278 /* 1273 /*
1279 * We don't care about racing with CPU hotplug events 1274 * We don't care about racing with CPU hotplug events
1280 * as offline notification will cause the notified 1275 * as offline notification will cause the notified
1281 * cpu to drain that CPU's pcps and on_each_cpu_mask 1276 * cpu to drain that CPU's pcps and on_each_cpu_mask
1282 * disables preemption as part of its processing 1277 * disables preemption as part of its processing
1283 */ 1278 */
1284 for_each_online_cpu(cpu) { 1279 for_each_online_cpu(cpu) {
1285 bool has_pcps = false; 1280 bool has_pcps = false;
1286 for_each_populated_zone(zone) { 1281 for_each_populated_zone(zone) {
1287 pcp = per_cpu_ptr(zone->pageset, cpu); 1282 pcp = per_cpu_ptr(zone->pageset, cpu);
1288 if (pcp->pcp.count) { 1283 if (pcp->pcp.count) {
1289 has_pcps = true; 1284 has_pcps = true;
1290 break; 1285 break;
1291 } 1286 }
1292 } 1287 }
1293 if (has_pcps) 1288 if (has_pcps)
1294 cpumask_set_cpu(cpu, &cpus_with_pcps); 1289 cpumask_set_cpu(cpu, &cpus_with_pcps);
1295 else 1290 else
1296 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1291 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1297 } 1292 }
1298 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1293 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1299 } 1294 }
1300 1295
1301 #ifdef CONFIG_HIBERNATION 1296 #ifdef CONFIG_HIBERNATION
1302 1297
1303 void mark_free_pages(struct zone *zone) 1298 void mark_free_pages(struct zone *zone)
1304 { 1299 {
1305 unsigned long pfn, max_zone_pfn; 1300 unsigned long pfn, max_zone_pfn;
1306 unsigned long flags; 1301 unsigned long flags;
1307 int order, t; 1302 int order, t;
1308 struct list_head *curr; 1303 struct list_head *curr;
1309 1304
1310 if (zone_is_empty(zone)) 1305 if (zone_is_empty(zone))
1311 return; 1306 return;
1312 1307
1313 spin_lock_irqsave(&zone->lock, flags); 1308 spin_lock_irqsave(&zone->lock, flags);
1314 1309
1315 max_zone_pfn = zone_end_pfn(zone); 1310 max_zone_pfn = zone_end_pfn(zone);
1316 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1311 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1317 if (pfn_valid(pfn)) { 1312 if (pfn_valid(pfn)) {
1318 struct page *page = pfn_to_page(pfn); 1313 struct page *page = pfn_to_page(pfn);
1319 1314
1320 if (!swsusp_page_is_forbidden(page)) 1315 if (!swsusp_page_is_forbidden(page))
1321 swsusp_unset_page_free(page); 1316 swsusp_unset_page_free(page);
1322 } 1317 }
1323 1318
1324 for_each_migratetype_order(order, t) { 1319 for_each_migratetype_order(order, t) {
1325 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1320 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1326 unsigned long i; 1321 unsigned long i;
1327 1322
1328 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1323 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1329 for (i = 0; i < (1UL << order); i++) 1324 for (i = 0; i < (1UL << order); i++)
1330 swsusp_set_page_free(pfn_to_page(pfn + i)); 1325 swsusp_set_page_free(pfn_to_page(pfn + i));
1331 } 1326 }
1332 } 1327 }
1333 spin_unlock_irqrestore(&zone->lock, flags); 1328 spin_unlock_irqrestore(&zone->lock, flags);
1334 } 1329 }
1335 #endif /* CONFIG_PM */ 1330 #endif /* CONFIG_PM */
1336 1331
1337 /* 1332 /*
1338 * Free a 0-order page 1333 * Free a 0-order page
1339 * cold == 1 ? free a cold page : free a hot page 1334 * cold == 1 ? free a cold page : free a hot page
1340 */ 1335 */
1341 void free_hot_cold_page(struct page *page, int cold) 1336 void free_hot_cold_page(struct page *page, int cold)
1342 { 1337 {
1343 struct zone *zone = page_zone(page); 1338 struct zone *zone = page_zone(page);
1344 struct per_cpu_pages *pcp; 1339 struct per_cpu_pages *pcp;
1345 unsigned long flags; 1340 unsigned long flags;
1346 int migratetype; 1341 int migratetype;
1347 1342
1348 if (!free_pages_prepare(page, 0)) 1343 if (!free_pages_prepare(page, 0))
1349 return; 1344 return;
1350 1345
1351 migratetype = get_pageblock_migratetype(page); 1346 migratetype = get_pageblock_migratetype(page);
1352 set_freepage_migratetype(page, migratetype); 1347 set_freepage_migratetype(page, migratetype);
1353 local_irq_save(flags); 1348 local_irq_save(flags);
1354 __count_vm_event(PGFREE); 1349 __count_vm_event(PGFREE);
1355 1350
1356 /* 1351 /*
1357 * We only track unmovable, reclaimable and movable on pcp lists. 1352 * We only track unmovable, reclaimable and movable on pcp lists.
1358 * Free ISOLATE pages back to the allocator because they are being 1353 * Free ISOLATE pages back to the allocator because they are being
1359 * offlined but treat RESERVE as movable pages so we can get those 1354 * offlined but treat RESERVE as movable pages so we can get those
1360 * areas back if necessary. Otherwise, we may have to free 1355 * areas back if necessary. Otherwise, we may have to free
1361 * excessively into the page allocator 1356 * excessively into the page allocator
1362 */ 1357 */
1363 if (migratetype >= MIGRATE_PCPTYPES) { 1358 if (migratetype >= MIGRATE_PCPTYPES) {
1364 if (unlikely(is_migrate_isolate(migratetype))) { 1359 if (unlikely(is_migrate_isolate(migratetype))) {
1365 free_one_page(zone, page, 0, migratetype); 1360 free_one_page(zone, page, 0, migratetype);
1366 goto out; 1361 goto out;
1367 } 1362 }
1368 migratetype = MIGRATE_MOVABLE; 1363 migratetype = MIGRATE_MOVABLE;
1369 } 1364 }
1370 1365
1371 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1366 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1372 if (cold) 1367 if (cold)
1373 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1368 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1374 else 1369 else
1375 list_add(&page->lru, &pcp->lists[migratetype]); 1370 list_add(&page->lru, &pcp->lists[migratetype]);
1376 pcp->count++; 1371 pcp->count++;
1377 if (pcp->count >= pcp->high) { 1372 if (pcp->count >= pcp->high) {
1378 unsigned long batch = ACCESS_ONCE(pcp->batch); 1373 unsigned long batch = ACCESS_ONCE(pcp->batch);
1379 free_pcppages_bulk(zone, batch, pcp); 1374 free_pcppages_bulk(zone, batch, pcp);
1380 pcp->count -= batch; 1375 pcp->count -= batch;
1381 } 1376 }
1382 1377
1383 out: 1378 out:
1384 local_irq_restore(flags); 1379 local_irq_restore(flags);
1385 } 1380 }
1386 1381
1387 /* 1382 /*
1388 * Free a list of 0-order pages 1383 * Free a list of 0-order pages
1389 */ 1384 */
1390 void free_hot_cold_page_list(struct list_head *list, int cold) 1385 void free_hot_cold_page_list(struct list_head *list, int cold)
1391 { 1386 {
1392 struct page *page, *next; 1387 struct page *page, *next;
1393 1388
1394 list_for_each_entry_safe(page, next, list, lru) { 1389 list_for_each_entry_safe(page, next, list, lru) {
1395 trace_mm_page_free_batched(page, cold); 1390 trace_mm_page_free_batched(page, cold);
1396 free_hot_cold_page(page, cold); 1391 free_hot_cold_page(page, cold);
1397 } 1392 }
1398 } 1393 }
1399 1394
1400 /* 1395 /*
1401 * split_page takes a non-compound higher-order page, and splits it into 1396 * split_page takes a non-compound higher-order page, and splits it into
1402 * n (1<<order) sub-pages: page[0..n-1] 1397 * n (1<<order) sub-pages: page[0..n-1]
1403 * Each sub-page must be freed individually. 1398 * Each sub-page must be freed individually.
1404 * 1399 *
1405 * Note: this is probably too low level an operation for use in drivers. 1400 * Note: this is probably too low level an operation for use in drivers.
1406 * Please consult with lkml before using this in your driver. 1401 * Please consult with lkml before using this in your driver.
1407 */ 1402 */
1408 void split_page(struct page *page, unsigned int order) 1403 void split_page(struct page *page, unsigned int order)
1409 { 1404 {
1410 int i; 1405 int i;
1411 1406
1412 VM_BUG_ON(PageCompound(page)); 1407 VM_BUG_ON(PageCompound(page));
1413 VM_BUG_ON(!page_count(page)); 1408 VM_BUG_ON(!page_count(page));
1414 1409
1415 #ifdef CONFIG_KMEMCHECK 1410 #ifdef CONFIG_KMEMCHECK
1416 /* 1411 /*
1417 * Split shadow pages too, because free(page[0]) would 1412 * Split shadow pages too, because free(page[0]) would
1418 * otherwise free the whole shadow. 1413 * otherwise free the whole shadow.
1419 */ 1414 */
1420 if (kmemcheck_page_is_tracked(page)) 1415 if (kmemcheck_page_is_tracked(page))
1421 split_page(virt_to_page(page[0].shadow), order); 1416 split_page(virt_to_page(page[0].shadow), order);
1422 #endif 1417 #endif
1423 1418
1424 for (i = 1; i < (1 << order); i++) 1419 for (i = 1; i < (1 << order); i++)
1425 set_page_refcounted(page + i); 1420 set_page_refcounted(page + i);
1426 } 1421 }
1427 EXPORT_SYMBOL_GPL(split_page); 1422 EXPORT_SYMBOL_GPL(split_page);
1428 1423
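A hedged sketch of the calling convention the comment above describes (kernel-context code; example_use_split_page is a made-up name, error handling is minimal, and this is illustrative rather than a recommended driver pattern): allocate a non-compound higher-order page, split it, and later free every sub-page on its own.

/* Illustrative sketch, not part of mm/page_alloc.c. */
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static int example_use_split_page(void)
{
	unsigned int order = 2;			/* four contiguous pages */
	struct page *page = alloc_pages(GFP_KERNEL, order);	/* no __GFP_COMP */
	int i;

	if (!page)
		return -ENOMEM;

	split_page(page, order);	/* page[0..3] now have independent refcounts */

	/* ... hand the sub-pages out; eventually each one is freed individually ... */
	for (i = 0; i < (1 << order); i++)
		__free_page(page + i);
	return 0;
}
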
1429 static int __isolate_free_page(struct page *page, unsigned int order) 1424 static int __isolate_free_page(struct page *page, unsigned int order)
1430 { 1425 {
1431 unsigned long watermark; 1426 unsigned long watermark;
1432 struct zone *zone; 1427 struct zone *zone;
1433 int mt; 1428 int mt;
1434 1429
1435 BUG_ON(!PageBuddy(page)); 1430 BUG_ON(!PageBuddy(page));
1436 1431
1437 zone = page_zone(page); 1432 zone = page_zone(page);
1438 mt = get_pageblock_migratetype(page); 1433 mt = get_pageblock_migratetype(page);
1439 1434
1440 if (!is_migrate_isolate(mt)) { 1435 if (!is_migrate_isolate(mt)) {
1441 /* Obey watermarks as if the page was being allocated */ 1436 /* Obey watermarks as if the page was being allocated */
1442 watermark = low_wmark_pages(zone) + (1 << order); 1437 watermark = low_wmark_pages(zone) + (1 << order);
1443 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1438 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1444 return 0; 1439 return 0;
1445 1440
1446 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1441 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1447 } 1442 }
1448 1443
1449 /* Remove page from free list */ 1444 /* Remove page from free list */
1450 list_del(&page->lru); 1445 list_del(&page->lru);
1451 zone->free_area[order].nr_free--; 1446 zone->free_area[order].nr_free--;
1452 rmv_page_order(page); 1447 rmv_page_order(page);
1453 1448
1454 /* Set the pageblock if the isolated page is at least a pageblock */ 1449 /* Set the pageblock if the isolated page is at least a pageblock */
1455 if (order >= pageblock_order - 1) { 1450 if (order >= pageblock_order - 1) {
1456 struct page *endpage = page + (1 << order) - 1; 1451 struct page *endpage = page + (1 << order) - 1;
1457 for (; page < endpage; page += pageblock_nr_pages) { 1452 for (; page < endpage; page += pageblock_nr_pages) {
1458 int mt = get_pageblock_migratetype(page); 1453 int mt = get_pageblock_migratetype(page);
1459 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1454 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1460 set_pageblock_migratetype(page, 1455 set_pageblock_migratetype(page,
1461 MIGRATE_MOVABLE); 1456 MIGRATE_MOVABLE);
1462 } 1457 }
1463 } 1458 }
1464 1459
1465 return 1UL << order; 1460 return 1UL << order;
1466 } 1461 }
1467 1462
1468 /* 1463 /*
1469 * Similar to split_page except the page is already free. As this is only 1464 * Similar to split_page except the page is already free. As this is only
1470 * being used for migration, the migratetype of the block also changes. 1465 * being used for migration, the migratetype of the block also changes.
1471 * As this is called with interrupts disabled, the caller is responsible 1466 * As this is called with interrupts disabled, the caller is responsible
1472 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1467 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1473 * are enabled. 1468 * are enabled.
1474 * 1469 *
1475 * Note: this is probably too low level an operation for use in drivers. 1470 * Note: this is probably too low level an operation for use in drivers.
1476 * Please consult with lkml before using this in your driver. 1471 * Please consult with lkml before using this in your driver.
1477 */ 1472 */
1478 int split_free_page(struct page *page) 1473 int split_free_page(struct page *page)
1479 { 1474 {
1480 unsigned int order; 1475 unsigned int order;
1481 int nr_pages; 1476 int nr_pages;
1482 1477
1483 order = page_order(page); 1478 order = page_order(page);
1484 1479
1485 nr_pages = __isolate_free_page(page, order); 1480 nr_pages = __isolate_free_page(page, order);
1486 if (!nr_pages) 1481 if (!nr_pages)
1487 return 0; 1482 return 0;
1488 1483
1489 /* Split into individual pages */ 1484 /* Split into individual pages */
1490 set_page_refcounted(page); 1485 set_page_refcounted(page);
1491 split_page(page, order); 1486 split_page(page, order);
1492 return nr_pages; 1487 return nr_pages;
1493 } 1488 }
1494 1489
1495 /* 1490 /*
1496 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1491 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1497 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1492 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1498 * or two. 1493 * or two.
1499 */ 1494 */
1500 static inline 1495 static inline
1501 struct page *buffered_rmqueue(struct zone *preferred_zone, 1496 struct page *buffered_rmqueue(struct zone *preferred_zone,
1502 struct zone *zone, int order, gfp_t gfp_flags, 1497 struct zone *zone, int order, gfp_t gfp_flags,
1503 int migratetype) 1498 int migratetype)
1504 { 1499 {
1505 unsigned long flags; 1500 unsigned long flags;
1506 struct page *page; 1501 struct page *page;
1507 int cold = !!(gfp_flags & __GFP_COLD); 1502 int cold = !!(gfp_flags & __GFP_COLD);
1508 1503
1509 again: 1504 again:
1510 if (likely(order == 0)) { 1505 if (likely(order == 0)) {
1511 struct per_cpu_pages *pcp; 1506 struct per_cpu_pages *pcp;
1512 struct list_head *list; 1507 struct list_head *list;
1513 1508
1514 local_irq_save(flags); 1509 local_irq_save(flags);
1515 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1510 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1516 list = &pcp->lists[migratetype]; 1511 list = &pcp->lists[migratetype];
1517 if (list_empty(list)) { 1512 if (list_empty(list)) {
1518 pcp->count += rmqueue_bulk(zone, 0, 1513 pcp->count += rmqueue_bulk(zone, 0,
1519 pcp->batch, list, 1514 pcp->batch, list,
1520 migratetype, cold); 1515 migratetype, cold);
1521 if (unlikely(list_empty(list))) 1516 if (unlikely(list_empty(list)))
1522 goto failed; 1517 goto failed;
1523 } 1518 }
1524 1519
1525 if (cold) 1520 if (cold)
1526 page = list_entry(list->prev, struct page, lru); 1521 page = list_entry(list->prev, struct page, lru);
1527 else 1522 else
1528 page = list_entry(list->next, struct page, lru); 1523 page = list_entry(list->next, struct page, lru);
1529 1524
1530 list_del(&page->lru); 1525 list_del(&page->lru);
1531 pcp->count--; 1526 pcp->count--;
1532 } else { 1527 } else {
1533 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1528 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1534 /* 1529 /*
1535 * __GFP_NOFAIL is not to be used in new code. 1530 * __GFP_NOFAIL is not to be used in new code.
1536 * 1531 *
1537 * All __GFP_NOFAIL callers should be fixed so that they 1532 * All __GFP_NOFAIL callers should be fixed so that they
1538 * properly detect and handle allocation failures. 1533 * properly detect and handle allocation failures.
1539 * 1534 *
1540 * We most definitely don't want callers attempting to 1535 * We most definitely don't want callers attempting to
1541 * allocate greater than order-1 page units with 1536 * allocate greater than order-1 page units with
1542 * __GFP_NOFAIL. 1537 * __GFP_NOFAIL.
1543 */ 1538 */
1544 WARN_ON_ONCE(order > 1); 1539 WARN_ON_ONCE(order > 1);
1545 } 1540 }
1546 spin_lock_irqsave(&zone->lock, flags); 1541 spin_lock_irqsave(&zone->lock, flags);
1547 page = __rmqueue(zone, order, migratetype); 1542 page = __rmqueue(zone, order, migratetype);
1548 spin_unlock(&zone->lock); 1543 spin_unlock(&zone->lock);
1549 if (!page) 1544 if (!page)
1550 goto failed; 1545 goto failed;
1551 __mod_zone_freepage_state(zone, -(1 << order), 1546 __mod_zone_freepage_state(zone, -(1 << order),
1552 get_pageblock_migratetype(page)); 1547 get_pageblock_migratetype(page));
1553 } 1548 }
1554 1549
1555 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1550 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1556 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1551 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1557 zone_statistics(preferred_zone, zone, gfp_flags); 1552 zone_statistics(preferred_zone, zone, gfp_flags);
1558 local_irq_restore(flags); 1553 local_irq_restore(flags);
1559 1554
1560 VM_BUG_ON(bad_range(zone, page)); 1555 VM_BUG_ON(bad_range(zone, page));
1561 if (prep_new_page(page, order, gfp_flags)) 1556 if (prep_new_page(page, order, gfp_flags))
1562 goto again; 1557 goto again;
1563 return page; 1558 return page;
1564 1559
1565 failed: 1560 failed:
1566 local_irq_restore(flags); 1561 local_irq_restore(flags);
1567 return NULL; 1562 return NULL;
1568 } 1563 }
1569 1564
1570 #ifdef CONFIG_FAIL_PAGE_ALLOC 1565 #ifdef CONFIG_FAIL_PAGE_ALLOC
1571 1566
1572 static struct { 1567 static struct {
1573 struct fault_attr attr; 1568 struct fault_attr attr;
1574 1569
1575 u32 ignore_gfp_highmem; 1570 u32 ignore_gfp_highmem;
1576 u32 ignore_gfp_wait; 1571 u32 ignore_gfp_wait;
1577 u32 min_order; 1572 u32 min_order;
1578 } fail_page_alloc = { 1573 } fail_page_alloc = {
1579 .attr = FAULT_ATTR_INITIALIZER, 1574 .attr = FAULT_ATTR_INITIALIZER,
1580 .ignore_gfp_wait = 1, 1575 .ignore_gfp_wait = 1,
1581 .ignore_gfp_highmem = 1, 1576 .ignore_gfp_highmem = 1,
1582 .min_order = 1, 1577 .min_order = 1,
1583 }; 1578 };
1584 1579
1585 static int __init setup_fail_page_alloc(char *str) 1580 static int __init setup_fail_page_alloc(char *str)
1586 { 1581 {
1587 return setup_fault_attr(&fail_page_alloc.attr, str); 1582 return setup_fault_attr(&fail_page_alloc.attr, str);
1588 } 1583 }
1589 __setup("fail_page_alloc=", setup_fail_page_alloc); 1584 __setup("fail_page_alloc=", setup_fail_page_alloc);
1590 1585
1591 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1586 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1592 { 1587 {
1593 if (order < fail_page_alloc.min_order) 1588 if (order < fail_page_alloc.min_order)
1594 return false; 1589 return false;
1595 if (gfp_mask & __GFP_NOFAIL) 1590 if (gfp_mask & __GFP_NOFAIL)
1596 return false; 1591 return false;
1597 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1592 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1598 return false; 1593 return false;
1599 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1594 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1600 return false; 1595 return false;
1601 1596
1602 return should_fail(&fail_page_alloc.attr, 1 << order); 1597 return should_fail(&fail_page_alloc.attr, 1 << order);
1603 } 1598 }
1604 1599
1605 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1600 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1606 1601
1607 static int __init fail_page_alloc_debugfs(void) 1602 static int __init fail_page_alloc_debugfs(void)
1608 { 1603 {
1609 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1604 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1610 struct dentry *dir; 1605 struct dentry *dir;
1611 1606
1612 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1607 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1613 &fail_page_alloc.attr); 1608 &fail_page_alloc.attr);
1614 if (IS_ERR(dir)) 1609 if (IS_ERR(dir))
1615 return PTR_ERR(dir); 1610 return PTR_ERR(dir);
1616 1611
1617 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1612 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1618 &fail_page_alloc.ignore_gfp_wait)) 1613 &fail_page_alloc.ignore_gfp_wait))
1619 goto fail; 1614 goto fail;
1620 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1615 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1621 &fail_page_alloc.ignore_gfp_highmem)) 1616 &fail_page_alloc.ignore_gfp_highmem))
1622 goto fail; 1617 goto fail;
1623 if (!debugfs_create_u32("min-order", mode, dir, 1618 if (!debugfs_create_u32("min-order", mode, dir,
1624 &fail_page_alloc.min_order)) 1619 &fail_page_alloc.min_order))
1625 goto fail; 1620 goto fail;
1626 1621
1627 return 0; 1622 return 0;
1628 fail: 1623 fail:
1629 debugfs_remove_recursive(dir); 1624 debugfs_remove_recursive(dir);
1630 1625
1631 return -ENOMEM; 1626 return -ENOMEM;
1632 } 1627 }
1633 1628
1634 late_initcall(fail_page_alloc_debugfs); 1629 late_initcall(fail_page_alloc_debugfs);
1635 1630
1636 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1631 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1637 1632
1638 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1633 #else /* CONFIG_FAIL_PAGE_ALLOC */
1639 1634
1640 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1635 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1641 { 1636 {
1642 return false; 1637 return false;
1643 } 1638 }
1644 1639
1645 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1640 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1646 1641
1647 /* 1642 /*
1648 * Return true if free pages are above 'mark'. This takes into account the order 1643 * Return true if free pages are above 'mark'. This takes into account the order
1649 * of the allocation. 1644 * of the allocation.
1650 */ 1645 */
1651 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1646 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1652 int classzone_idx, int alloc_flags, long free_pages) 1647 int classzone_idx, int alloc_flags, long free_pages)
1653 { 1648 {
1654 /* free_pages may go negative - that's OK */ 1649 /* free_pages may go negative - that's OK */
1655 long min = mark; 1650 long min = mark;
1656 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1651 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1657 int o; 1652 int o;
1658 long free_cma = 0; 1653 long free_cma = 0;
1659 1654
1660 free_pages -= (1 << order) - 1; 1655 free_pages -= (1 << order) - 1;
1661 if (alloc_flags & ALLOC_HIGH) 1656 if (alloc_flags & ALLOC_HIGH)
1662 min -= min / 2; 1657 min -= min / 2;
1663 if (alloc_flags & ALLOC_HARDER) 1658 if (alloc_flags & ALLOC_HARDER)
1664 min -= min / 4; 1659 min -= min / 4;
1665 #ifdef CONFIG_CMA 1660 #ifdef CONFIG_CMA
1666 /* If allocation can't use CMA areas don't use free CMA pages */ 1661 /* If allocation can't use CMA areas don't use free CMA pages */
1667 if (!(alloc_flags & ALLOC_CMA)) 1662 if (!(alloc_flags & ALLOC_CMA))
1668 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1663 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1669 #endif 1664 #endif
1670 1665
1671 if (free_pages - free_cma <= min + lowmem_reserve) 1666 if (free_pages - free_cma <= min + lowmem_reserve)
1672 return false; 1667 return false;
1673 for (o = 0; o < order; o++) { 1668 for (o = 0; o < order; o++) {
1674 /* At the next order, this order's pages become unavailable */ 1669 /* At the next order, this order's pages become unavailable */
1675 free_pages -= z->free_area[o].nr_free << o; 1670 free_pages -= z->free_area[o].nr_free << o;
1676 1671
1677 /* Require fewer higher order pages to be free */ 1672 /* Require fewer higher order pages to be free */
1678 min >>= 1; 1673 min >>= 1;
1679 1674
1680 if (free_pages <= min) 1675 if (free_pages <= min)
1681 return false; 1676 return false;
1682 } 1677 }
1683 return true; 1678 return true;
1684 } 1679 }
1685 1680
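A stand-alone model of the walk above, evaluated on made-up numbers (the ALLOC_HIGH/ALLOC_HARDER reductions and the CMA correction are left out; nr_free, the mark and the lowmem reserve are illustrative):

/* Illustrative user-space model of the watermark walk; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

static bool watermark_ok(int order, long mark, long lowmem_reserve,
			 long free_pages, const long *nr_free)
{
	long min = mark;
	int o;

	free_pages -= (1 << order) - 1;
	if (free_pages <= min + lowmem_reserve)
		return false;
	for (o = 0; o < order; o++) {
		/* blocks of order o cannot satisfy a request of this order */
		free_pages -= nr_free[o] << o;
		/* but fewer free pages are required at the higher orders */
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* made-up per-order free block counts for orders 0..3 */
	const long nr_free[4] = { 600, 100, 30, 5 };
	long free_pages = 600 * 1 + 100 * 2 + 30 * 4 + 5 * 8;	/* 960 pages */

	printf("order-3 request, mark 128: %s\n",
	       watermark_ok(3, 128, 0, free_pages, nr_free) ? "ok" : "below watermark");
	return 0;
}

The per-order subtraction is why a zone can pass an order-0 check yet fail a higher-order one with the same total number of free pages.
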
1686 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1681 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1687 int classzone_idx, int alloc_flags) 1682 int classzone_idx, int alloc_flags)
1688 { 1683 {
1689 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1684 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1690 zone_page_state(z, NR_FREE_PAGES)); 1685 zone_page_state(z, NR_FREE_PAGES));
1691 } 1686 }
1692 1687
1693 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1688 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1694 int classzone_idx, int alloc_flags) 1689 int classzone_idx, int alloc_flags)
1695 { 1690 {
1696 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1691 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1697 1692
1698 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1693 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1699 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1694 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1700 1695
1701 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1696 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1702 free_pages); 1697 free_pages);
1703 } 1698 }
1704 1699
1705 #ifdef CONFIG_NUMA 1700 #ifdef CONFIG_NUMA
1706 /* 1701 /*
1707 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1702 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1708 * skip over zones that are not allowed by the cpuset, or that have 1703 * skip over zones that are not allowed by the cpuset, or that have
1709 * been recently (in last second) found to be nearly full. See further 1704 * been recently (in last second) found to be nearly full. See further
1710 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1705 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1711 * that have to skip over a lot of full or unallowed zones. 1706 * that have to skip over a lot of full or unallowed zones.
1712 * 1707 *
1713 * If the zonelist cache is present in the passed in zonelist, then 1708 * If the zonelist cache is present in the passed in zonelist, then
1714 * returns a pointer to the allowed node mask (either the current 1709 * returns a pointer to the allowed node mask (either the current
1715 * task's mems_allowed, or node_states[N_MEMORY].) 1710 * task's mems_allowed, or node_states[N_MEMORY].)
1716 * 1711 *
1717 * If the zonelist cache is not available for this zonelist, does 1712 * If the zonelist cache is not available for this zonelist, does
1718 * nothing and returns NULL. 1713 * nothing and returns NULL.
1719 * 1714 *
1720 * If the fullzones BITMAP in the zonelist cache is stale (more than 1715 * If the fullzones BITMAP in the zonelist cache is stale (more than
1721 * a second since last zap'd) then we zap it out (clear its bits.) 1716 * a second since last zap'd) then we zap it out (clear its bits.)
1722 * 1717 *
1723 * We hold off even calling zlc_setup, until after we've checked the 1718 * We hold off even calling zlc_setup, until after we've checked the
1724 * first zone in the zonelist, on the theory that most allocations will 1719 * first zone in the zonelist, on the theory that most allocations will
1725 * be satisfied from that first zone, so best to examine that zone as 1720 * be satisfied from that first zone, so best to examine that zone as
1726 * quickly as we can. 1721 * quickly as we can.
1727 */ 1722 */
1728 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1723 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1729 { 1724 {
1730 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1725 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1731 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1726 nodemask_t *allowednodes; /* zonelist_cache approximation */
1732 1727
1733 zlc = zonelist->zlcache_ptr; 1728 zlc = zonelist->zlcache_ptr;
1734 if (!zlc) 1729 if (!zlc)
1735 return NULL; 1730 return NULL;
1736 1731
1737 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1732 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1738 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1733 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1739 zlc->last_full_zap = jiffies; 1734 zlc->last_full_zap = jiffies;
1740 } 1735 }
1741 1736
1742 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1737 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1743 &cpuset_current_mems_allowed : 1738 &cpuset_current_mems_allowed :
1744 &node_states[N_MEMORY]; 1739 &node_states[N_MEMORY];
1745 return allowednodes; 1740 return allowednodes;
1746 } 1741 }
1747 1742
1748 /* 1743 /*
1749 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1744 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1750 * if it is worth looking at further for free memory: 1745 * if it is worth looking at further for free memory:
1751 * 1) Check that the zone isn't thought to be full (doesn't have its 1746 * 1) Check that the zone isn't thought to be full (doesn't have its
1752 * bit set in the zonelist_cache fullzones BITMAP). 1747 * bit set in the zonelist_cache fullzones BITMAP).
1753 * 2) Check that the zones node (obtained from the zonelist_cache 1748 * 2) Check that the zones node (obtained from the zonelist_cache
1754 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1749 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1755 * Return true (non-zero) if zone is worth looking at further, or 1750 * Return true (non-zero) if zone is worth looking at further, or
1756 * else return false (zero) if it is not. 1751 * else return false (zero) if it is not.
1757 * 1752 *
1758 * This check -ignores- the distinction between various watermarks, 1753 * This check -ignores- the distinction between various watermarks,
1759 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1754 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1760 * found to be full for any variation of these watermarks, it will 1755 * found to be full for any variation of these watermarks, it will
1761 * be considered full for up to one second by all requests, unless 1756 * be considered full for up to one second by all requests, unless
1762 * we are so low on memory on all allowed nodes that we are forced 1757 * we are so low on memory on all allowed nodes that we are forced
1763 * into the second scan of the zonelist. 1758 * into the second scan of the zonelist.
1764 * 1759 *
1765 * In the second scan we ignore this zonelist cache and exactly 1760 * In the second scan we ignore this zonelist cache and exactly
1766 * apply the watermarks to all zones, even if it is slower to do so. 1761 * apply the watermarks to all zones, even if it is slower to do so.
1767 * We are low on memory in the second scan, and should leave no stone 1762 * We are low on memory in the second scan, and should leave no stone
1768 * unturned looking for a free page. 1763 * unturned looking for a free page.
1769 */ 1764 */
1770 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1765 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1771 nodemask_t *allowednodes) 1766 nodemask_t *allowednodes)
1772 { 1767 {
1773 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1768 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1774 int i; /* index of *z in zonelist zones */ 1769 int i; /* index of *z in zonelist zones */
1775 int n; /* node that zone *z is on */ 1770 int n; /* node that zone *z is on */
1776 1771
1777 zlc = zonelist->zlcache_ptr; 1772 zlc = zonelist->zlcache_ptr;
1778 if (!zlc) 1773 if (!zlc)
1779 return 1; 1774 return 1;
1780 1775
1781 i = z - zonelist->_zonerefs; 1776 i = z - zonelist->_zonerefs;
1782 n = zlc->z_to_n[i]; 1777 n = zlc->z_to_n[i];
1783 1778
1784 /* This zone is worth trying if it is allowed but not full */ 1779 /* This zone is worth trying if it is allowed but not full */
1785 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1780 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1786 } 1781 }
1787 1782
1788 /* 1783 /*
1789 * Given 'z' scanning a zonelist, set the corresponding bit in 1784 * Given 'z' scanning a zonelist, set the corresponding bit in
1790 * zlc->fullzones, so that subsequent attempts to allocate a page 1785 * zlc->fullzones, so that subsequent attempts to allocate a page
1791 * from that zone don't waste time re-examining it. 1786 * from that zone don't waste time re-examining it.
1792 */ 1787 */
1793 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1788 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1794 { 1789 {
1795 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1790 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1796 int i; /* index of *z in zonelist zones */ 1791 int i; /* index of *z in zonelist zones */
1797 1792
1798 zlc = zonelist->zlcache_ptr; 1793 zlc = zonelist->zlcache_ptr;
1799 if (!zlc) 1794 if (!zlc)
1800 return; 1795 return;
1801 1796
1802 i = z - zonelist->_zonerefs; 1797 i = z - zonelist->_zonerefs;
1803 1798
1804 set_bit(i, zlc->fullzones); 1799 set_bit(i, zlc->fullzones);
1805 } 1800 }
1806 1801
1807 /* 1802 /*
1808 * clear all zones full, called after direct reclaim makes progress so that 1803 * clear all zones full, called after direct reclaim makes progress so that
1809 * a zone that was recently full is not skipped over for up to a second 1804 * a zone that was recently full is not skipped over for up to a second
1810 */ 1805 */
1811 static void zlc_clear_zones_full(struct zonelist *zonelist) 1806 static void zlc_clear_zones_full(struct zonelist *zonelist)
1812 { 1807 {
1813 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1808 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1814 1809
1815 zlc = zonelist->zlcache_ptr; 1810 zlc = zonelist->zlcache_ptr;
1816 if (!zlc) 1811 if (!zlc)
1817 return; 1812 return;
1818 1813
1819 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1814 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1820 } 1815 }
1821 1816
1822 static bool zone_local(struct zone *local_zone, struct zone *zone) 1817 static bool zone_local(struct zone *local_zone, struct zone *zone)
1823 { 1818 {
1824 return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE; 1819 return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
1825 } 1820 }
1826 1821
1827 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1822 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1828 { 1823 {
1829 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1824 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1830 } 1825 }
1831 1826
1832 static void __paginginit init_zone_allows_reclaim(int nid) 1827 static void __paginginit init_zone_allows_reclaim(int nid)
1833 { 1828 {
1834 int i; 1829 int i;
1835 1830
1836 for_each_online_node(i) 1831 for_each_online_node(i)
1837 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1832 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1838 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1833 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1839 else 1834 else
1840 zone_reclaim_mode = 1; 1835 zone_reclaim_mode = 1;
1841 } 1836 }
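
init_zone_allows_reclaim() records, for each node, which other nodes are close enough (within RECLAIM_DISTANCE) to be acceptable zone_reclaim() targets, and switches zone_reclaim_mode on as soon as any node is farther away. A hedged sketch with an invented distance table:

/* Userspace sketch of the reclaim_nodes computation.  The distance matrix
 * and the RECLAIM_DISTANCE value below are made up for illustration. */
#include <stdbool.h>
#include <stdio.h>

#define NODES            3
#define RECLAIM_DISTANCE 30

static const int node_distance[NODES][NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

int main(void)
{
	int nid = 0;
	bool reclaim_nodes[NODES] = { false };
	int zone_reclaim_mode = 0;

	for (int i = 0; i < NODES; i++) {
		if (node_distance[nid][i] <= RECLAIM_DISTANCE)
			reclaim_nodes[i] = true;   /* node_set(i, ...)        */
		else
			zone_reclaim_mode = 1;     /* a remote node exists    */
	}

	for (int i = 0; i < NODES; i++)
		printf("node %d reclaim target: %d\n", i, reclaim_nodes[i]);
	printf("zone_reclaim_mode = %d\n", zone_reclaim_mode); /* 1: node 2 is far */
	return 0;
}
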
1842 1837
1843 #else /* CONFIG_NUMA */ 1838 #else /* CONFIG_NUMA */
1844 1839
1845 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1840 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1846 { 1841 {
1847 return NULL; 1842 return NULL;
1848 } 1843 }
1849 1844
1850 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1845 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1851 nodemask_t *allowednodes) 1846 nodemask_t *allowednodes)
1852 { 1847 {
1853 return 1; 1848 return 1;
1854 } 1849 }
1855 1850
1856 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1851 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1857 { 1852 {
1858 } 1853 }
1859 1854
1860 static void zlc_clear_zones_full(struct zonelist *zonelist) 1855 static void zlc_clear_zones_full(struct zonelist *zonelist)
1861 { 1856 {
1862 } 1857 }
1863 1858
1864 static bool zone_local(struct zone *local_zone, struct zone *zone) 1859 static bool zone_local(struct zone *local_zone, struct zone *zone)
1865 { 1860 {
1866 return true; 1861 return true;
1867 } 1862 }
1868 1863
1869 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1864 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1870 { 1865 {
1871 return true; 1866 return true;
1872 } 1867 }
1873 1868
1874 static inline void init_zone_allows_reclaim(int nid) 1869 static inline void init_zone_allows_reclaim(int nid)
1875 { 1870 {
1876 } 1871 }
1877 #endif /* CONFIG_NUMA */ 1872 #endif /* CONFIG_NUMA */
1878 1873
1879 /* 1874 /*
1880 * get_page_from_freelist goes through the zonelist trying to allocate 1875 * get_page_from_freelist goes through the zonelist trying to allocate
1881 * a page. 1876 * a page.
1882 */ 1877 */
1883 static struct page * 1878 static struct page *
1884 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1879 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1885 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1880 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1886 struct zone *preferred_zone, int migratetype) 1881 struct zone *preferred_zone, int migratetype)
1887 { 1882 {
1888 struct zoneref *z; 1883 struct zoneref *z;
1889 struct page *page = NULL; 1884 struct page *page = NULL;
1890 int classzone_idx; 1885 int classzone_idx;
1891 struct zone *zone; 1886 struct zone *zone;
1892 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1887 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1893 int zlc_active = 0; /* set if using zonelist_cache */ 1888 int zlc_active = 0; /* set if using zonelist_cache */
1894 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1889 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1895 1890
1896 classzone_idx = zone_idx(preferred_zone); 1891 classzone_idx = zone_idx(preferred_zone);
1897 zonelist_scan: 1892 zonelist_scan:
1898 /* 1893 /*
1899 * Scan zonelist, looking for a zone with enough free. 1894 * Scan zonelist, looking for a zone with enough free.
1900 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1895 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1901 */ 1896 */
1902 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1897 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1903 high_zoneidx, nodemask) { 1898 high_zoneidx, nodemask) {
1904 unsigned long mark; 1899 unsigned long mark;
1905 1900
1906 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1901 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1907 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1902 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1908 continue; 1903 continue;
1909 if ((alloc_flags & ALLOC_CPUSET) && 1904 if ((alloc_flags & ALLOC_CPUSET) &&
1910 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1905 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1911 continue; 1906 continue;
1912 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1907 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1913 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) 1908 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1914 goto try_this_zone; 1909 goto try_this_zone;
1915 /* 1910 /*
1916 * Distribute pages in proportion to the individual 1911 * Distribute pages in proportion to the individual
1917 * zone size to ensure fair page aging. The zone a 1912 * zone size to ensure fair page aging. The zone a
1918 * page was allocated in should have no effect on the 1913 * page was allocated in should have no effect on the
1919 * time the page has in memory before being reclaimed. 1914 * time the page has in memory before being reclaimed.
1920 * 1915 *
1921 * When zone_reclaim_mode is enabled, try to stay in 1916 * When zone_reclaim_mode is enabled, try to stay in
1922 * local zones in the fastpath. If that fails, the 1917 * local zones in the fastpath. If that fails, the
1923 * slowpath is entered, which will do another pass 1918 * slowpath is entered, which will do another pass
1924 * starting with the local zones, but ultimately fall 1919 * starting with the local zones, but ultimately fall
1925 * back to remote zones that do not partake in the 1920 * back to remote zones that do not partake in the
1926 * fairness round-robin cycle of this zonelist. 1921 * fairness round-robin cycle of this zonelist.
1927 */ 1922 */
1928 if (alloc_flags & ALLOC_WMARK_LOW) { 1923 if (alloc_flags & ALLOC_WMARK_LOW) {
1929 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1924 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1930 continue; 1925 continue;
1931 if (zone_reclaim_mode && 1926 if (zone_reclaim_mode &&
1932 !zone_local(preferred_zone, zone)) 1927 !zone_local(preferred_zone, zone))
1933 continue; 1928 continue;
1934 } 1929 }
1935 /* 1930 /*
1936 * When allocating a page cache page for writing, we 1931 * When allocating a page cache page for writing, we
1937 * want to get it from a zone that is within its dirty 1932 * want to get it from a zone that is within its dirty
1938 * limit, such that no single zone holds more than its 1933 * limit, such that no single zone holds more than its
1939 * proportional share of globally allowed dirty pages. 1934 * proportional share of globally allowed dirty pages.
1940 * The dirty limits take into account the zone's 1935 * The dirty limits take into account the zone's
1941 * lowmem reserves and high watermark so that kswapd 1936 * lowmem reserves and high watermark so that kswapd
1942 * should be able to balance it without having to 1937 * should be able to balance it without having to
1943 * write pages from its LRU list. 1938 * write pages from its LRU list.
1944 * 1939 *
1945 * This may look like it could increase pressure on 1940 * This may look like it could increase pressure on
1946 * lower zones by failing allocations in higher zones 1941 * lower zones by failing allocations in higher zones
1947 * before they are full. But the pages that do spill 1942 * before they are full. But the pages that do spill
1948 * over are limited as the lower zones are protected 1943 * over are limited as the lower zones are protected
1949 * by this very same mechanism. It should not become 1944 * by this very same mechanism. It should not become
1950 * a practical burden to them. 1945 * a practical burden to them.
1951 * 1946 *
1952 * XXX: For now, allow allocations to potentially 1947 * XXX: For now, allow allocations to potentially
1953 * exceed the per-zone dirty limit in the slowpath 1948 * exceed the per-zone dirty limit in the slowpath
1954 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1949 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1955 * which is important when on a NUMA setup the allowed 1950 * which is important when on a NUMA setup the allowed
1956 * zones are together not big enough to reach the 1951 * zones are together not big enough to reach the
1957 * global limit. The proper fix for these situations 1952 * global limit. The proper fix for these situations
1958 * will require awareness of zones in the 1953 * will require awareness of zones in the
1959 * dirty-throttling and the flusher threads. 1954 * dirty-throttling and the flusher threads.
1960 */ 1955 */
1961 if ((alloc_flags & ALLOC_WMARK_LOW) && 1956 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1962 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1957 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1963 goto this_zone_full; 1958 goto this_zone_full;
1964 1959
1965 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1960 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1966 if (!zone_watermark_ok(zone, order, mark, 1961 if (!zone_watermark_ok(zone, order, mark,
1967 classzone_idx, alloc_flags)) { 1962 classzone_idx, alloc_flags)) {
1968 int ret; 1963 int ret;
1969 1964
1970 if (IS_ENABLED(CONFIG_NUMA) && 1965 if (IS_ENABLED(CONFIG_NUMA) &&
1971 !did_zlc_setup && nr_online_nodes > 1) { 1966 !did_zlc_setup && nr_online_nodes > 1) {
1972 /* 1967 /*
1973 * we do zlc_setup if there are multiple nodes 1968 * we do zlc_setup if there are multiple nodes
1974 * and before considering the first zone allowed 1969 * and before considering the first zone allowed
1975 * by the cpuset. 1970 * by the cpuset.
1976 */ 1971 */
1977 allowednodes = zlc_setup(zonelist, alloc_flags); 1972 allowednodes = zlc_setup(zonelist, alloc_flags);
1978 zlc_active = 1; 1973 zlc_active = 1;
1979 did_zlc_setup = 1; 1974 did_zlc_setup = 1;
1980 } 1975 }
1981 1976
1982 if (zone_reclaim_mode == 0 || 1977 if (zone_reclaim_mode == 0 ||
1983 !zone_allows_reclaim(preferred_zone, zone)) 1978 !zone_allows_reclaim(preferred_zone, zone))
1984 goto this_zone_full; 1979 goto this_zone_full;
1985 1980
1986 /* 1981 /*
1987 * As we may have just activated ZLC, check if the first 1982 * As we may have just activated ZLC, check if the first
1988 * eligible zone has failed zone_reclaim recently. 1983 * eligible zone has failed zone_reclaim recently.
1989 */ 1984 */
1990 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1985 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1991 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1986 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1992 continue; 1987 continue;
1993 1988
1994 ret = zone_reclaim(zone, gfp_mask, order); 1989 ret = zone_reclaim(zone, gfp_mask, order);
1995 switch (ret) { 1990 switch (ret) {
1996 case ZONE_RECLAIM_NOSCAN: 1991 case ZONE_RECLAIM_NOSCAN:
1997 /* did not scan */ 1992 /* did not scan */
1998 continue; 1993 continue;
1999 case ZONE_RECLAIM_FULL: 1994 case ZONE_RECLAIM_FULL:
2000 /* scanned but unreclaimable */ 1995 /* scanned but unreclaimable */
2001 continue; 1996 continue;
2002 default: 1997 default:
2003 /* did we reclaim enough */ 1998 /* did we reclaim enough */
2004 if (zone_watermark_ok(zone, order, mark, 1999 if (zone_watermark_ok(zone, order, mark,
2005 classzone_idx, alloc_flags)) 2000 classzone_idx, alloc_flags))
2006 goto try_this_zone; 2001 goto try_this_zone;
2007 2002
2008 /* 2003 /*
2009 * Failed to reclaim enough to meet watermark. 2004 * Failed to reclaim enough to meet watermark.
2010 * Only mark the zone full if checking the min 2005 * Only mark the zone full if checking the min
2011 * watermark or if we failed to reclaim just 2006 * watermark or if we failed to reclaim just
2012 * 1<<order pages or else the page allocator 2007 * 1<<order pages or else the page allocator
2013 * fastpath will prematurely mark zones full 2008 * fastpath will prematurely mark zones full
2014 * when the watermark is between the low and 2009 * when the watermark is between the low and
2015 * min watermarks. 2010 * min watermarks.
2016 */ 2011 */
2017 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2012 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2018 ret == ZONE_RECLAIM_SOME) 2013 ret == ZONE_RECLAIM_SOME)
2019 goto this_zone_full; 2014 goto this_zone_full;
2020 2015
2021 continue; 2016 continue;
2022 } 2017 }
2023 } 2018 }
2024 2019
2025 try_this_zone: 2020 try_this_zone:
2026 page = buffered_rmqueue(preferred_zone, zone, order, 2021 page = buffered_rmqueue(preferred_zone, zone, order,
2027 gfp_mask, migratetype); 2022 gfp_mask, migratetype);
2028 if (page) 2023 if (page)
2029 break; 2024 break;
2030 this_zone_full: 2025 this_zone_full:
2031 if (IS_ENABLED(CONFIG_NUMA)) 2026 if (IS_ENABLED(CONFIG_NUMA))
2032 zlc_mark_zone_full(zonelist, z); 2027 zlc_mark_zone_full(zonelist, z);
2033 } 2028 }
2034 2029
2035 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2030 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2036 /* Disable zlc cache for second zonelist scan */ 2031 /* Disable zlc cache for second zonelist scan */
2037 zlc_active = 0; 2032 zlc_active = 0;
2038 goto zonelist_scan; 2033 goto zonelist_scan;
2039 } 2034 }
2040 2035
2041 if (page) 2036 if (page)
2042 /* 2037 /*
2043 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2038 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2044 * necessary to allocate the page. The expectation is 2039 * necessary to allocate the page. The expectation is
2045 * that the caller is taking steps that will free more 2040 * that the caller is taking steps that will free more
2046 * memory. The caller should avoid the page being used 2041 * memory. The caller should avoid the page being used
2047 * for !PFMEMALLOC purposes. 2042 * for !PFMEMALLOC purposes.
2048 */ 2043 */
2049 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2044 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2050 2045
2051 return page; 2046 return page;
2052 } 2047 }
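
Inside the loop above, every zone is run through an ordered series of filters before buffered_rmqueue() is attempted: the zlc skip, the cpuset check, the ALLOC_NO_WATERMARKS bypass, the NR_ALLOC_BATCH fairness and dirty-limit checks (fast path only), and finally the watermark check with its zone_reclaim() fallback. The sketch below only models that ordering; the predicate names and the zone struct are invented.

/* Ordering sketch of the per-zone checks in get_page_from_freelist().
 * All predicate names and the zone struct are illustrative stand-ins. */
#include <stdbool.h>
#include <stdio.h>

struct fake_zone {
	bool zlc_full;        /* zlc says "recently full"                    */
	bool cpuset_allowed;  /* cpuset_zone_allowed_softwall()              */
	bool no_watermarks;   /* ALLOC_NO_WATERMARKS requested               */
	int  alloc_batch;     /* NR_ALLOC_BATCH credit (ALLOC_WMARK_LOW only)*/
	bool dirty_ok;        /* zone_dirty_ok() (__GFP_WRITE fast path)     */
	bool watermark_ok;    /* zone_watermark_ok()                         */
};

static bool zone_usable(const struct fake_zone *z)
{
	if (z->zlc_full)		return false;  /* skip: marked full       */
	if (!z->cpuset_allowed)		return false;  /* skip: not in cpuset     */
	if (z->no_watermarks)		return true;   /* bypass all other checks */
	if (z->alloc_batch <= 0)	return false;  /* fairness credit used up */
	if (!z->dirty_ok)		return false;  /* over its dirty share    */
	return z->watermark_ok;                        /* else zone_reclaim() path */
}

int main(void)
{
	struct fake_zone zones[] = {
		{ .cpuset_allowed = true, .alloc_batch = 0, .dirty_ok = true,
		  .watermark_ok = true },              /* skipped: batch exhausted */
		{ .cpuset_allowed = true, .alloc_batch = 5, .dirty_ok = true,
		  .watermark_ok = true },              /* first usable zone        */
	};

	for (int i = 0; i < 2; i++)
		printf("zone %d usable: %d\n", i, zone_usable(&zones[i]));
	return 0;
}
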
2053 2048
2054 /* 2049 /*
2055 * Large machines with many possible nodes should not always dump per-node 2050 * Large machines with many possible nodes should not always dump per-node
2056 * meminfo in irq context. 2051 * meminfo in irq context.
2057 */ 2052 */
2058 static inline bool should_suppress_show_mem(void) 2053 static inline bool should_suppress_show_mem(void)
2059 { 2054 {
2060 bool ret = false; 2055 bool ret = false;
2061 2056
2062 #if NODES_SHIFT > 8 2057 #if NODES_SHIFT > 8
2063 ret = in_interrupt(); 2058 ret = in_interrupt();
2064 #endif 2059 #endif
2065 return ret; 2060 return ret;
2066 } 2061 }
2067 2062
2068 static DEFINE_RATELIMIT_STATE(nopage_rs, 2063 static DEFINE_RATELIMIT_STATE(nopage_rs,
2069 DEFAULT_RATELIMIT_INTERVAL, 2064 DEFAULT_RATELIMIT_INTERVAL,
2070 DEFAULT_RATELIMIT_BURST); 2065 DEFAULT_RATELIMIT_BURST);
2071 2066
2072 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2067 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2073 { 2068 {
2074 unsigned int filter = SHOW_MEM_FILTER_NODES; 2069 unsigned int filter = SHOW_MEM_FILTER_NODES;
2075 2070
2076 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2071 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2077 debug_guardpage_minorder() > 0) 2072 debug_guardpage_minorder() > 0)
2078 return; 2073 return;
2079 2074
2080 /* 2075 /*
2081 * Walking all memory to count page types is very expensive and should 2076 * Walking all memory to count page types is very expensive and should
2082 * be inhibited in non-blockable contexts. 2077 * be inhibited in non-blockable contexts.
2083 */ 2078 */
2084 if (!(gfp_mask & __GFP_WAIT)) 2079 if (!(gfp_mask & __GFP_WAIT))
2085 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2080 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2086 2081
2087 /* 2082 /*
2088 * This documents exceptions given to allocations in certain 2083 * This documents exceptions given to allocations in certain
2089 * contexts that are allowed to allocate outside current's set 2084 * contexts that are allowed to allocate outside current's set
2090 * of allowed nodes. 2085 * of allowed nodes.
2091 */ 2086 */
2092 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2087 if (!(gfp_mask & __GFP_NOMEMALLOC))
2093 if (test_thread_flag(TIF_MEMDIE) || 2088 if (test_thread_flag(TIF_MEMDIE) ||
2094 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2089 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2095 filter &= ~SHOW_MEM_FILTER_NODES; 2090 filter &= ~SHOW_MEM_FILTER_NODES;
2096 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2091 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2097 filter &= ~SHOW_MEM_FILTER_NODES; 2092 filter &= ~SHOW_MEM_FILTER_NODES;
2098 2093
2099 if (fmt) { 2094 if (fmt) {
2100 struct va_format vaf; 2095 struct va_format vaf;
2101 va_list args; 2096 va_list args;
2102 2097
2103 va_start(args, fmt); 2098 va_start(args, fmt);
2104 2099
2105 vaf.fmt = fmt; 2100 vaf.fmt = fmt;
2106 vaf.va = &args; 2101 vaf.va = &args;
2107 2102
2108 pr_warn("%pV", &vaf); 2103 pr_warn("%pV", &vaf);
2109 2104
2110 va_end(args); 2105 va_end(args);
2111 } 2106 }
2112 2107
2113 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2108 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2114 current->comm, order, gfp_mask); 2109 current->comm, order, gfp_mask);
2115 2110
2116 dump_stack(); 2111 dump_stack();
2117 if (!should_suppress_show_mem()) 2112 if (!should_suppress_show_mem())
2118 show_mem(filter); 2113 show_mem(filter);
2119 } 2114 }
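
warn_alloc_failed() rate-limits itself and then decides how much of show_mem()'s output to suppress. A small userspace model of just the filter decision follows; the flag values are arbitrary and only the decision flow mirrors the function above.

/* Userspace sketch of how warn_alloc_failed() chooses its show_mem filter. */
#include <stdbool.h>
#include <stdio.h>

#define SHOW_MEM_FILTER_NODES      0x1
#define SHOW_MEM_FILTER_PAGE_COUNT 0x2

static unsigned int choose_filter(bool can_wait, bool memdie_or_memalloc,
				  bool in_irq)
{
	unsigned int filter = SHOW_MEM_FILTER_NODES;

	if (!can_wait)              /* non-blockable: counting pages is too costly */
		filter |= SHOW_MEM_FILTER_PAGE_COUNT;
	if (memdie_or_memalloc)     /* task may allocate outside its nodes */
		filter &= ~SHOW_MEM_FILTER_NODES;
	if (in_irq || !can_wait)
		filter &= ~SHOW_MEM_FILTER_NODES;
	return filter;
}

int main(void)
{
	printf("GFP_ATOMIC in irq: 0x%x\n", choose_filter(false, false, true));
	printf("GFP_KERNEL:        0x%x\n", choose_filter(true, false, false));
	return 0;
}
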
2120 2115
2121 static inline int 2116 static inline int
2122 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2117 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2123 unsigned long did_some_progress, 2118 unsigned long did_some_progress,
2124 unsigned long pages_reclaimed) 2119 unsigned long pages_reclaimed)
2125 { 2120 {
2126 /* Do not loop if specifically requested */ 2121 /* Do not loop if specifically requested */
2127 if (gfp_mask & __GFP_NORETRY) 2122 if (gfp_mask & __GFP_NORETRY)
2128 return 0; 2123 return 0;
2129 2124
2130 /* Always retry if specifically requested */ 2125 /* Always retry if specifically requested */
2131 if (gfp_mask & __GFP_NOFAIL) 2126 if (gfp_mask & __GFP_NOFAIL)
2132 return 1; 2127 return 1;
2133 2128
2134 /* 2129 /*
2135 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2130 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2136 * making forward progress without invoking OOM. Suspend also disables 2131 * making forward progress without invoking OOM. Suspend also disables
2137 * storage devices so kswapd will not help. Bail if we are suspending. 2132 * storage devices so kswapd will not help. Bail if we are suspending.
2138 */ 2133 */
2139 if (!did_some_progress && pm_suspended_storage()) 2134 if (!did_some_progress && pm_suspended_storage())
2140 return 0; 2135 return 0;
2141 2136
2142 /* 2137 /*
2143 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2138 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2144 * means __GFP_NOFAIL, but that may not be true in other 2139 * means __GFP_NOFAIL, but that may not be true in other
2145 * implementations. 2140 * implementations.
2146 */ 2141 */
2147 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2142 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2148 return 1; 2143 return 1;
2149 2144
2150 /* 2145 /*
2151 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2146 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2152 * specified, then we retry until we no longer reclaim any pages 2147 * specified, then we retry until we no longer reclaim any pages
2153 * (above), or we've reclaimed an order of pages at least as 2148 * (above), or we've reclaimed an order of pages at least as
2154 * large as the allocation's order. In both cases, if the 2149 * large as the allocation's order. In both cases, if the
2155 * allocation still fails, we stop retrying. 2150 * allocation still fails, we stop retrying.
2156 */ 2151 */
2157 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2152 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2158 return 1; 2153 return 1;
2159 2154
2160 return 0; 2155 return 0;
2161 } 2156 }
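
should_alloc_retry() is a pure decision function, which makes it easy to model in isolation. Below is a standalone sketch with mocked GFP bits that reproduces the same ordering of checks; it is an illustration, not the kernel's definitions.

/* Standalone model of should_alloc_retry(); the GFP bit values are mocked,
 * only the decision order matches the kernel function above. */
#include <stdbool.h>
#include <stdio.h>

#define M_GFP_NORETRY  0x1
#define M_GFP_NOFAIL   0x2
#define M_GFP_REPEAT   0x4
#define M_COSTLY_ORDER 3

static int model_should_retry(unsigned gfp, unsigned order,
			      unsigned long progress,
			      unsigned long pages_reclaimed,
			      bool pm_suspended)
{
	if (gfp & M_GFP_NORETRY)
		return 0;                       /* caller opted out of looping  */
	if (gfp & M_GFP_NOFAIL)
		return 1;                       /* caller demands success       */
	if (!progress && pm_suspended)
		return 0;                       /* reclaim cannot make progress */
	if (order <= M_COSTLY_ORDER)
		return 1;                       /* small orders: keep trying    */
	if ((gfp & M_GFP_REPEAT) && pages_reclaimed < (1UL << order))
		return 1;                       /* big order, still making headway */
	return 0;
}

int main(void)
{
	printf("order 0, plain:   %d\n", model_should_retry(0, 0, 1, 1, false));
	printf("order 5, NORETRY: %d\n", model_should_retry(M_GFP_NORETRY, 5, 1, 1, false));
	printf("order 5, REPEAT:  %d\n", model_should_retry(M_GFP_REPEAT, 5, 1, 8, false));
	return 0;
}
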
2162 2157
2163 static inline struct page * 2158 static inline struct page *
2164 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2159 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2165 struct zonelist *zonelist, enum zone_type high_zoneidx, 2160 struct zonelist *zonelist, enum zone_type high_zoneidx,
2166 nodemask_t *nodemask, struct zone *preferred_zone, 2161 nodemask_t *nodemask, struct zone *preferred_zone,
2167 int migratetype) 2162 int migratetype)
2168 { 2163 {
2169 struct page *page; 2164 struct page *page;
2170 2165
2171 /* Acquire the OOM killer lock for the zones in zonelist */ 2166 /* Acquire the OOM killer lock for the zones in zonelist */
2172 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2167 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2173 schedule_timeout_uninterruptible(1); 2168 schedule_timeout_uninterruptible(1);
2174 return NULL; 2169 return NULL;
2175 } 2170 }
2176 2171
2177 /* 2172 /*
2178 * Go through the zonelist yet one more time, keep very high watermark 2173 * Go through the zonelist yet one more time, keep very high watermark
2179 * here, this is only to catch a parallel oom killing, we must fail if 2174 * here, this is only to catch a parallel oom killing, we must fail if
2180 * we're still under heavy pressure. 2175 * we're still under heavy pressure.
2181 */ 2176 */
2182 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2177 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2183 order, zonelist, high_zoneidx, 2178 order, zonelist, high_zoneidx,
2184 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2179 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2185 preferred_zone, migratetype); 2180 preferred_zone, migratetype);
2186 if (page) 2181 if (page)
2187 goto out; 2182 goto out;
2188 2183
2189 if (!(gfp_mask & __GFP_NOFAIL)) { 2184 if (!(gfp_mask & __GFP_NOFAIL)) {
2190 /* The OOM killer will not help higher order allocs */ 2185 /* The OOM killer will not help higher order allocs */
2191 if (order > PAGE_ALLOC_COSTLY_ORDER) 2186 if (order > PAGE_ALLOC_COSTLY_ORDER)
2192 goto out; 2187 goto out;
2193 /* The OOM killer does not needlessly kill tasks for lowmem */ 2188 /* The OOM killer does not needlessly kill tasks for lowmem */
2194 if (high_zoneidx < ZONE_NORMAL) 2189 if (high_zoneidx < ZONE_NORMAL)
2195 goto out; 2190 goto out;
2196 /* 2191 /*
2197 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2192 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2198 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2193 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2199 * The caller should handle page allocation failure by itself if 2194 * The caller should handle page allocation failure by itself if
2200 * it specifies __GFP_THISNODE. 2195 * it specifies __GFP_THISNODE.
2201 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2196 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2202 */ 2197 */
2203 if (gfp_mask & __GFP_THISNODE) 2198 if (gfp_mask & __GFP_THISNODE)
2204 goto out; 2199 goto out;
2205 } 2200 }
2206 /* Exhausted what can be done so it's blamo time */ 2201 /* Exhausted what can be done so it's blamo time */
2207 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2202 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2208 2203
2209 out: 2204 out:
2210 clear_zonelist_oom(zonelist, gfp_mask); 2205 clear_zonelist_oom(zonelist, gfp_mask);
2211 return page; 2206 return page;
2212 } 2207 }
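
Before invoking out_of_memory(), the function above refuses to kill tasks in cases where that would not help: costly orders, lowmem-only requests, and __GFP_THISNODE callers, unless __GFP_NOFAIL forces the issue. A compact model of that gate (flag values invented):

/* Sketch of the "is the OOM killer appropriate?" gating in
 * __alloc_pages_may_oom(); names and flag values are illustrative. */
#include <stdbool.h>
#include <stdio.h>

#define M_GFP_NOFAIL   0x1
#define M_GFP_THISNODE 0x2
#define M_COSTLY_ORDER 3
#define M_ZONE_NORMAL  2

static bool may_invoke_oom(unsigned gfp, unsigned order, int high_zoneidx)
{
	if (gfp & M_GFP_NOFAIL)
		return true;                 /* must not fail: go all the way     */
	if (order > M_COSTLY_ORDER)
		return false;                /* a kill won't create huge blocks   */
	if (high_zoneidx < M_ZONE_NORMAL)
		return false;                /* don't kill tasks for lowmem       */
	if (gfp & M_GFP_THISNODE)
		return false;                /* node-bound caller handles failure */
	return true;
}

int main(void)
{
	printf("order 0 GFP_KERNEL-ish: %d\n", may_invoke_oom(0, 0, M_ZONE_NORMAL));
	printf("order 9 hugepage-ish:   %d\n", may_invoke_oom(0, 9, M_ZONE_NORMAL));
	printf("DMA request:            %d\n", may_invoke_oom(0, 0, 0));
	return 0;
}
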
2213 2208
2214 #ifdef CONFIG_COMPACTION 2209 #ifdef CONFIG_COMPACTION
2215 /* Try memory compaction for high-order allocations before reclaim */ 2210 /* Try memory compaction for high-order allocations before reclaim */
2216 static struct page * 2211 static struct page *
2217 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2212 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2218 struct zonelist *zonelist, enum zone_type high_zoneidx, 2213 struct zonelist *zonelist, enum zone_type high_zoneidx,
2219 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2214 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2220 int migratetype, bool sync_migration, 2215 int migratetype, bool sync_migration,
2221 bool *contended_compaction, bool *deferred_compaction, 2216 bool *contended_compaction, bool *deferred_compaction,
2222 unsigned long *did_some_progress) 2217 unsigned long *did_some_progress)
2223 { 2218 {
2224 if (!order) 2219 if (!order)
2225 return NULL; 2220 return NULL;
2226 2221
2227 if (compaction_deferred(preferred_zone, order)) { 2222 if (compaction_deferred(preferred_zone, order)) {
2228 *deferred_compaction = true; 2223 *deferred_compaction = true;
2229 return NULL; 2224 return NULL;
2230 } 2225 }
2231 2226
2232 current->flags |= PF_MEMALLOC; 2227 current->flags |= PF_MEMALLOC;
2233 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2228 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2234 nodemask, sync_migration, 2229 nodemask, sync_migration,
2235 contended_compaction); 2230 contended_compaction);
2236 current->flags &= ~PF_MEMALLOC; 2231 current->flags &= ~PF_MEMALLOC;
2237 2232
2238 if (*did_some_progress != COMPACT_SKIPPED) { 2233 if (*did_some_progress != COMPACT_SKIPPED) {
2239 struct page *page; 2234 struct page *page;
2240 2235
2241 /* Page migration frees to the PCP lists but we want merging */ 2236 /* Page migration frees to the PCP lists but we want merging */
2242 drain_pages(get_cpu()); 2237 drain_pages(get_cpu());
2243 put_cpu(); 2238 put_cpu();
2244 2239
2245 page = get_page_from_freelist(gfp_mask, nodemask, 2240 page = get_page_from_freelist(gfp_mask, nodemask,
2246 order, zonelist, high_zoneidx, 2241 order, zonelist, high_zoneidx,
2247 alloc_flags & ~ALLOC_NO_WATERMARKS, 2242 alloc_flags & ~ALLOC_NO_WATERMARKS,
2248 preferred_zone, migratetype); 2243 preferred_zone, migratetype);
2249 if (page) { 2244 if (page) {
2250 preferred_zone->compact_blockskip_flush = false; 2245 preferred_zone->compact_blockskip_flush = false;
2251 preferred_zone->compact_considered = 0; 2246 preferred_zone->compact_considered = 0;
2252 preferred_zone->compact_defer_shift = 0; 2247 preferred_zone->compact_defer_shift = 0;
2253 if (order >= preferred_zone->compact_order_failed) 2248 if (order >= preferred_zone->compact_order_failed)
2254 preferred_zone->compact_order_failed = order + 1; 2249 preferred_zone->compact_order_failed = order + 1;
2255 count_vm_event(COMPACTSUCCESS); 2250 count_vm_event(COMPACTSUCCESS);
2256 return page; 2251 return page;
2257 } 2252 }
2258 2253
2259 /* 2254 /*
2260 * It's bad if a compaction run occurs and fails. 2255 * It's bad if a compaction run occurs and fails.
2261 * The most likely reason is that pages exist, 2256 * The most likely reason is that pages exist,
2262 * but not enough to satisfy watermarks. 2257 * but not enough to satisfy watermarks.
2263 */ 2258 */
2264 count_vm_event(COMPACTFAIL); 2259 count_vm_event(COMPACTFAIL);
2265 2260
2266 /* 2261 /*
2267 * As async compaction considers a subset of pageblocks, only 2262 * As async compaction considers a subset of pageblocks, only
2268 * defer if the failure was a sync compaction failure. 2263 * defer if the failure was a sync compaction failure.
2269 */ 2264 */
2270 if (sync_migration) 2265 if (sync_migration)
2271 defer_compaction(preferred_zone, order); 2266 defer_compaction(preferred_zone, order);
2272 2267
2273 cond_resched(); 2268 cond_resched();
2274 } 2269 }
2275 2270
2276 return NULL; 2271 return NULL;
2277 } 2272 }
2278 #else 2273 #else
2279 static inline struct page * 2274 static inline struct page *
2280 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2275 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2281 struct zonelist *zonelist, enum zone_type high_zoneidx, 2276 struct zonelist *zonelist, enum zone_type high_zoneidx,
2282 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2277 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2283 int migratetype, bool sync_migration, 2278 int migratetype, bool sync_migration,
2284 bool *contended_compaction, bool *deferred_compaction, 2279 bool *contended_compaction, bool *deferred_compaction,
2285 unsigned long *did_some_progress) 2280 unsigned long *did_some_progress)
2286 { 2281 {
2287 return NULL; 2282 return NULL;
2288 } 2283 }
2289 #endif /* CONFIG_COMPACTION */ 2284 #endif /* CONFIG_COMPACTION */
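
On a synchronous compaction failure the zone is deferred via defer_compaction(), and the compact_considered/compact_defer_shift fields that the success path resets above drive an exponential backoff. The sketch below is a simplified illustration of that backoff idea, not the kernel's exact implementation.

/* Simplified illustration of compaction deferral as an exponential backoff.
 * Field handling is approximate; only the doubling backoff is the point. */
#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	unsigned considered;   /* attempts skipped since the last failure     */
	unsigned defer_shift;  /* backoff exponent, grows on repeated failure */
};

static void defer_compaction(struct zone_model *z)
{
	z->considered = 0;
	if (z->defer_shift < 6)
		z->defer_shift++;          /* wait twice as long next time */
}

static bool compaction_deferred(struct zone_model *z)
{
	unsigned limit = 1u << z->defer_shift;

	if (z->considered < limit) {
		z->considered++;           /* still backing off: skip this attempt */
		return true;
	}
	return false;                      /* backoff expired: worth trying again  */
}

int main(void)
{
	struct zone_model z = { 0 };

	defer_compaction(&z);              /* sync compaction just failed */
	for (int i = 0; i < 4; i++)
		printf("attempt %d deferred: %d\n", i, compaction_deferred(&z));
	/* After a failed retry the caller would call defer_compaction() again. */
	return 0;
}
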
2290 2285
2291 /* Perform direct synchronous page reclaim */ 2286 /* Perform direct synchronous page reclaim */
2292 static int 2287 static int
2293 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2288 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2294 nodemask_t *nodemask) 2289 nodemask_t *nodemask)
2295 { 2290 {
2296 struct reclaim_state reclaim_state; 2291 struct reclaim_state reclaim_state;
2297 int progress; 2292 int progress;
2298 2293
2299 cond_resched(); 2294 cond_resched();
2300 2295
2301 /* We now go into synchronous reclaim */ 2296 /* We now go into synchronous reclaim */
2302 cpuset_memory_pressure_bump(); 2297 cpuset_memory_pressure_bump();
2303 current->flags |= PF_MEMALLOC; 2298 current->flags |= PF_MEMALLOC;
2304 lockdep_set_current_reclaim_state(gfp_mask); 2299 lockdep_set_current_reclaim_state(gfp_mask);
2305 reclaim_state.reclaimed_slab = 0; 2300 reclaim_state.reclaimed_slab = 0;
2306 current->reclaim_state = &reclaim_state; 2301 current->reclaim_state = &reclaim_state;
2307 2302
2308 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2303 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2309 2304
2310 current->reclaim_state = NULL; 2305 current->reclaim_state = NULL;
2311 lockdep_clear_current_reclaim_state(); 2306 lockdep_clear_current_reclaim_state();
2312 current->flags &= ~PF_MEMALLOC; 2307 current->flags &= ~PF_MEMALLOC;
2313 2308
2314 cond_resched(); 2309 cond_resched();
2315 2310
2316 return progress; 2311 return progress;
2317 } 2312 }
2318 2313
2319 /* The really slow allocator path where we enter direct reclaim */ 2314 /* The really slow allocator path where we enter direct reclaim */
2320 static inline struct page * 2315 static inline struct page *
2321 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2316 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2322 struct zonelist *zonelist, enum zone_type high_zoneidx, 2317 struct zonelist *zonelist, enum zone_type high_zoneidx,
2323 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2318 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2324 int migratetype, unsigned long *did_some_progress) 2319 int migratetype, unsigned long *did_some_progress)
2325 { 2320 {
2326 struct page *page = NULL; 2321 struct page *page = NULL;
2327 bool drained = false; 2322 bool drained = false;
2328 2323
2329 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2324 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2330 nodemask); 2325 nodemask);
2331 if (unlikely(!(*did_some_progress))) 2326 if (unlikely(!(*did_some_progress)))
2332 return NULL; 2327 return NULL;
2333 2328
2334 /* After successful reclaim, reconsider all zones for allocation */ 2329 /* After successful reclaim, reconsider all zones for allocation */
2335 if (IS_ENABLED(CONFIG_NUMA)) 2330 if (IS_ENABLED(CONFIG_NUMA))
2336 zlc_clear_zones_full(zonelist); 2331 zlc_clear_zones_full(zonelist);
2337 2332
2338 retry: 2333 retry:
2339 page = get_page_from_freelist(gfp_mask, nodemask, order, 2334 page = get_page_from_freelist(gfp_mask, nodemask, order,
2340 zonelist, high_zoneidx, 2335 zonelist, high_zoneidx,
2341 alloc_flags & ~ALLOC_NO_WATERMARKS, 2336 alloc_flags & ~ALLOC_NO_WATERMARKS,
2342 preferred_zone, migratetype); 2337 preferred_zone, migratetype);
2343 2338
2344 /* 2339 /*
2345 * If an allocation failed after direct reclaim, it could be because 2340 * If an allocation failed after direct reclaim, it could be because
2346 * pages are pinned on the per-cpu lists. Drain them and try again 2341 * pages are pinned on the per-cpu lists. Drain them and try again
2347 */ 2342 */
2348 if (!page && !drained) { 2343 if (!page && !drained) {
2349 drain_all_pages(); 2344 drain_all_pages();
2350 drained = true; 2345 drained = true;
2351 goto retry; 2346 goto retry;
2352 } 2347 }
2353 2348
2354 return page; 2349 return page;
2355 } 2350 }
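
The retry after drain_all_pages() exists because pages freed by reclaim may still be parked on per-CPU free lists rather than the buddy lists the allocator scans. Reduced to its skeleton, the pattern is "try, flush the caches once, try again"; all names in the sketch are invented.

/* The "reclaim, then retry once after draining per-CPU lists" pattern,
 * reduced to a generic try/flush/retry-once skeleton. */
#include <stdbool.h>
#include <stdio.h>

static int attempts;

static bool try_alloc(void)
{
	/* Pretend the first attempt misses because pages sit on pcp lists. */
	return ++attempts > 1;
}

static void drain_percpu_lists(void)
{
	printf("draining per-cpu free lists\n");
}

static bool alloc_after_reclaim(void)
{
	bool drained = false;
	bool ok;
retry:
	ok = try_alloc();
	if (!ok && !drained) {
		drain_percpu_lists();   /* drain_all_pages() in the kernel */
		drained = true;
		goto retry;             /* retry exactly once */
	}
	return ok;
}

int main(void)
{
	printf("allocation %s\n", alloc_after_reclaim() ? "succeeded" : "failed");
	return 0;
}
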
2356 2351
2357 /* 2352 /*
2358 * This is called in the allocator slow-path if the allocation request is of 2353 * This is called in the allocator slow-path if the allocation request is of
2359 * sufficient urgency to ignore watermarks and take other desperate measures 2354 * sufficient urgency to ignore watermarks and take other desperate measures
2360 */ 2355 */
2361 static inline struct page * 2356 static inline struct page *
2362 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2357 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2363 struct zonelist *zonelist, enum zone_type high_zoneidx, 2358 struct zonelist *zonelist, enum zone_type high_zoneidx,
2364 nodemask_t *nodemask, struct zone *preferred_zone, 2359 nodemask_t *nodemask, struct zone *preferred_zone,
2365 int migratetype) 2360 int migratetype)
2366 { 2361 {
2367 struct page *page; 2362 struct page *page;
2368 2363
2369 do { 2364 do {
2370 page = get_page_from_freelist(gfp_mask, nodemask, order, 2365 page = get_page_from_freelist(gfp_mask, nodemask, order,
2371 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2366 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2372 preferred_zone, migratetype); 2367 preferred_zone, migratetype);
2373 2368
2374 if (!page && gfp_mask & __GFP_NOFAIL) 2369 if (!page && gfp_mask & __GFP_NOFAIL)
2375 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2370 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2376 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2371 } while (!page && (gfp_mask & __GFP_NOFAIL));
2377 2372
2378 return page; 2373 return page;
2379 } 2374 }
2380 2375
2381 static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, 2376 static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
2382 struct zonelist *zonelist, 2377 struct zonelist *zonelist,
2383 enum zone_type high_zoneidx, 2378 enum zone_type high_zoneidx,
2384 struct zone *preferred_zone) 2379 struct zone *preferred_zone)
2385 { 2380 {
2386 struct zoneref *z; 2381 struct zoneref *z;
2387 struct zone *zone; 2382 struct zone *zone;
2388 2383
2389 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2384 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2390 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2385 if (!(gfp_mask & __GFP_NO_KSWAPD))
2391 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2386 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2392 /* 2387 /*
2393 * Only reset the batches of zones that were actually 2388 * Only reset the batches of zones that were actually
2394 * considered in the fast path, we don't want to 2389 * considered in the fast path, we don't want to
2395 * thrash fairness information for zones that are not 2390 * thrash fairness information for zones that are not
2396 * actually part of this zonelist's round-robin cycle. 2391 * actually part of this zonelist's round-robin cycle.
2397 */ 2392 */
2398 if (zone_reclaim_mode && !zone_local(preferred_zone, zone)) 2393 if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
2399 continue; 2394 continue;
2400 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2395 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2401 high_wmark_pages(zone) - 2396 high_wmark_pages(zone) -
2402 low_wmark_pages(zone) - 2397 low_wmark_pages(zone) -
2403 zone_page_state(zone, NR_ALLOC_BATCH)); 2398 zone_page_state(zone, NR_ALLOC_BATCH));
2404 } 2399 }
2405 } 2400 }
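
The mod_zone_page_state() call in prepare_slowpath() tops the fairness counter back up to its full budget of high_wmark - low_wmark pages, expressed as a delta against the current (possibly negative) value. Worked with invented numbers:

/* The NR_ALLOC_BATCH reset is "set the counter back to
 * high_wmark - low_wmark" expressed as a delta; the sample numbers are invented. */
#include <stdio.h>

int main(void)
{
	long high_wmark = 12000, low_wmark = 10000;  /* pages, illustrative     */
	long batch = -350;                           /* credit already overdrawn */

	long delta = (high_wmark - low_wmark) - batch;
	batch += delta;                              /* mod_zone_page_state(...) */

	printf("batch reset to %ld pages\n", batch); /* 2000 */
	return 0;
}
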
2406 2401
2407 static inline int 2402 static inline int
2408 gfp_to_alloc_flags(gfp_t gfp_mask) 2403 gfp_to_alloc_flags(gfp_t gfp_mask)
2409 { 2404 {
2410 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2405 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2411 const gfp_t wait = gfp_mask & __GFP_WAIT; 2406 const gfp_t wait = gfp_mask & __GFP_WAIT;
2412 2407
2413 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2408 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2414 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2409 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2415 2410
2416 /* 2411 /*
2417 * The caller may dip into page reserves a bit more if the caller 2412 * The caller may dip into page reserves a bit more if the caller
2418 * cannot run direct reclaim, or if the caller has realtime scheduling 2413 * cannot run direct reclaim, or if the caller has realtime scheduling
2419 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2414 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2420 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2415 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
2421 */ 2416 */
2422 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2417 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2423 2418
2424 if (!wait) { 2419 if (!wait) {
2425 /* 2420 /*
2426 * Not worth trying to allocate harder for 2421 * Not worth trying to allocate harder for
2427 * __GFP_NOMEMALLOC even if it can't schedule. 2422 * __GFP_NOMEMALLOC even if it can't schedule.
2428 */ 2423 */
2429 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2424 if (!(gfp_mask & __GFP_NOMEMALLOC))
2430 alloc_flags |= ALLOC_HARDER; 2425 alloc_flags |= ALLOC_HARDER;
2431 /* 2426 /*
2432 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2427 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
2433 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2428 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
2434 */ 2429 */
2435 alloc_flags &= ~ALLOC_CPUSET; 2430 alloc_flags &= ~ALLOC_CPUSET;
2436 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2431 } else if (unlikely(rt_task(current)) && !in_interrupt())
2437 alloc_flags |= ALLOC_HARDER; 2432 alloc_flags |= ALLOC_HARDER;
2438 2433
2439 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2434 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2440 if (gfp_mask & __GFP_MEMALLOC) 2435 if (gfp_mask & __GFP_MEMALLOC)
2441 alloc_flags |= ALLOC_NO_WATERMARKS; 2436 alloc_flags |= ALLOC_NO_WATERMARKS;
2442 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2437 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2443 alloc_flags |= ALLOC_NO_WATERMARKS; 2438 alloc_flags |= ALLOC_NO_WATERMARKS;
2444 else if (!in_interrupt() && 2439 else if (!in_interrupt() &&
2445 ((current->flags & PF_MEMALLOC) || 2440 ((current->flags & PF_MEMALLOC) ||
2446 unlikely(test_thread_flag(TIF_MEMDIE)))) 2441 unlikely(test_thread_flag(TIF_MEMDIE))))
2447 alloc_flags |= ALLOC_NO_WATERMARKS; 2442 alloc_flags |= ALLOC_NO_WATERMARKS;
2448 } 2443 }
2449 #ifdef CONFIG_CMA 2444 #ifdef CONFIG_CMA
2450 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2445 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2451 alloc_flags |= ALLOC_CMA; 2446 alloc_flags |= ALLOC_CMA;
2452 #endif 2447 #endif
2453 return alloc_flags; 2448 return alloc_flags;
2454 } 2449 }
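
gfp_to_alloc_flags() condenses the gfp mask and the calling context into the ALLOC_* bits used for the remaining attempts. The sketch below mirrors the decision structure with mocked bit values; the in_interrupt()/softirq distinctions are collapsed into a single task_memalloc flag for brevity.

/* Mock translation in the spirit of gfp_to_alloc_flags(); all bit values
 * are invented, only the decision structure mirrors the function above. */
#include <stdbool.h>
#include <stdio.h>

#define M_GFP_WAIT       0x01
#define M_GFP_HIGH       0x02
#define M_GFP_NOMEMALLOC 0x04
#define M_GFP_MEMALLOC   0x08

#define A_WMARK_MIN      0x01
#define A_CPUSET         0x02
#define A_HIGH           0x04
#define A_HARDER         0x08
#define A_NO_WATERMARKS  0x10

static unsigned gfp_to_flags(unsigned gfp, bool rt_task, bool task_memalloc)
{
	unsigned flags = A_WMARK_MIN | A_CPUSET;

	if (gfp & M_GFP_HIGH)
		flags |= A_HIGH;              /* may dip deeper into reserves */

	if (!(gfp & M_GFP_WAIT)) {            /* atomic context */
		if (!(gfp & M_GFP_NOMEMALLOC))
			flags |= A_HARDER;
		flags &= ~A_CPUSET;           /* don't fail on cpuset limits  */
	} else if (rt_task) {
		flags |= A_HARDER;
	}

	if (!(gfp & M_GFP_NOMEMALLOC) &&
	    ((gfp & M_GFP_MEMALLOC) || task_memalloc))
		flags |= A_NO_WATERMARKS;     /* reserves fully available     */

	return flags;
}

int main(void)
{
	printf("GFP_ATOMIC-ish: 0x%x\n", gfp_to_flags(M_GFP_HIGH, false, false));
	printf("GFP_KERNEL-ish: 0x%x\n", gfp_to_flags(M_GFP_WAIT, false, false));
	return 0;
}
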
2455 2450
2456 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2451 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2457 { 2452 {
2458 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2453 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2459 } 2454 }
2460 2455
2461 static inline struct page * 2456 static inline struct page *
2462 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2457 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2463 struct zonelist *zonelist, enum zone_type high_zoneidx, 2458 struct zonelist *zonelist, enum zone_type high_zoneidx,
2464 nodemask_t *nodemask, struct zone *preferred_zone, 2459 nodemask_t *nodemask, struct zone *preferred_zone,
2465 int migratetype) 2460 int migratetype)
2466 { 2461 {
2467 const gfp_t wait = gfp_mask & __GFP_WAIT; 2462 const gfp_t wait = gfp_mask & __GFP_WAIT;
2468 struct page *page = NULL; 2463 struct page *page = NULL;
2469 int alloc_flags; 2464 int alloc_flags;
2470 unsigned long pages_reclaimed = 0; 2465 unsigned long pages_reclaimed = 0;
2471 unsigned long did_some_progress; 2466 unsigned long did_some_progress;
2472 bool sync_migration = false; 2467 bool sync_migration = false;
2473 bool deferred_compaction = false; 2468 bool deferred_compaction = false;
2474 bool contended_compaction = false; 2469 bool contended_compaction = false;
2475 2470
2476 /* 2471 /*
2477 * In the slowpath, we sanity check order to avoid ever trying to 2472 * In the slowpath, we sanity check order to avoid ever trying to
2478 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2473 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2479 * be using allocators in order of preference for an area that is 2474 * be using allocators in order of preference for an area that is
2480 * too large. 2475 * too large.
2481 */ 2476 */
2482 if (order >= MAX_ORDER) { 2477 if (order >= MAX_ORDER) {
2483 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2478 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2484 return NULL; 2479 return NULL;
2485 } 2480 }
2486 2481
2487 /* 2482 /*
2488 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2483 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2489 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2484 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2490 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2485 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2491 * using a larger set of nodes after it has established that the 2486 * using a larger set of nodes after it has established that the
2492 * allowed per node queues are empty and that nodes are 2487 * allowed per node queues are empty and that nodes are
2493 * over allocated. 2488 * over allocated.
2494 */ 2489 */
2495 if (IS_ENABLED(CONFIG_NUMA) && 2490 if (IS_ENABLED(CONFIG_NUMA) &&
2496 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2491 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2497 goto nopage; 2492 goto nopage;
2498 2493
2499 restart: 2494 restart:
2500 prepare_slowpath(gfp_mask, order, zonelist, 2495 prepare_slowpath(gfp_mask, order, zonelist,
2501 high_zoneidx, preferred_zone); 2496 high_zoneidx, preferred_zone);
2502 2497
2503 /* 2498 /*
2504 * OK, we're below the kswapd watermark and have kicked background 2499 * OK, we're below the kswapd watermark and have kicked background
2505 * reclaim. Now things get more complex, so set up alloc_flags according 2500 * reclaim. Now things get more complex, so set up alloc_flags according
2506 * to how we want to proceed. 2501 * to how we want to proceed.
2507 */ 2502 */
2508 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2503 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2509 2504
2510 /* 2505 /*
2511 * Find the true preferred zone if the allocation is unconstrained by 2506 * Find the true preferred zone if the allocation is unconstrained by
2512 * cpusets. 2507 * cpusets.
2513 */ 2508 */
2514 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2509 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2515 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2510 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2516 &preferred_zone); 2511 &preferred_zone);
2517 2512
2518 rebalance: 2513 rebalance:
2519 /* This is the last chance, in general, before the goto nopage. */ 2514 /* This is the last chance, in general, before the goto nopage. */
2520 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2515 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2521 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2516 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2522 preferred_zone, migratetype); 2517 preferred_zone, migratetype);
2523 if (page) 2518 if (page)
2524 goto got_pg; 2519 goto got_pg;
2525 2520
2526 /* Allocate without watermarks if the context allows */ 2521 /* Allocate without watermarks if the context allows */
2527 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2522 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2528 /* 2523 /*
2529 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2524 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2530 * the allocation is high priority and these type of 2525 * the allocation is high priority and these type of
2531 * allocations are system rather than user orientated 2526 * allocations are system rather than user orientated
2532 */ 2527 */
2533 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2528 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2534 2529
2535 page = __alloc_pages_high_priority(gfp_mask, order, 2530 page = __alloc_pages_high_priority(gfp_mask, order,
2536 zonelist, high_zoneidx, nodemask, 2531 zonelist, high_zoneidx, nodemask,
2537 preferred_zone, migratetype); 2532 preferred_zone, migratetype);
2538 if (page) { 2533 if (page) {
2539 goto got_pg; 2534 goto got_pg;
2540 } 2535 }
2541 } 2536 }
2542 2537
2543 /* Atomic allocations - we can't balance anything */ 2538 /* Atomic allocations - we can't balance anything */
2544 if (!wait) 2539 if (!wait)
2545 goto nopage; 2540 goto nopage;
2546 2541
2547 /* Avoid recursion of direct reclaim */ 2542 /* Avoid recursion of direct reclaim */
2548 if (current->flags & PF_MEMALLOC) 2543 if (current->flags & PF_MEMALLOC)
2549 goto nopage; 2544 goto nopage;
2550 2545
2551 /* Avoid allocations with no watermarks from looping endlessly */ 2546 /* Avoid allocations with no watermarks from looping endlessly */
2552 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2547 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2553 goto nopage; 2548 goto nopage;
2554 2549
2555 /* 2550 /*
2556 * Try direct compaction. The first pass is asynchronous. Subsequent 2551 * Try direct compaction. The first pass is asynchronous. Subsequent
2557 * attempts after direct reclaim are synchronous 2552 * attempts after direct reclaim are synchronous
2558 */ 2553 */
2559 page = __alloc_pages_direct_compact(gfp_mask, order, 2554 page = __alloc_pages_direct_compact(gfp_mask, order,
2560 zonelist, high_zoneidx, 2555 zonelist, high_zoneidx,
2561 nodemask, 2556 nodemask,
2562 alloc_flags, preferred_zone, 2557 alloc_flags, preferred_zone,
2563 migratetype, sync_migration, 2558 migratetype, sync_migration,
2564 &contended_compaction, 2559 &contended_compaction,
2565 &deferred_compaction, 2560 &deferred_compaction,
2566 &did_some_progress); 2561 &did_some_progress);
2567 if (page) 2562 if (page)
2568 goto got_pg; 2563 goto got_pg;
2569 sync_migration = true; 2564 sync_migration = true;
2570 2565
2571 /* 2566 /*
2572 * If compaction is deferred for high-order allocations, it is because 2567 * If compaction is deferred for high-order allocations, it is because
2573 * sync compaction recently failed. If this is the case and the caller 2568 * sync compaction recently failed. If this is the case and the caller
2574 * requested a movable allocation that does not heavily disrupt the 2569 * requested a movable allocation that does not heavily disrupt the
2575 * system then fail the allocation instead of entering direct reclaim. 2570 * system then fail the allocation instead of entering direct reclaim.
2576 */ 2571 */
2577 if ((deferred_compaction || contended_compaction) && 2572 if ((deferred_compaction || contended_compaction) &&
2578 (gfp_mask & __GFP_NO_KSWAPD)) 2573 (gfp_mask & __GFP_NO_KSWAPD))
2579 goto nopage; 2574 goto nopage;
2580 2575
2581 /* Try direct reclaim and then allocating */ 2576 /* Try direct reclaim and then allocating */
2582 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2577 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2583 zonelist, high_zoneidx, 2578 zonelist, high_zoneidx,
2584 nodemask, 2579 nodemask,
2585 alloc_flags, preferred_zone, 2580 alloc_flags, preferred_zone,
2586 migratetype, &did_some_progress); 2581 migratetype, &did_some_progress);
2587 if (page) 2582 if (page)
2588 goto got_pg; 2583 goto got_pg;
2589 2584
2590 /* 2585 /*
2591 * If we failed to make any progress reclaiming, then we are 2586 * If we failed to make any progress reclaiming, then we are
2592 * running out of options and have to consider going OOM 2587 * running out of options and have to consider going OOM
2593 */ 2588 */
2594 if (!did_some_progress) { 2589 if (!did_some_progress) {
2595 if (oom_gfp_allowed(gfp_mask)) { 2590 if (oom_gfp_allowed(gfp_mask)) {
2596 if (oom_killer_disabled) 2591 if (oom_killer_disabled)
2597 goto nopage; 2592 goto nopage;
2598 /* Coredumps can quickly deplete all memory reserves */ 2593 /* Coredumps can quickly deplete all memory reserves */
2599 if ((current->flags & PF_DUMPCORE) && 2594 if ((current->flags & PF_DUMPCORE) &&
2600 !(gfp_mask & __GFP_NOFAIL)) 2595 !(gfp_mask & __GFP_NOFAIL))
2601 goto nopage; 2596 goto nopage;
2602 page = __alloc_pages_may_oom(gfp_mask, order, 2597 page = __alloc_pages_may_oom(gfp_mask, order,
2603 zonelist, high_zoneidx, 2598 zonelist, high_zoneidx,
2604 nodemask, preferred_zone, 2599 nodemask, preferred_zone,
2605 migratetype); 2600 migratetype);
2606 if (page) 2601 if (page)
2607 goto got_pg; 2602 goto got_pg;
2608 2603
2609 if (!(gfp_mask & __GFP_NOFAIL)) { 2604 if (!(gfp_mask & __GFP_NOFAIL)) {
2610 /* 2605 /*
2611 * The oom killer is not called for high-order 2606 * The oom killer is not called for high-order
2612 * allocations that may fail, so if no progress 2607 * allocations that may fail, so if no progress
2613 * is being made, there are no other options and 2608 * is being made, there are no other options and
2614 * retrying is unlikely to help. 2609 * retrying is unlikely to help.
2615 */ 2610 */
2616 if (order > PAGE_ALLOC_COSTLY_ORDER) 2611 if (order > PAGE_ALLOC_COSTLY_ORDER)
2617 goto nopage; 2612 goto nopage;
2618 /* 2613 /*
2619 * The oom killer is not called for lowmem 2614 * The oom killer is not called for lowmem
2620 * allocations to prevent needlessly killing 2615 * allocations to prevent needlessly killing
2621 * innocent tasks. 2616 * innocent tasks.
2622 */ 2617 */
2623 if (high_zoneidx < ZONE_NORMAL) 2618 if (high_zoneidx < ZONE_NORMAL)
2624 goto nopage; 2619 goto nopage;
2625 } 2620 }
2626 2621
2627 goto restart; 2622 goto restart;
2628 } 2623 }
2629 } 2624 }
2630 2625
2631 /* Check if we should retry the allocation */ 2626 /* Check if we should retry the allocation */
2632 pages_reclaimed += did_some_progress; 2627 pages_reclaimed += did_some_progress;
2633 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2628 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2634 pages_reclaimed)) { 2629 pages_reclaimed)) {
2635 /* Wait for some write requests to complete then retry */ 2630 /* Wait for some write requests to complete then retry */
2636 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2631 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2637 goto rebalance; 2632 goto rebalance;
2638 } else { 2633 } else {
2639 /* 2634 /*
2640 * High-order allocations do not necessarily loop after 2635 * High-order allocations do not necessarily loop after
2641 * direct reclaim and reclaim/compaction depends on compaction 2636 * direct reclaim and reclaim/compaction depends on compaction
2642 * being called after reclaim so call directly if necessary 2637 * being called after reclaim so call directly if necessary
2643 */ 2638 */
2644 page = __alloc_pages_direct_compact(gfp_mask, order, 2639 page = __alloc_pages_direct_compact(gfp_mask, order,
2645 zonelist, high_zoneidx, 2640 zonelist, high_zoneidx,
2646 nodemask, 2641 nodemask,
2647 alloc_flags, preferred_zone, 2642 alloc_flags, preferred_zone,
2648 migratetype, sync_migration, 2643 migratetype, sync_migration,
2649 &contended_compaction, 2644 &contended_compaction,
2650 &deferred_compaction, 2645 &deferred_compaction,
2651 &did_some_progress); 2646 &did_some_progress);
2652 if (page) 2647 if (page)
2653 goto got_pg; 2648 goto got_pg;
2654 } 2649 }
2655 2650
2656 nopage: 2651 nopage:
2657 warn_alloc_failed(gfp_mask, order, NULL); 2652 warn_alloc_failed(gfp_mask, order, NULL);
2658 return page; 2653 return page;
2659 got_pg: 2654 got_pg:
2660 if (kmemcheck_enabled) 2655 if (kmemcheck_enabled)
2661 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2656 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2662 2657
2663 return page; 2658 return page;
2664 } 2659 }
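
Stripped of its details, __alloc_pages_slowpath() is a pair of retry loops: "restart" re-runs the whole sequence after the OOM killer makes progress, while "rebalance" loops back after partial reclaim progress. The skeleton below keeps only that control flow; every try_* helper is a stub and the thresholds are invented.

/* Skeleton of the restart/rebalance control flow in the slow path.
 * All the try_* helpers are stubs; only the loop structure is the point. */
#include <stdbool.h>
#include <stdio.h>

static bool try_with_min_watermark(void)   { return false; }
static bool try_without_watermarks(void)   { return false; }
static bool try_async_compaction(void)     { return false; }
static bool try_direct_reclaim(long *prog) { *prog = 64; return false; }
static bool try_oom_kill(void)             { return false; }
static bool should_retry(long reclaimed)   { return reclaimed < 256; }

static bool slowpath(void)
{
	long pages_reclaimed = 0, progress = 0;

restart:
	/* prepare_slowpath(): wake kswapd, reset the fairness batches */
rebalance:
	if (try_with_min_watermark())      return true;
	if (try_without_watermarks())      return true;
	if (try_async_compaction())        return true;
	if (try_direct_reclaim(&progress)) return true;

	if (!progress) {
		if (try_oom_kill())        return true;
		goto restart;              /* OOM killer freed something: start over */
	}

	pages_reclaimed += progress;
	if (should_retry(pages_reclaimed))
		goto rebalance;            /* wait_iff_congested() then try again */

	return false;                      /* nopage: warn and fail */
}

int main(void)
{
	printf("slowpath result: %s\n", slowpath() ? "page" : "NULL");
	return 0;
}
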
2665 2660
2666 /* 2661 /*
2667 * This is the 'heart' of the zoned buddy allocator. 2662 * This is the 'heart' of the zoned buddy allocator.
2668 */ 2663 */
2669 struct page * 2664 struct page *
2670 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2665 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2671 struct zonelist *zonelist, nodemask_t *nodemask) 2666 struct zonelist *zonelist, nodemask_t *nodemask)
2672 { 2667 {
2673 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2668 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        struct zone *preferred_zone;
        struct page *page = NULL;
        int migratetype = allocflags_to_migratetype(gfp_mask);
        unsigned int cpuset_mems_cookie;
        int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
        struct mem_cgroup *memcg = NULL;

        gfp_mask &= gfp_allowed_mask;

        lockdep_trace_alloc(gfp_mask);

        might_sleep_if(gfp_mask & __GFP_WAIT);

        if (should_fail_alloc_page(gfp_mask, order))
                return NULL;

        /*
         * Check the zones suitable for the gfp_mask contain at least one
         * valid zone. It's possible to have an empty zonelist as a result
         * of GFP_THISNODE and a memoryless node
         */
        if (unlikely(!zonelist->_zonerefs->zone))
                return NULL;

        /*
         * Will only have any effect when __GFP_KMEMCG is set. This is
         * verified in the (always inline) callee
         */
        if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
                return NULL;

retry_cpuset:
        cpuset_mems_cookie = get_mems_allowed();

        /* The preferred zone is used for statistics later */
        first_zones_zonelist(zonelist, high_zoneidx,
                                nodemask ? : &cpuset_current_mems_allowed,
                                &preferred_zone);
        if (!preferred_zone)
                goto out;

#ifdef CONFIG_CMA
        if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
#endif
        /* First allocation attempt */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                        zonelist, high_zoneidx, alloc_flags,
                        preferred_zone, migratetype);
        if (unlikely(!page)) {
                /*
                 * Runtime PM, block IO and its error handling path
                 * can deadlock because I/O on the device might not
                 * complete.
                 */
                gfp_mask = memalloc_noio_flags(gfp_mask);
                page = __alloc_pages_slowpath(gfp_mask, order,
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
        }

        trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
        /*
         * When updating a task's mems_allowed, it is possible to race with
         * parallel threads in such a way that an allocation can fail while
         * the mask is being updated. If a page allocation is about to fail,
         * check if the cpuset changed during allocation and if so, retry.
         */
        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
                goto retry_cpuset;

        memcg_kmem_commit_charge(page, memcg, order);

        return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);

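/*
 * [Editor's illustration, not part of mm/page_alloc.c] __alloc_pages_nodemask()
 * is normally reached through the alloc_pages()/alloc_pages_node() wrappers
 * from <linux/gfp.h>. A minimal sketch of a typical caller, which allocates
 * an order-2 block (4 contiguous pages) and frees it at the same order:
 *
 *      struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 2);
 *
 *      if (!page)
 *              return -ENOMEM;
 *      ...
 *      __free_pages(page, 2);
 */
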
/*
 * Common helper functions.
 */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
        struct page *page;

        /*
         * __get_free_pages() returns a 32-bit address, which cannot represent
         * a highmem page
         */
        VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

        page = alloc_pages(gfp_mask, order);
        if (!page)
                return 0;
        return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);

unsigned long get_zeroed_page(gfp_t gfp_mask)
{
        return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
}
EXPORT_SYMBOL(get_zeroed_page);

void __free_pages(struct page *page, unsigned int order)
{
        if (put_page_testzero(page)) {
                if (order == 0)
                        free_hot_cold_page(page, 0);
                else
                        __free_pages_ok(page, order);
        }
}

EXPORT_SYMBOL(__free_pages);

void free_pages(unsigned long addr, unsigned int order)
{
        if (addr != 0) {
                VM_BUG_ON(!virt_addr_valid((void *)addr));
                __free_pages(virt_to_page((void *)addr), order);
        }
}

EXPORT_SYMBOL(free_pages);

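/*
 * [Editor's illustration, not part of mm/page_alloc.c] The address-based
 * helpers above are paired the same way: memory obtained with
 * __get_free_pages() or get_zeroed_page() is returned with free_pages() or
 * free_page() at the same order. A minimal sketch:
 *
 *      unsigned long buf = __get_free_pages(GFP_KERNEL, 1);    (2 pages)
 *      unsigned long zeroed = get_zeroed_page(GFP_KERNEL);     (1 zeroed page)
 *
 *      if (!buf || !zeroed)
 *              goto fail;
 *      ...
 *      free_pages(buf, 1);
 *      free_page(zeroed);
 */
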
/*
 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
 * pages allocated with __GFP_KMEMCG.
 *
 * Those pages are accounted to a particular memcg, embedded in the
 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
 * for that information only to find out that it is NULL for users who have no
 * interest in that whatsoever, we provide these functions.
 *
 * The caller knows better which flags it relies on.
 */
void __free_memcg_kmem_pages(struct page *page, unsigned int order)
{
        memcg_kmem_uncharge_pages(page, order);
        __free_pages(page, order);
}

void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
{
        if (addr != 0) {
                VM_BUG_ON(!virt_addr_valid((void *)addr));
                __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
        }
}

static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
{
        if (addr) {
                unsigned long alloc_end = addr + (PAGE_SIZE << order);
                unsigned long used = addr + PAGE_ALIGN(size);

                split_page(virt_to_page((void *)addr), order);
                while (used < alloc_end) {
                        free_page(used);
                        used += PAGE_SIZE;
                }
        }
        return (void *)addr;
}

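/*
 * [Editor's note, not part of mm/page_alloc.c] Worked example of
 * make_alloc_exact(): for size = 5 * PAGE_SIZE the caller passes
 * order = get_order(size) = 3, i.e. an 8-page block. alloc_end is then
 * addr + 8 pages and used is addr + 5 pages (PAGE_ALIGN rounds up), so
 * split_page() breaks the order-3 block into order-0 pages and the loop
 * frees the trailing 3 pages; the caller keeps exactly the 5 pages asked for.
 */
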
/**
 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation
 *
 * This function is similar to alloc_pages(), except that it allocates the
 * minimum number of pages to satisfy the request. alloc_pages() can only
 * allocate memory in power-of-two pages.
 *
 * This function is also limited by MAX_ORDER.
 *
 * Memory allocated by this function must be released by free_pages_exact().
 */
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
        unsigned int order = get_order(size);
        unsigned long addr;

        addr = __get_free_pages(gfp_mask, order);
        return make_alloc_exact(addr, order, size);
}
EXPORT_SYMBOL(alloc_pages_exact);

/**
 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
 *                         pages on a node.
 * @nid: the preferred node ID where memory should be allocated
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation
 *
 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
 * back.
 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
 * but is not exact.
 */
void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
        unsigned order = get_order(size);
        struct page *p = alloc_pages_node(nid, gfp_mask, order);
        if (!p)
                return NULL;
        return make_alloc_exact((unsigned long)page_address(p), order, size);
}
EXPORT_SYMBOL(alloc_pages_exact_nid);

/**
 * free_pages_exact - release memory allocated via alloc_pages_exact()
 * @virt: the value returned by alloc_pages_exact.
 * @size: size of allocation, same value as passed to alloc_pages_exact().
 *
 * Release the memory allocated by a previous call to alloc_pages_exact.
 */
void free_pages_exact(void *virt, size_t size)
{
        unsigned long addr = (unsigned long)virt;
        unsigned long end = addr + PAGE_ALIGN(size);

        while (addr < end) {
                free_page(addr);
                addr += PAGE_SIZE;
        }
}
EXPORT_SYMBOL(free_pages_exact);

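/*
 * [Editor's illustration, not part of mm/page_alloc.c] A minimal sketch of the
 * alloc_pages_exact()/free_pages_exact() pairing; note that the same @size
 * must be passed to both calls so that exactly the kept page range is freed:
 *
 *      void *tbl = alloc_pages_exact(5 * PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
 *
 *      if (!tbl)
 *              return -ENOMEM;
 *      ...
 *      free_pages_exact(tbl, 5 * PAGE_SIZE);
 */
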
/**
 * nr_free_zone_pages - count number of pages beyond high watermark
 * @offset: The zone index of the highest zone
 *
 * nr_free_zone_pages() counts the number of pages which are beyond the
 * high watermark within all zones at or below a given zone index. For each
 * zone, the number of pages is calculated as:
 *     managed_pages - high_pages
 */
static unsigned long nr_free_zone_pages(int offset)
{
        struct zoneref *z;
        struct zone *zone;

        /* Just pick one node, since fallback list is circular */
        unsigned long sum = 0;

        struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);

        for_each_zone_zonelist(zone, z, zonelist, offset) {
                unsigned long size = zone->managed_pages;
                unsigned long high = high_wmark_pages(zone);
                if (size > high)
                        sum += size - high;
        }

        return sum;
}

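/*
 * [Editor's note, not part of mm/page_alloc.c] Example of the calculation
 * above: a zone with managed_pages = 1,000,000 and a high watermark of
 * 12,000 pages contributes 988,000 pages to the sum; a zone already at or
 * below its high watermark contributes nothing.
 */
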
/**
 * nr_free_buffer_pages - count number of pages beyond high watermark
 *
 * nr_free_buffer_pages() counts the number of pages which are beyond the high
 * watermark within ZONE_DMA and ZONE_NORMAL.
 */
unsigned long nr_free_buffer_pages(void)
{
        return nr_free_zone_pages(gfp_zone(GFP_USER));
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);

/**
 * nr_free_pagecache_pages - count number of pages beyond high watermark
 *
 * nr_free_pagecache_pages() counts the number of pages which are beyond the
 * high watermark within all zones.
 */
unsigned long nr_free_pagecache_pages(void)
{
        return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
}

static inline void show_node(struct zone *zone)
{
        if (IS_ENABLED(CONFIG_NUMA))
                printk("Node %d ", zone_to_nid(zone));
}

void si_meminfo(struct sysinfo *val)
{
        val->totalram = totalram_pages;
        val->sharedram = 0;
        val->freeram = global_page_state(NR_FREE_PAGES);
        val->bufferram = nr_blockdev_pages();
        val->totalhigh = totalhigh_pages;
        val->freehigh = nr_free_highpages();
        val->mem_unit = PAGE_SIZE;
}

EXPORT_SYMBOL(si_meminfo);

#ifdef CONFIG_NUMA
void si_meminfo_node(struct sysinfo *val, int nid)
{
        int zone_type;          /* needs to be signed */
        unsigned long managed_pages = 0;
        pg_data_t *pgdat = NODE_DATA(nid);

        for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
                managed_pages += pgdat->node_zones[zone_type].managed_pages;
        val->totalram = managed_pages;
        val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
        val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
        val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
                        NR_FREE_PAGES);
#else
        val->totalhigh = 0;
        val->freehigh = 0;
#endif
        val->mem_unit = PAGE_SIZE;
}
#endif

/*
 * Determine whether the node should be displayed or not, depending on whether
 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
 */
bool skip_free_areas_node(unsigned int flags, int nid)
{
        bool ret = false;
        unsigned int cpuset_mems_cookie;

        if (!(flags & SHOW_MEM_FILTER_NODES))
                goto out;

        do {
                cpuset_mems_cookie = get_mems_allowed();
                ret = !node_isset(nid, cpuset_current_mems_allowed);
        } while (!put_mems_allowed(cpuset_mems_cookie));
out:
        return ret;
}

#define K(x) ((x) << (PAGE_SHIFT-10))

static void show_migration_types(unsigned char type)
{
        static const char types[MIGRATE_TYPES] = {
                [MIGRATE_UNMOVABLE]     = 'U',
                [MIGRATE_RECLAIMABLE]   = 'E',
                [MIGRATE_MOVABLE]       = 'M',
                [MIGRATE_RESERVE]       = 'R',
#ifdef CONFIG_CMA
                [MIGRATE_CMA]           = 'C',
#endif
#ifdef CONFIG_MEMORY_ISOLATION
                [MIGRATE_ISOLATE]       = 'I',
#endif
        };
        char tmp[MIGRATE_TYPES + 1];
        char *p = tmp;
        int i;

        for (i = 0; i < MIGRATE_TYPES; i++) {
                if (type & (1 << i))
                        *p++ = types[i];
        }

        *p = '\0';
        printk("(%s) ", tmp);
}

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 * Suppresses nodes that are not allowed by current's cpuset if
 * SHOW_MEM_FILTER_NODES is passed.
 */
void show_free_areas(unsigned int filter)
{
        int cpu;
        struct zone *zone;

        for_each_populated_zone(zone) {
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                        continue;
                show_node(zone);
                printk("%s per-cpu:\n", zone->name);

                for_each_online_cpu(cpu) {
                        struct per_cpu_pageset *pageset;

                        pageset = per_cpu_ptr(zone->pageset, cpu);

                        printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
                               cpu, pageset->pcp.high,
                               pageset->pcp.batch, pageset->pcp.count);
                }
        }

        printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
                " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
                " unevictable:%lu"
                " dirty:%lu writeback:%lu unstable:%lu\n"
                " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
                " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
                " free_cma:%lu\n",
                global_page_state(NR_ACTIVE_ANON),
                global_page_state(NR_INACTIVE_ANON),
                global_page_state(NR_ISOLATED_ANON),
                global_page_state(NR_ACTIVE_FILE),
                global_page_state(NR_INACTIVE_FILE),
                global_page_state(NR_ISOLATED_FILE),
                global_page_state(NR_UNEVICTABLE),
                global_page_state(NR_FILE_DIRTY),
                global_page_state(NR_WRITEBACK),
                global_page_state(NR_UNSTABLE_NFS),
                global_page_state(NR_FREE_PAGES),
                global_page_state(NR_SLAB_RECLAIMABLE),
                global_page_state(NR_SLAB_UNRECLAIMABLE),
                global_page_state(NR_FILE_MAPPED),
                global_page_state(NR_SHMEM),
                global_page_state(NR_PAGETABLE),
                global_page_state(NR_BOUNCE),
                global_page_state(NR_FREE_CMA_PAGES));

        for_each_populated_zone(zone) {
                int i;

                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                        continue;
                show_node(zone);
                printk("%s"
                        " free:%lukB"
                        " min:%lukB"
                        " low:%lukB"
                        " high:%lukB"
                        " active_anon:%lukB"
                        " inactive_anon:%lukB"
                        " active_file:%lukB"
                        " inactive_file:%lukB"
                        " unevictable:%lukB"
                        " isolated(anon):%lukB"
                        " isolated(file):%lukB"
                        " present:%lukB"
                        " managed:%lukB"
                        " mlocked:%lukB"
                        " dirty:%lukB"
                        " writeback:%lukB"
                        " mapped:%lukB"
                        " shmem:%lukB"
                        " slab_reclaimable:%lukB"
                        " slab_unreclaimable:%lukB"
                        " kernel_stack:%lukB"
                        " pagetables:%lukB"
                        " unstable:%lukB"
                        " bounce:%lukB"
                        " free_cma:%lukB"
                        " writeback_tmp:%lukB"
                        " pages_scanned:%lu"
                        " all_unreclaimable? %s"
                        "\n",
                        zone->name,
                        K(zone_page_state(zone, NR_FREE_PAGES)),
                        K(min_wmark_pages(zone)),
                        K(low_wmark_pages(zone)),
                        K(high_wmark_pages(zone)),
                        K(zone_page_state(zone, NR_ACTIVE_ANON)),
                        K(zone_page_state(zone, NR_INACTIVE_ANON)),
                        K(zone_page_state(zone, NR_ACTIVE_FILE)),
                        K(zone_page_state(zone, NR_INACTIVE_FILE)),
                        K(zone_page_state(zone, NR_UNEVICTABLE)),
                        K(zone_page_state(zone, NR_ISOLATED_ANON)),
                        K(zone_page_state(zone, NR_ISOLATED_FILE)),
                        K(zone->present_pages),
                        K(zone->managed_pages),
                        K(zone_page_state(zone, NR_MLOCK)),
                        K(zone_page_state(zone, NR_FILE_DIRTY)),
                        K(zone_page_state(zone, NR_WRITEBACK)),
                        K(zone_page_state(zone, NR_FILE_MAPPED)),
                        K(zone_page_state(zone, NR_SHMEM)),
                        K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
                        K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
                        zone_page_state(zone, NR_KERNEL_STACK) *
                                THREAD_SIZE / 1024,
                        K(zone_page_state(zone, NR_PAGETABLE)),
                        K(zone_page_state(zone, NR_UNSTABLE_NFS)),
                        K(zone_page_state(zone, NR_BOUNCE)),
                        K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
                        K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
                        zone->pages_scanned,
                        (!zone_reclaimable(zone) ? "yes" : "no")
                        );
                printk("lowmem_reserve[]:");
                for (i = 0; i < MAX_NR_ZONES; i++)
                        printk(" %lu", zone->lowmem_reserve[i]);
                printk("\n");
        }

        for_each_populated_zone(zone) {
                unsigned long nr[MAX_ORDER], flags, order, total = 0;
                unsigned char types[MAX_ORDER];

                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                        continue;
                show_node(zone);
                printk("%s: ", zone->name);

                spin_lock_irqsave(&zone->lock, flags);
                for (order = 0; order < MAX_ORDER; order++) {
                        struct free_area *area = &zone->free_area[order];
                        int type;

                        nr[order] = area->nr_free;
                        total += nr[order] << order;

                        types[order] = 0;
                        for (type = 0; type < MIGRATE_TYPES; type++) {
                                if (!list_empty(&area->free_list[type]))
                                        types[order] |= 1 << type;
                        }
                }
                spin_unlock_irqrestore(&zone->lock, flags);
                for (order = 0; order < MAX_ORDER; order++) {
                        printk("%lu*%lukB ", nr[order], K(1UL) << order);
                        if (nr[order])
                                show_migration_types(types[order]);
                }
                printk("= %lukB\n", K(total));
        }

        hugetlb_show_meminfo();

        printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));

        show_swap_cache_info();
}

static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
        zoneref->zone = zone;
        zoneref->zone_idx = zone_idx(zone);
}

/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
                                int nr_zones)
{
        struct zone *zone;
        enum zone_type zone_type = MAX_NR_ZONES;

        do {
                zone_type--;
                zone = pgdat->node_zones + zone_type;
                if (populated_zone(zone)) {
                        zoneref_set_zone(zone,
                                &zonelist->_zonerefs[nr_zones++]);
                        check_highest_zone(zone_type);
                }
        } while (zone_type);

        return nr_zones;
}


/*
 *  zonelist_order:
 *  0 = automatic detection of better ordering.
 *  1 = order by ([node] distance, -zonetype)
 *  2 = order by (-zonetype, [node] distance)
 *
 *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
 *  the same zonelist. So only NUMA can configure this param.
 */
#define ZONELIST_ORDER_DEFAULT  0
#define ZONELIST_ORDER_NODE     1
#define ZONELIST_ORDER_ZONE     2

/* zonelist order in the kernel.
 * set_zonelist_order() will set this to NODE or ZONE.
 */
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};


#ifdef CONFIG_NUMA
/* The value user specified ....changed by config */
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
/* string for sysctl */
#define NUMA_ZONELIST_ORDER_LEN 16
char numa_zonelist_order[16] = "default";

/*
 * interface for configuring zonelist ordering.
 * command line option "numa_zonelist_order"
 *      = "[dD]efault   - default, automatic configuration.
 *      = "[nN]ode      - order by node locality, then by zone within node
 *      = "[zZ]one      - order by zone, then by locality within zone
 */

static int __parse_numa_zonelist_order(char *s)
{
        if (*s == 'd' || *s == 'D') {
                user_zonelist_order = ZONELIST_ORDER_DEFAULT;
        } else if (*s == 'n' || *s == 'N') {
                user_zonelist_order = ZONELIST_ORDER_NODE;
        } else if (*s == 'z' || *s == 'Z') {
                user_zonelist_order = ZONELIST_ORDER_ZONE;
        } else {
                printk(KERN_WARNING
                        "Ignoring invalid numa_zonelist_order value: "
                        "%s\n", s);
                return -EINVAL;
        }
        return 0;
}

static __init int setup_numa_zonelist_order(char *s)
{
        int ret;

        if (!s)
                return 0;

        ret = __parse_numa_zonelist_order(s);
        if (ret == 0)
                strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);

        return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);

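/*
 * [Editor's illustration, not part of mm/page_alloc.c] The ordering can be
 * chosen either on the kernel command line, e.g.
 *
 *      numa_zonelist_order=zone
 *
 * or at runtime through the sysctl handled below, e.g.
 *
 *      echo node > /proc/sys/vm/numa_zonelist_order
 *
 * Both paths end up in __parse_numa_zonelist_order(), which only inspects
 * the first character ([dD]/[nN]/[zZ]).
 */
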
/*
 * sysctl handler for numa_zonelist_order
 */
int numa_zonelist_order_handler(ctl_table *table, int write,
                void __user *buffer, size_t *length,
                loff_t *ppos)
{
        char saved_string[NUMA_ZONELIST_ORDER_LEN];
        int ret;
        static DEFINE_MUTEX(zl_order_mutex);

        mutex_lock(&zl_order_mutex);
        if (write) {
                if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
                        ret = -EINVAL;
                        goto out;
                }
                strcpy(saved_string, (char *)table->data);
        }
        ret = proc_dostring(table, write, buffer, length, ppos);
        if (ret)
                goto out;
        if (write) {
                int oldval = user_zonelist_order;

                ret = __parse_numa_zonelist_order((char *)table->data);
                if (ret) {
                        /*
                         * bogus value.  restore saved string
                         */
                        strncpy((char *)table->data, saved_string,
                                NUMA_ZONELIST_ORDER_LEN);
                        user_zonelist_order = oldval;
                } else if (oldval != user_zonelist_order) {
                        mutex_lock(&zonelists_mutex);
                        build_all_zonelists(NULL, NULL);
                        mutex_unlock(&zonelists_mutex);
                }
        }
out:
        mutex_unlock(&zl_order_mutex);
        return ret;
}


#define MAX_NODE_LOAD (nr_online_nodes)
static int node_load[MAX_NUMNODES];

/**
 * find_next_best_node - find the next node that should appear in a given node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask_t of already used nodes
 *
 * We use a number of factors to determine which is the next node that should
 * appear on a given node's fallback list.  The node should not have appeared
 * already in @node's fallback list, and it should be the next closest node
 * according to the distance array (which contains arbitrary distance values
 * from each node to each node in the system), and should also prefer nodes
 * with no CPUs, since presumably they'll have very little allocation pressure
 * on them otherwise.
 * It returns -1 if no node is found.
 */
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
        int n, val;
        int min_val = INT_MAX;
        int best_node = NUMA_NO_NODE;
        const struct cpumask *tmp = cpumask_of_node(0);

        /* Use the local node if we haven't already */
        if (!node_isset(node, *used_node_mask)) {
                node_set(node, *used_node_mask);
                return node;
        }

        for_each_node_state(n, N_MEMORY) {

                /* Don't want a node to appear more than once */
                if (node_isset(n, *used_node_mask))
                        continue;

                /* Use the distance array to find the distance */
                val = node_distance(node, n);

                /* Penalize nodes under us ("prefer the next node") */
                val += (n < node);

                /* Give preference to headless and unused nodes */
                tmp = cpumask_of_node(n);
                if (!cpumask_empty(tmp))
                        val += PENALTY_FOR_NODE_WITH_CPUS;

                /* Slight preference for less loaded node */
                val *= (MAX_NODE_LOAD*MAX_NUMNODES);
                val += node_load[n];

                if (val < min_val) {
                        min_val = val;
                        best_node = n;
                }
        }

        if (best_node >= 0)
                node_set(best_node, *used_node_mask);

        return best_node;
}

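/*
 * [Editor's note, not part of mm/page_alloc.c] Example of the scoring in
 * find_next_best_node(): for a candidate node n the base score is
 * node_distance(node, n), plus 1 if n < node, plus
 * PENALTY_FOR_NODE_WITH_CPUS if n has CPUs. That sum is then scaled by
 * MAX_NODE_LOAD * MAX_NUMNODES and node_load[n] is added as a tie-breaker,
 * so load only decides between nodes whose base score is equal; the lowest
 * score wins.
 */
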

/*
 * Build zonelists ordered by node and zones within node.
 * This results in maximum locality--normal zone overflows into local
 * DMA zone, if any--but risks exhausting DMA zone.
 */
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
{
        int j;
        struct zonelist *zonelist;

        zonelist = &pgdat->node_zonelists[0];
        for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
                ;
        j = build_zonelists_node(NODE_DATA(node), zonelist, j);
        zonelist->_zonerefs[j].zone = NULL;
        zonelist->_zonerefs[j].zone_idx = 0;
}

/*
 * Build gfp_thisnode zonelists
 */
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
        int j;
        struct zonelist *zonelist;

        zonelist = &pgdat->node_zonelists[1];
        j = build_zonelists_node(pgdat, zonelist, 0);
        zonelist->_zonerefs[j].zone = NULL;
        zonelist->_zonerefs[j].zone_idx = 0;
}

/*
 * Build zonelists ordered by zone and nodes within zones.
 * This results in conserving DMA zone[s] until all Normal memory is
 * exhausted, but results in overflowing to remote node while memory
 * may still exist in local DMA zone.
 */
static int node_order[MAX_NUMNODES];

static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
{
        int pos, j, node;
        int zone_type;          /* needs to be signed */
        struct zone *z;
        struct zonelist *zonelist;

        zonelist = &pgdat->node_zonelists[0];
        pos = 0;
        for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
                for (j = 0; j < nr_nodes; j++) {
                        node = node_order[j];
                        z = &NODE_DATA(node)->node_zones[zone_type];
                        if (populated_zone(z)) {
                                zoneref_set_zone(z,
                                        &zonelist->_zonerefs[pos++]);
                                check_highest_zone(zone_type);
                        }
                }
        }
        zonelist->_zonerefs[pos].zone = NULL;
        zonelist->_zonerefs[pos].zone_idx = 0;
}

static int default_zonelist_order(void)
{
        int nid, zone_type;
        unsigned long low_kmem_size, total_size;
        struct zone *z;
        int average_size;
        /*
         * ZONE_DMA and ZONE_DMA32 can be a very small area in the system.
         * If they are really small and used heavily, the system can fall
         * into OOM very easily.
         * This function detects ZONE_DMA/DMA32 size and configures zone order.
         */
        /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
        low_kmem_size = 0;
        total_size = 0;
        for_each_online_node(nid) {
                for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
                        z = &NODE_DATA(nid)->node_zones[zone_type];
                        if (populated_zone(z)) {
                                if (zone_type < ZONE_NORMAL)
                                        low_kmem_size += z->managed_pages;
                                total_size += z->managed_pages;
                        } else if (zone_type == ZONE_NORMAL) {
                                /*
                                 * If any node has only lowmem, then node order
                                 * is preferred to allow kernel allocations
                                 * locally; otherwise, they can easily infringe
                                 * on other nodes when there is an abundance of
                                 * lowmem available to allocate from.
                                 */
                                return ZONELIST_ORDER_NODE;
                        }
                }
        }
        if (!low_kmem_size ||  /* there is no DMA area. */
            low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
                return ZONELIST_ORDER_NODE;
        /*
         * look into each node's config.
         * If there is a node whose DMA/DMA32 memory is very big area on
         * local memory, NODE_ORDER may be suitable.
         */
        average_size = total_size /
                        (nodes_weight(node_states[N_MEMORY]) + 1);
        for_each_online_node(nid) {
                low_kmem_size = 0;
                total_size = 0;
                for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
                        z = &NODE_DATA(nid)->node_zones[zone_type];
                        if (populated_zone(z)) {
                                if (zone_type < ZONE_NORMAL)
                                        low_kmem_size += z->present_pages;
                                total_size += z->present_pages;
                        }
                }
                if (low_kmem_size &&
                    total_size > average_size && /* ignore small node */
                    low_kmem_size > total_size * 70/100)
                        return ZONELIST_ORDER_NODE;
        }
        return ZONELIST_ORDER_ZONE;
}

static void set_zonelist_order(void)
{
        if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
                current_zonelist_order = default_zonelist_order();
        else
                current_zonelist_order = user_zonelist_order;
}

static void build_zonelists(pg_data_t *pgdat)
{
        int j, node, load;
        enum zone_type i;
        nodemask_t used_mask;
        int local_node, prev_node;
        struct zonelist *zonelist;
        int order = current_zonelist_order;

        /* initialize zonelists */
        for (i = 0; i < MAX_ZONELISTS; i++) {
                zonelist = pgdat->node_zonelists + i;
                zonelist->_zonerefs[0].zone = NULL;
                zonelist->_zonerefs[0].zone_idx = 0;
        }

        /* NUMA-aware ordering of nodes */
        local_node = pgdat->node_id;
        load = nr_online_nodes;
        prev_node = local_node;
        nodes_clear(used_mask);

        memset(node_order, 0, sizeof(node_order));
        j = 0;

        while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
                /*
                 * We don't want to pressure a particular node.
                 * So we add a penalty to the first node in the same
                 * distance group to make it round-robin.
                 */
                if (node_distance(local_node, node) !=
                    node_distance(local_node, prev_node))
                        node_load[node] = load;

                prev_node = node;
                load--;
                if (order == ZONELIST_ORDER_NODE)
                        build_zonelists_in_node_order(pgdat, node);
                else
                        node_order[j++] = node; /* remember order */
        }

        if (order == ZONELIST_ORDER_ZONE) {
                /* calculate node order -- i.e., DMA last! */
                build_zonelists_in_zone_order(pgdat, j);
        }

        build_thisnode_zonelists(pgdat);
}

/* Construct the zonelist performance cache - see further mmzone.h */
static void build_zonelist_cache(pg_data_t *pgdat)
{
        struct zonelist *zonelist;
        struct zonelist_cache *zlc;
        struct zoneref *z;

        zonelist = &pgdat->node_zonelists[0];
        zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
        bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
        for (z = zonelist->_zonerefs; z->zone; z++)
                zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
}

3622 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3617 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3623 /* 3618 /*
3624 * Return node id of node used for "local" allocations. 3619 * Return node id of node used for "local" allocations.
3625 * I.e., first node id of first zone in arg node's generic zonelist. 3620 * I.e., first node id of first zone in arg node's generic zonelist.
3626 * Used for initializing percpu 'numa_mem', which is used primarily 3621 * Used for initializing percpu 'numa_mem', which is used primarily
3627 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3622 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3628 */ 3623 */
3629 int local_memory_node(int node) 3624 int local_memory_node(int node)
3630 { 3625 {
3631 struct zone *zone; 3626 struct zone *zone;
3632 3627
3633 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3628 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3634 gfp_zone(GFP_KERNEL), 3629 gfp_zone(GFP_KERNEL),
3635 NULL, 3630 NULL,
3636 &zone); 3631 &zone);
3637 return zone->node; 3632 return zone->node;
3638 } 3633 }
3639 #endif 3634 #endif
3640 3635
3641 #else /* CONFIG_NUMA */ 3636 #else /* CONFIG_NUMA */
3642 3637
3643 static void set_zonelist_order(void) 3638 static void set_zonelist_order(void)
3644 { 3639 {
3645 current_zonelist_order = ZONELIST_ORDER_ZONE; 3640 current_zonelist_order = ZONELIST_ORDER_ZONE;
3646 } 3641 }
3647 3642
3648 static void build_zonelists(pg_data_t *pgdat) 3643 static void build_zonelists(pg_data_t *pgdat)
3649 { 3644 {
3650 int node, local_node; 3645 int node, local_node;
3651 enum zone_type j; 3646 enum zone_type j;
3652 struct zonelist *zonelist; 3647 struct zonelist *zonelist;
3653 3648
3654 local_node = pgdat->node_id; 3649 local_node = pgdat->node_id;
3655 3650
3656 zonelist = &pgdat->node_zonelists[0]; 3651 zonelist = &pgdat->node_zonelists[0];
3657 j = build_zonelists_node(pgdat, zonelist, 0); 3652 j = build_zonelists_node(pgdat, zonelist, 0);
3658 3653
3659 /* 3654 /*
3660 * Now we build the zonelist so that it contains the zones 3655 * Now we build the zonelist so that it contains the zones
3661 * of all the other nodes. 3656 * of all the other nodes.
3662 * We don't want to pressure a particular node, so when 3657 * We don't want to pressure a particular node, so when
3663 * building the zones for node N, we make sure that the 3658 * building the zones for node N, we make sure that the
3664 * zones coming right after the local ones are those from 3659 * zones coming right after the local ones are those from
3665 * node N+1 (modulo N) 3660 * node N+1 (modulo N)
3666 */ 3661 */
3667 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3662 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3668 if (!node_online(node)) 3663 if (!node_online(node))
3669 continue; 3664 continue;
3670 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3665 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3671 } 3666 }
3672 for (node = 0; node < local_node; node++) { 3667 for (node = 0; node < local_node; node++) {
3673 if (!node_online(node)) 3668 if (!node_online(node))
3674 continue; 3669 continue;
3675 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3670 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3676 } 3671 }
3677 3672
3678 zonelist->_zonerefs[j].zone = NULL; 3673 zonelist->_zonerefs[j].zone = NULL;
3679 zonelist->_zonerefs[j].zone_idx = 0; 3674 zonelist->_zonerefs[j].zone_idx = 0;
3680 } 3675 }
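The node ordering built by the two loops above is a simple round-robin starting just after the local node. A minimal userspace sketch (the node count and local node below are hypothetical values, not taken from the commit) reproduces the interleaving:

#include <stdio.h>

/* Sketch of the fallback ordering above: local node first, then the
 * nodes after it, then the nodes before it (wrapping around). */
int main(void)
{
	int nr_nodes = 4;	/* hypothetical number of online nodes */
	int local_node = 2;	/* hypothetical pgdat->node_id */
	int node;

	printf("%d", local_node);
	for (node = local_node + 1; node < nr_nodes; node++)
		printf(" %d", node);
	for (node = 0; node < local_node; node++)
		printf(" %d", node);
	printf("\n");		/* prints: 2 3 0 1 */
	return 0;
}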
3681 3676
3682 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3677 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3683 static void build_zonelist_cache(pg_data_t *pgdat) 3678 static void build_zonelist_cache(pg_data_t *pgdat)
3684 { 3679 {
3685 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3680 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3686 } 3681 }
3687 3682
3688 #endif /* CONFIG_NUMA */ 3683 #endif /* CONFIG_NUMA */
3689 3684
3690 /* 3685 /*
3691 * Boot pageset table. One per cpu which is going to be used for all 3686 * Boot pageset table. One per cpu which is going to be used for all
3692 * zones and all nodes. The parameters will be set in such a way 3687 * zones and all nodes. The parameters will be set in such a way
3693 * that an item put on a list will immediately be handed over to 3688 * that an item put on a list will immediately be handed over to
3694 * the buddy list. This is safe since pageset manipulation is done 3689 * the buddy list. This is safe since pageset manipulation is done
3695 * with interrupts disabled. 3690 * with interrupts disabled.
3696 * 3691 *
3697 * The boot_pagesets must be kept even after bootup is complete for 3692 * The boot_pagesets must be kept even after bootup is complete for
3698 * unused processors and/or zones. They do play a role for bootstrapping 3693 * unused processors and/or zones. They do play a role for bootstrapping
3699 * hotplugged processors. 3694 * hotplugged processors.
3700 * 3695 *
3701 * zoneinfo_show() and maybe other functions do 3696 * zoneinfo_show() and maybe other functions do
3702 * not check if the processor is online before following the pageset pointer. 3697 * not check if the processor is online before following the pageset pointer.
3703 * Other parts of the kernel may not check if the zone is available. 3698 * Other parts of the kernel may not check if the zone is available.
3704 */ 3699 */
3705 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3700 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3706 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3701 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3707 static void setup_zone_pageset(struct zone *zone); 3702 static void setup_zone_pageset(struct zone *zone);
3708 3703
3709 /* 3704 /*
3710 * Global mutex to protect against size modification of zonelists 3705 * Global mutex to protect against size modification of zonelists
3711 * as well as to serialize pageset setup for the new populated zone. 3706 * as well as to serialize pageset setup for the new populated zone.
3712 */ 3707 */
3713 DEFINE_MUTEX(zonelists_mutex); 3708 DEFINE_MUTEX(zonelists_mutex);
3714 3709
3715 /* the return value is int just for stop_machine() */ 3710 /* the return value is int just for stop_machine() */
3716 static int __build_all_zonelists(void *data) 3711 static int __build_all_zonelists(void *data)
3717 { 3712 {
3718 int nid; 3713 int nid;
3719 int cpu; 3714 int cpu;
3720 pg_data_t *self = data; 3715 pg_data_t *self = data;
3721 3716
3722 #ifdef CONFIG_NUMA 3717 #ifdef CONFIG_NUMA
3723 memset(node_load, 0, sizeof(node_load)); 3718 memset(node_load, 0, sizeof(node_load));
3724 #endif 3719 #endif
3725 3720
3726 if (self && !node_online(self->node_id)) { 3721 if (self && !node_online(self->node_id)) {
3727 build_zonelists(self); 3722 build_zonelists(self);
3728 build_zonelist_cache(self); 3723 build_zonelist_cache(self);
3729 } 3724 }
3730 3725
3731 for_each_online_node(nid) { 3726 for_each_online_node(nid) {
3732 pg_data_t *pgdat = NODE_DATA(nid); 3727 pg_data_t *pgdat = NODE_DATA(nid);
3733 3728
3734 build_zonelists(pgdat); 3729 build_zonelists(pgdat);
3735 build_zonelist_cache(pgdat); 3730 build_zonelist_cache(pgdat);
3736 } 3731 }
3737 3732
3738 /* 3733 /*
3739 * Initialize the boot_pagesets that are going to be used 3734 * Initialize the boot_pagesets that are going to be used
3740 * for bootstrapping processors. The real pagesets for 3735 * for bootstrapping processors. The real pagesets for
3741 * each zone will be allocated later when the per cpu 3736 * each zone will be allocated later when the per cpu
3742 * allocator is available. 3737 * allocator is available.
3743 * 3738 *
3744 * boot_pagesets are also used for bootstrapping offline 3739 * boot_pagesets are also used for bootstrapping offline
3745 * cpus if the system is already booted because the pagesets 3740 * cpus if the system is already booted because the pagesets
3746 * are needed to initialize allocators on a specific cpu too. 3741 * are needed to initialize allocators on a specific cpu too.
3747 * F.e. the percpu allocator needs the page allocator which 3742 * F.e. the percpu allocator needs the page allocator which
3748 * needs the percpu allocator in order to allocate its pagesets 3743 * needs the percpu allocator in order to allocate its pagesets
3749 * (a chicken-egg dilemma). 3744 * (a chicken-egg dilemma).
3750 */ 3745 */
3751 for_each_possible_cpu(cpu) { 3746 for_each_possible_cpu(cpu) {
3752 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3747 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3753 3748
3754 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3749 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3755 /* 3750 /*
3756 * We now know the "local memory node" for each node-- 3751 * We now know the "local memory node" for each node--
3757 * i.e., the node of the first zone in the generic zonelist. 3752 * i.e., the node of the first zone in the generic zonelist.
3758 * Set up numa_mem percpu variable for on-line cpus. During 3753 * Set up numa_mem percpu variable for on-line cpus. During
3759 * boot, only the boot cpu should be on-line; we'll init the 3754 * boot, only the boot cpu should be on-line; we'll init the
3760 * secondary cpus' numa_mem as they come on-line. During 3755 * secondary cpus' numa_mem as they come on-line. During
3761 * node/memory hotplug, we'll fixup all on-line cpus. 3756 * node/memory hotplug, we'll fixup all on-line cpus.
3762 */ 3757 */
3763 if (cpu_online(cpu)) 3758 if (cpu_online(cpu))
3764 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3759 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3765 #endif 3760 #endif
3766 } 3761 }
3767 3762
3768 return 0; 3763 return 0;
3769 } 3764 }
3770 3765
3771 /* 3766 /*
3772 * Called with zonelists_mutex held always 3767 * Called with zonelists_mutex held always
3773 * unless system_state == SYSTEM_BOOTING. 3768 * unless system_state == SYSTEM_BOOTING.
3774 */ 3769 */
3775 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3770 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3776 { 3771 {
3777 set_zonelist_order(); 3772 set_zonelist_order();
3778 3773
3779 if (system_state == SYSTEM_BOOTING) { 3774 if (system_state == SYSTEM_BOOTING) {
3780 __build_all_zonelists(NULL); 3775 __build_all_zonelists(NULL);
3781 mminit_verify_zonelist(); 3776 mminit_verify_zonelist();
3782 cpuset_init_current_mems_allowed(); 3777 cpuset_init_current_mems_allowed();
3783 } else { 3778 } else {
3784 #ifdef CONFIG_MEMORY_HOTPLUG 3779 #ifdef CONFIG_MEMORY_HOTPLUG
3785 if (zone) 3780 if (zone)
3786 setup_zone_pageset(zone); 3781 setup_zone_pageset(zone);
3787 #endif 3782 #endif
3788 /* we have to stop all cpus to guarantee there is no user 3783 /* we have to stop all cpus to guarantee there is no user
3789 of zonelist */ 3784 of zonelist */
3790 stop_machine(__build_all_zonelists, pgdat, NULL); 3785 stop_machine(__build_all_zonelists, pgdat, NULL);
3791 /* cpuset refresh routine should be here */ 3786 /* cpuset refresh routine should be here */
3792 } 3787 }
3793 vm_total_pages = nr_free_pagecache_pages(); 3788 vm_total_pages = nr_free_pagecache_pages();
3794 /* 3789 /*
3795 * Disable grouping by mobility if the number of pages in the 3790 * Disable grouping by mobility if the number of pages in the
3796 * system is too low to allow the mechanism to work. It would be 3791 * system is too low to allow the mechanism to work. It would be
3797 * more accurate, but expensive to check per-zone. This check is 3792 * more accurate, but expensive to check per-zone. This check is
3798 * made on memory-hotadd so a system can start with mobility 3793 * made on memory-hotadd so a system can start with mobility
3799 * disabled and enable it later 3794 * disabled and enable it later
3800 */ 3795 */
3801 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3796 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3802 page_group_by_mobility_disabled = 1; 3797 page_group_by_mobility_disabled = 1;
3803 else 3798 else
3804 page_group_by_mobility_disabled = 0; 3799 page_group_by_mobility_disabled = 0;
3805 3800
3806 printk("Built %i zonelists in %s order, mobility grouping %s. " 3801 printk("Built %i zonelists in %s order, mobility grouping %s. "
3807 "Total pages: %ld\n", 3802 "Total pages: %ld\n",
3808 nr_online_nodes, 3803 nr_online_nodes,
3809 zonelist_order_name[current_zonelist_order], 3804 zonelist_order_name[current_zonelist_order],
3810 page_group_by_mobility_disabled ? "off" : "on", 3805 page_group_by_mobility_disabled ? "off" : "on",
3811 vm_total_pages); 3806 vm_total_pages);
3812 #ifdef CONFIG_NUMA 3807 #ifdef CONFIG_NUMA
3813 printk("Policy zone: %s\n", zone_names[policy_zone]); 3808 printk("Policy zone: %s\n", zone_names[policy_zone]);
3814 #endif 3809 #endif
3815 } 3810 }
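For a sense of scale for the mobility-grouping cutoff above (a rough sketch assuming pageblock_nr_pages = 512 and MIGRATE_TYPES = 6, both of which are configuration dependent): grouping is disabled when the system has fewer than 512 * 6 = 3072 pages, which with 4KiB pages is only about 12MiB of memory.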
3816 3811
3817 /* 3812 /*
3818 * Helper functions to size the waitqueue hash table. 3813 * Helper functions to size the waitqueue hash table.
3819 * Essentially these want to choose hash table sizes sufficiently 3814 * Essentially these want to choose hash table sizes sufficiently
3820 * large so that collisions trying to wait on pages are rare. 3815 * large so that collisions trying to wait on pages are rare.
3821 * But in fact, the number of active page waitqueues on typical 3816 * But in fact, the number of active page waitqueues on typical
3822 * systems is ridiculously low, less than 200. So this is still 3817 * systems is ridiculously low, less than 200. So this is still
3823 * conservative, even though it seems large. 3818 * conservative, even though it seems large.
3824 * 3819 *
3825 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3820 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3826 * waitqueues, i.e. the size of the waitq table given the number of pages. 3821 * waitqueues, i.e. the size of the waitq table given the number of pages.
3827 */ 3822 */
3828 #define PAGES_PER_WAITQUEUE 256 3823 #define PAGES_PER_WAITQUEUE 256
3829 3824
3830 #ifndef CONFIG_MEMORY_HOTPLUG 3825 #ifndef CONFIG_MEMORY_HOTPLUG
3831 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3826 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3832 { 3827 {
3833 unsigned long size = 1; 3828 unsigned long size = 1;
3834 3829
3835 pages /= PAGES_PER_WAITQUEUE; 3830 pages /= PAGES_PER_WAITQUEUE;
3836 3831
3837 while (size < pages) 3832 while (size < pages)
3838 size <<= 1; 3833 size <<= 1;
3839 3834
3840 /* 3835 /*
3841 * Once we have dozens or even hundreds of threads sleeping 3836 * Once we have dozens or even hundreds of threads sleeping
3842 * on IO we've got bigger problems than wait queue collision. 3837 * on IO we've got bigger problems than wait queue collision.
3843 * Limit the size of the wait table to a reasonable size. 3838 * Limit the size of the wait table to a reasonable size.
3844 */ 3839 */
3845 size = min(size, 4096UL); 3840 size = min(size, 4096UL);
3846 3841
3847 return max(size, 4UL); 3842 return max(size, 4UL);
3848 } 3843 }
3849 #else 3844 #else
3850 /* 3845 /*
3851 * A zone's size might be changed by hot-add, so it is not possible to determine 3846 * A zone's size might be changed by hot-add, so it is not possible to determine
3852 * a suitable size for its wait_table. So we use the maximum size now. 3847 * a suitable size for its wait_table. So we use the maximum size now.
3853 * 3848 *
3854 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3849 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3855 * 3850 *
3856 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3851 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3857 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3852 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3858 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3853 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3859 * 3854 *
3860 * The maximum number of entries is reached once a zone's memory is (512K + 256) 3855 * The maximum number of entries is reached once a zone's memory is (512K + 256)
3861 * pages or more when sized the traditional way (see above). That corresponds to: 3856 * pages or more when sized the traditional way (see above). That corresponds to:
3862 * 3857 *
3863 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3858 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3864 * ia64(16K page size) : = ( 8G + 4M)byte. 3859 * ia64(16K page size) : = ( 8G + 4M)byte.
3865 * powerpc (64K page size) : = (32G +16M)byte. 3860 * powerpc (64K page size) : = (32G +16M)byte.
3866 */ 3861 */
3867 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3862 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3868 { 3863 {
3869 return 4096UL; 3864 return 4096UL;
3870 } 3865 }
3871 #endif 3866 #endif
3872 3867
3873 /* 3868 /*
3874 * This is an integer logarithm so that shifts can be used later 3869 * This is an integer logarithm so that shifts can be used later
3875 * to extract the more random high bits from the multiplicative 3870 * to extract the more random high bits from the multiplicative
3876 * hash function before the remainder is taken. 3871 * hash function before the remainder is taken.
3877 */ 3872 */
3878 static inline unsigned long wait_table_bits(unsigned long size) 3873 static inline unsigned long wait_table_bits(unsigned long size)
3879 { 3874 {
3880 return ffz(~size); 3875 return ffz(~size);
3881 } 3876 }
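To make the wait-table sizing above concrete, here is a small userspace sketch (PAGES_PER_WAITQUEUE follows the definition above; the zone size is hypothetical, and hash_nr_entries()/table_bits() are local stand-ins for the kernel helpers, with table_bits() spelling out what ffz(~size) computes for a power-of-two size):

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* next power of two >= pages / PAGES_PER_WAITQUEUE, clamped to [4, 4096] */
static unsigned long hash_nr_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

/* log2 of a power-of-two size, the userspace analogue of ffz(~size) */
static unsigned long table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	/* a hypothetical 1GiB zone with 4KiB pages: 262144 pages */
	unsigned long pages = 262144;
	unsigned long size = hash_nr_entries(pages);

	printf("%lu entries, %lu bits\n", size, table_bits(size));
	/* prints: 1024 entries, 10 bits */
	return 0;
}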
3882 3877
3883 /* 3878 /*
3884 * Check if a pageblock contains reserved pages 3879 * Check if a pageblock contains reserved pages
3885 */ 3880 */
3886 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3881 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3887 { 3882 {
3888 unsigned long pfn; 3883 unsigned long pfn;
3889 3884
3890 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3885 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3891 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3886 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3892 return 1; 3887 return 1;
3893 } 3888 }
3894 return 0; 3889 return 0;
3895 } 3890 }
3896 3891
3897 /* 3892 /*
3898 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3893 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3899 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3894 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3900 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3895 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3901 * higher will lead to a bigger reserve which will get freed as contiguous 3896 * higher will lead to a bigger reserve which will get freed as contiguous
3902 * blocks as reclaim kicks in 3897 * blocks as reclaim kicks in
3903 */ 3898 */
3904 static void setup_zone_migrate_reserve(struct zone *zone) 3899 static void setup_zone_migrate_reserve(struct zone *zone)
3905 { 3900 {
3906 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3901 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3907 struct page *page; 3902 struct page *page;
3908 unsigned long block_migratetype; 3903 unsigned long block_migratetype;
3909 int reserve; 3904 int reserve;
3910 3905
3911 /* 3906 /*
3912 * Get the start pfn, end pfn and the number of blocks to reserve 3907 * Get the start pfn, end pfn and the number of blocks to reserve
3913 * We have to be careful to be aligned to pageblock_nr_pages to 3908 * We have to be careful to be aligned to pageblock_nr_pages to
3914 * make sure that we always check pfn_valid for the first page in 3909 * make sure that we always check pfn_valid for the first page in
3915 * the block. 3910 * the block.
3916 */ 3911 */
3917 start_pfn = zone->zone_start_pfn; 3912 start_pfn = zone->zone_start_pfn;
3918 end_pfn = zone_end_pfn(zone); 3913 end_pfn = zone_end_pfn(zone);
3919 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3914 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3920 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3915 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3921 pageblock_order; 3916 pageblock_order;
3922 3917
3923 /* 3918 /*
3924 * Reserve blocks are generally in place to help high-order atomic 3919 * Reserve blocks are generally in place to help high-order atomic
3925 * allocations that are short-lived. A min_free_kbytes value that 3920 * allocations that are short-lived. A min_free_kbytes value that
3926 * would result in more than 2 reserve blocks for atomic allocations 3921 * would result in more than 2 reserve blocks for atomic allocations
3927 * is assumed to be in place to help anti-fragmentation for the 3922 * is assumed to be in place to help anti-fragmentation for the
3928 * future allocation of hugepages at runtime. 3923 * future allocation of hugepages at runtime.
3929 */ 3924 */
3930 reserve = min(2, reserve); 3925 reserve = min(2, reserve);
3931 3926
3932 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3927 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3933 if (!pfn_valid(pfn)) 3928 if (!pfn_valid(pfn))
3934 continue; 3929 continue;
3935 page = pfn_to_page(pfn); 3930 page = pfn_to_page(pfn);
3936 3931
3937 /* Watch out for overlapping nodes */ 3932 /* Watch out for overlapping nodes */
3938 if (page_to_nid(page) != zone_to_nid(zone)) 3933 if (page_to_nid(page) != zone_to_nid(zone))
3939 continue; 3934 continue;
3940 3935
3941 block_migratetype = get_pageblock_migratetype(page); 3936 block_migratetype = get_pageblock_migratetype(page);
3942 3937
3943 /* Only test what is necessary when the reserves are not met */ 3938 /* Only test what is necessary when the reserves are not met */
3944 if (reserve > 0) { 3939 if (reserve > 0) {
3945 /* 3940 /*
3946 * Blocks with reserved pages will never be freed, skip 3941 * Blocks with reserved pages will never be freed, skip
3947 * them. 3942 * them.
3948 */ 3943 */
3949 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3944 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3950 if (pageblock_is_reserved(pfn, block_end_pfn)) 3945 if (pageblock_is_reserved(pfn, block_end_pfn))
3951 continue; 3946 continue;
3952 3947
3953 /* If this block is reserved, account for it */ 3948 /* If this block is reserved, account for it */
3954 if (block_migratetype == MIGRATE_RESERVE) { 3949 if (block_migratetype == MIGRATE_RESERVE) {
3955 reserve--; 3950 reserve--;
3956 continue; 3951 continue;
3957 } 3952 }
3958 3953
3959 /* Suitable for reserving if this block is movable */ 3954 /* Suitable for reserving if this block is movable */
3960 if (block_migratetype == MIGRATE_MOVABLE) { 3955 if (block_migratetype == MIGRATE_MOVABLE) {
3961 set_pageblock_migratetype(page, 3956 set_pageblock_migratetype(page,
3962 MIGRATE_RESERVE); 3957 MIGRATE_RESERVE);
3963 move_freepages_block(zone, page, 3958 move_freepages_block(zone, page,
3964 MIGRATE_RESERVE); 3959 MIGRATE_RESERVE);
3965 reserve--; 3960 reserve--;
3966 continue; 3961 continue;
3967 } 3962 }
3968 } 3963 }
3969 3964
3970 /* 3965 /*
3971 * If the reserve is met and this is a previously reserved block, 3966 * If the reserve is met and this is a previously reserved block,
3972 * take it back 3967 * take it back
3973 */ 3968 */
3974 if (block_migratetype == MIGRATE_RESERVE) { 3969 if (block_migratetype == MIGRATE_RESERVE) {
3975 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3970 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3976 move_freepages_block(zone, page, MIGRATE_MOVABLE); 3971 move_freepages_block(zone, page, MIGRATE_MOVABLE);
3977 } 3972 }
3978 } 3973 }
3979 } 3974 }
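A quick worked example of the reserve sizing above (a sketch assuming pageblock_order = 9, i.e. pageblock_nr_pages = 512, and hypothetical watermarks): with min_wmark_pages(zone) = 1024, roundup(1024, 512) >> 9 = 2 and the min(2, reserve) cap leaves two MIGRATE_RESERVE blocks; a watermark of 128 pages rounds up to 512 and yields just one.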
3980 3975
3981 /* 3976 /*
3982 * Initially all pages are reserved - free ones are freed 3977 * Initially all pages are reserved - free ones are freed
3983 * up by free_all_bootmem() once the early boot process is 3978 * up by free_all_bootmem() once the early boot process is
3984 * done. Non-atomic initialization, single-pass. 3979 * done. Non-atomic initialization, single-pass.
3985 */ 3980 */
3986 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 3981 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3987 unsigned long start_pfn, enum memmap_context context) 3982 unsigned long start_pfn, enum memmap_context context)
3988 { 3983 {
3989 struct page *page; 3984 struct page *page;
3990 unsigned long end_pfn = start_pfn + size; 3985 unsigned long end_pfn = start_pfn + size;
3991 unsigned long pfn; 3986 unsigned long pfn;
3992 struct zone *z; 3987 struct zone *z;
3993 3988
3994 if (highest_memmap_pfn < end_pfn - 1) 3989 if (highest_memmap_pfn < end_pfn - 1)
3995 highest_memmap_pfn = end_pfn - 1; 3990 highest_memmap_pfn = end_pfn - 1;
3996 3991
3997 z = &NODE_DATA(nid)->node_zones[zone]; 3992 z = &NODE_DATA(nid)->node_zones[zone];
3998 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3993 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3999 /* 3994 /*
4000 * There can be holes in boot-time mem_map[]s 3995 * There can be holes in boot-time mem_map[]s
4001 * handed to this function. They do not 3996 * handed to this function. They do not
4002 * exist on hotplugged memory. 3997 * exist on hotplugged memory.
4003 */ 3998 */
4004 if (context == MEMMAP_EARLY) { 3999 if (context == MEMMAP_EARLY) {
4005 if (!early_pfn_valid(pfn)) 4000 if (!early_pfn_valid(pfn))
4006 continue; 4001 continue;
4007 if (!early_pfn_in_nid(pfn, nid)) 4002 if (!early_pfn_in_nid(pfn, nid))
4008 continue; 4003 continue;
4009 } 4004 }
4010 page = pfn_to_page(pfn); 4005 page = pfn_to_page(pfn);
4011 set_page_links(page, zone, nid, pfn); 4006 set_page_links(page, zone, nid, pfn);
4012 mminit_verify_page_links(page, zone, nid, pfn); 4007 mminit_verify_page_links(page, zone, nid, pfn);
4013 init_page_count(page); 4008 init_page_count(page);
4014 page_mapcount_reset(page); 4009 page_mapcount_reset(page);
4015 page_cpupid_reset_last(page); 4010 page_cpupid_reset_last(page);
4016 SetPageReserved(page); 4011 SetPageReserved(page);
4017 /* 4012 /*
4018 * Mark the block movable so that blocks are reserved for 4013 * Mark the block movable so that blocks are reserved for
4019 * movable at startup. This will force kernel allocations 4014 * movable at startup. This will force kernel allocations
4020 * to reserve their blocks rather than leaking throughout 4015 * to reserve their blocks rather than leaking throughout
4021 * the address space during boot when many long-lived 4016 * the address space during boot when many long-lived
4022 * kernel allocations are made. Later some blocks near 4017 * kernel allocations are made. Later some blocks near
4023 * the start are marked MIGRATE_RESERVE by 4018 * the start are marked MIGRATE_RESERVE by
4024 * setup_zone_migrate_reserve() 4019 * setup_zone_migrate_reserve()
4025 * 4020 *
4026 * The bitmap is created for the zone's valid pfn range, but the memmap 4021 * The bitmap is created for the zone's valid pfn range, but the memmap
4027 * can be created for invalid pages (for alignment), so 4022 * can be created for invalid pages (for alignment), so
4028 * check here that we do not call set_pageblock_migratetype() against a 4023 * check here that we do not call set_pageblock_migratetype() against a
4029 * pfn outside the zone. 4024 * pfn outside the zone.
4030 */ 4025 */
4031 if ((z->zone_start_pfn <= pfn) 4026 if ((z->zone_start_pfn <= pfn)
4032 && (pfn < zone_end_pfn(z)) 4027 && (pfn < zone_end_pfn(z))
4033 && !(pfn & (pageblock_nr_pages - 1))) 4028 && !(pfn & (pageblock_nr_pages - 1)))
4034 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4029 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4035 4030
4036 INIT_LIST_HEAD(&page->lru); 4031 INIT_LIST_HEAD(&page->lru);
4037 #ifdef WANT_PAGE_VIRTUAL 4032 #ifdef WANT_PAGE_VIRTUAL
4038 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4033 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4039 if (!is_highmem_idx(zone)) 4034 if (!is_highmem_idx(zone))
4040 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4035 set_page_address(page, __va(pfn << PAGE_SHIFT));
4041 #endif 4036 #endif
4042 } 4037 }
4043 } 4038 }
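Note on the pageblock check above: pfn & (pageblock_nr_pages - 1) is zero only for the first pfn of each pageblock, so with a (configuration dependent) pageblock_nr_pages of 512 the MIGRATE_MOVABLE marking runs once per block (pfns 0, 512, 1024, ...) rather than once per page in the zone.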
4044 4039
4045 static void __meminit zone_init_free_lists(struct zone *zone) 4040 static void __meminit zone_init_free_lists(struct zone *zone)
4046 { 4041 {
4047 int order, t; 4042 int order, t;
4048 for_each_migratetype_order(order, t) { 4043 for_each_migratetype_order(order, t) {
4049 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4044 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4050 zone->free_area[order].nr_free = 0; 4045 zone->free_area[order].nr_free = 0;
4051 } 4046 }
4052 } 4047 }
4053 4048
4054 #ifndef __HAVE_ARCH_MEMMAP_INIT 4049 #ifndef __HAVE_ARCH_MEMMAP_INIT
4055 #define memmap_init(size, nid, zone, start_pfn) \ 4050 #define memmap_init(size, nid, zone, start_pfn) \
4056 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4051 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4057 #endif 4052 #endif
4058 4053
4059 static int __meminit zone_batchsize(struct zone *zone) 4054 static int __meminit zone_batchsize(struct zone *zone)
4060 { 4055 {
4061 #ifdef CONFIG_MMU 4056 #ifdef CONFIG_MMU
4062 int batch; 4057 int batch;
4063 4058
4064 /* 4059 /*
4065 * The per-cpu-pages pools are set to around 1/1000th of the 4060 * The per-cpu-pages pools are set to around 1/1000th of the
4066 * size of the zone. But no more than 1/2 of a meg. 4061 * size of the zone. But no more than 1/2 of a meg.
4067 * 4062 *
4068 * OK, so we don't know how big the cache is. So guess. 4063 * OK, so we don't know how big the cache is. So guess.
4069 */ 4064 */
4070 batch = zone->managed_pages / 1024; 4065 batch = zone->managed_pages / 1024;
4071 if (batch * PAGE_SIZE > 512 * 1024) 4066 if (batch * PAGE_SIZE > 512 * 1024)
4072 batch = (512 * 1024) / PAGE_SIZE; 4067 batch = (512 * 1024) / PAGE_SIZE;
4073 batch /= 4; /* We effectively *= 4 below */ 4068 batch /= 4; /* We effectively *= 4 below */
4074 if (batch < 1) 4069 if (batch < 1)
4075 batch = 1; 4070 batch = 1;
4076 4071
4077 /* 4072 /*
4078 * Clamp the batch to a 2^n - 1 value. Having a power 4073 * Clamp the batch to a 2^n - 1 value. Having a power
4079 * of 2 value was found to be more likely to have 4074 * of 2 value was found to be more likely to have
4080 * suboptimal cache aliasing properties in some cases. 4075 * suboptimal cache aliasing properties in some cases.
4081 * 4076 *
4082 * For example if 2 tasks are alternately allocating 4077 * For example if 2 tasks are alternately allocating
4083 * batches of pages, one task can end up with a lot 4078 * batches of pages, one task can end up with a lot
4084 * of pages of one half of the possible page colors 4079 * of pages of one half of the possible page colors
4085 * and the other with pages of the other colors. 4080 * and the other with pages of the other colors.
4086 */ 4081 */
4087 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4082 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4088 4083
4089 return batch; 4084 return batch;
4090 4085
4091 #else 4086 #else
4092 /* The deferral and batching of frees should be suppressed under NOMMU 4087 /* The deferral and batching of frees should be suppressed under NOMMU
4093 * conditions. 4088 * conditions.
4094 * 4089 *
4095 * The problem is that NOMMU needs to be able to allocate large chunks 4090 * The problem is that NOMMU needs to be able to allocate large chunks
4096 * of contiguous memory as there's no hardware page translation to 4091 * of contiguous memory as there's no hardware page translation to
4097 * assemble apparent contiguous memory from discontiguous pages. 4092 * assemble apparent contiguous memory from discontiguous pages.
4098 * 4093 *
4099 * Queueing large contiguous runs of pages for batching, however, 4094 * Queueing large contiguous runs of pages for batching, however,
4100 * causes the pages to actually be freed in smaller chunks. As there 4095 * causes the pages to actually be freed in smaller chunks. As there
4101 * can be a significant delay between the individual batches being 4096 * can be a significant delay between the individual batches being
4102 * recycled, this leads to the once large chunks of space being 4097 * recycled, this leads to the once large chunks of space being
4103 * fragmented and becoming unavailable for high-order allocations. 4098 * fragmented and becoming unavailable for high-order allocations.
4104 */ 4099 */
4105 return 0; 4100 return 0;
4106 #endif 4101 #endif
4107 } 4102 }
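A userspace sketch of the batch calculation above (assuming a 4KiB PAGE_SIZE and a hypothetical zone of 1,000,000 managed pages; rounddown_pow_of_two() is reimplemented locally since it is a kernel helper):

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* largest power of two <= n, standing in for the kernel's rounddown_pow_of_two() */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned long managed_pages = 1000000;	/* hypothetical zone size */
	unsigned long batch;

	batch = managed_pages / 1024;		/* 976 */
	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;	/* capped to 128 */
	batch /= 4;				/* 32 */
	if (batch < 1)
		batch = 1;
	batch = rounddown_pow_of_two(batch + batch / 2) - 1;	/* 31 */

	printf("batch = %lu\n", batch);		/* prints: batch = 31 */
	return 0;
}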
4108 4103
4109 /* 4104 /*
4110 * pcp->high and pcp->batch values are related and dependent on one another: 4105 * pcp->high and pcp->batch values are related and dependent on one another:
4111 * ->batch must never be higher than ->high. 4106 * ->batch must never be higher than ->high.
4112 * The following function updates them in a safe manner without read side 4107 * The following function updates them in a safe manner without read side
4113 * locking. 4108 * locking.
4114 * 4109 *
4115 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4110 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4116 * those fields changing asynchronously (according to the above rule). 4111 * those fields changing asynchronously (according to the above rule).
4117 * 4112 *
4118 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 4113 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4119 * outside of boot time (or some other assurance that no concurrent updaters 4114 * outside of boot time (or some other assurance that no concurrent updaters
4120 * exist). 4115 * exist).
4121 */ 4116 */
4122 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4117 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4123 unsigned long batch) 4118 unsigned long batch)
4124 { 4119 {
4125 /* start with a fail safe value for batch */ 4120 /* start with a fail safe value for batch */
4126 pcp->batch = 1; 4121 pcp->batch = 1;
4127 smp_wmb(); 4122 smp_wmb();
4128 4123
4129 /* Update high, then batch, in order */ 4124 /* Update high, then batch, in order */
4130 pcp->high = high; 4125 pcp->high = high;
4131 smp_wmb(); 4126 smp_wmb();
4132 4127
4133 pcp->batch = batch; 4128 pcp->batch = batch;
4134 } 4129 }
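The ordering above is defensive: pcp->batch is first dropped to the fail-safe value 1, the new ->high is published, and only then is the real batch stored, with smp_wmb() between the stores. A lockless reader may briefly observe a batch of 1, but it should never observe a batch larger than whichever ->high value it reads, which preserves the ->batch <= ->high rule while the limits are being changed.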
4135 4130
4136 /* a companion to pageset_set_high() */ 4131 /* a companion to pageset_set_high() */
4137 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4132 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4138 { 4133 {
4139 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4134 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4140 } 4135 }
4141 4136
4142 static void pageset_init(struct per_cpu_pageset *p) 4137 static void pageset_init(struct per_cpu_pageset *p)
4143 { 4138 {
4144 struct per_cpu_pages *pcp; 4139 struct per_cpu_pages *pcp;
4145 int migratetype; 4140 int migratetype;
4146 4141
4147 memset(p, 0, sizeof(*p)); 4142 memset(p, 0, sizeof(*p));
4148 4143
4149 pcp = &p->pcp; 4144 pcp = &p->pcp;
4150 pcp->count = 0; 4145 pcp->count = 0;
4151 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4146 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4152 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4147 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4153 } 4148 }
4154 4149
4155 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4150 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4156 { 4151 {
4157 pageset_init(p); 4152 pageset_init(p);
4158 pageset_set_batch(p, batch); 4153 pageset_set_batch(p, batch);
4159 } 4154 }
4160 4155
4161 /* 4156 /*
4162 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4157 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4163 * to the value high for the pageset p. 4158 * to the value high for the pageset p.
4164 */ 4159 */
4165 static void pageset_set_high(struct per_cpu_pageset *p, 4160 static void pageset_set_high(struct per_cpu_pageset *p,
4166 unsigned long high) 4161 unsigned long high)
4167 { 4162 {
4168 unsigned long batch = max(1UL, high / 4); 4163 unsigned long batch = max(1UL, high / 4);
4169 if ((high / 4) > (PAGE_SHIFT * 8)) 4164 if ((high / 4) > (PAGE_SHIFT * 8))
4170 batch = PAGE_SHIFT * 8; 4165 batch = PAGE_SHIFT * 8;
4171 4166
4172 pageset_update(&p->pcp, high, batch); 4167 pageset_update(&p->pcp, high, batch);
4173 } 4168 }
4174 4169
4175 static void __meminit pageset_set_high_and_batch(struct zone *zone, 4170 static void __meminit pageset_set_high_and_batch(struct zone *zone,
4176 struct per_cpu_pageset *pcp) 4171 struct per_cpu_pageset *pcp)
4177 { 4172 {
4178 if (percpu_pagelist_fraction) 4173 if (percpu_pagelist_fraction)
4179 pageset_set_high(pcp, 4174 pageset_set_high(pcp,
4180 (zone->managed_pages / 4175 (zone->managed_pages /
4181 percpu_pagelist_fraction)); 4176 percpu_pagelist_fraction));
4182 else 4177 else
4183 pageset_set_batch(pcp, zone_batchsize(zone)); 4178 pageset_set_batch(pcp, zone_batchsize(zone));
4184 } 4179 }
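To see how the two paths above differ, here is a userspace sketch of the sysctl path (PAGE_SHIFT, the managed page count and the fraction are hypothetical assumptions, and high_to_batch() is only a local helper mirroring the clamp in pageset_set_high()); the default path instead takes zone_batchsize(), e.g. 31 from the example above, giving high = 6 * 31 = 186:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4KiB pages */

/* mirrors the batch clamp performed by pageset_set_high() above */
static unsigned long high_to_batch(unsigned long high)
{
	unsigned long batch = high / 4;

	if (batch < 1)
		batch = 1;
	if ((high / 4) > (PAGE_SHIFT * 8))
		batch = PAGE_SHIFT * 8;
	return batch;
}

int main(void)
{
	unsigned long managed_pages = 1000000;	/* hypothetical zone size */
	unsigned long fraction = 8;		/* hypothetical percpu_pagelist_fraction */
	unsigned long high = managed_pages / fraction;

	printf("high = %lu, batch = %lu\n", high, high_to_batch(high));
	/* prints: high = 125000, batch = 96 */
	return 0;
}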
4185 4180
4186 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4181 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4187 { 4182 {
4188 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4183 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4189 4184
4190 pageset_init(pcp); 4185 pageset_init(pcp);
4191 pageset_set_high_and_batch(zone, pcp); 4186 pageset_set_high_and_batch(zone, pcp);
4192 } 4187 }
4193 4188
4194 static void __meminit setup_zone_pageset(struct zone *zone) 4189 static void __meminit setup_zone_pageset(struct zone *zone)
4195 { 4190 {
4196 int cpu; 4191 int cpu;
4197 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4192 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4198 for_each_possible_cpu(cpu) 4193 for_each_possible_cpu(cpu)
4199 zone_pageset_init(zone, cpu); 4194 zone_pageset_init(zone, cpu);
4200 } 4195 }
4201 4196
4202 /* 4197 /*
4203 * Allocate per cpu pagesets and initialize them. 4198 * Allocate per cpu pagesets and initialize them.
4204 * Before this call only boot pagesets were available. 4199 * Before this call only boot pagesets were available.
4205 */ 4200 */
4206 void __init setup_per_cpu_pageset(void) 4201 void __init setup_per_cpu_pageset(void)
4207 { 4202 {
4208 struct zone *zone; 4203 struct zone *zone;
4209 4204
4210 for_each_populated_zone(zone) 4205 for_each_populated_zone(zone)
4211 setup_zone_pageset(zone); 4206 setup_zone_pageset(zone);
4212 } 4207 }
4213 4208
4214 static noinline __init_refok 4209 static noinline __init_refok
4215 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4210 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4216 { 4211 {
4217 int i; 4212 int i;
4218 struct pglist_data *pgdat = zone->zone_pgdat; 4213 struct pglist_data *pgdat = zone->zone_pgdat;
4219 size_t alloc_size; 4214 size_t alloc_size;
4220 4215
4221 /* 4216 /*
4222 * The per-page waitqueue mechanism uses hashed waitqueues 4217 * The per-page waitqueue mechanism uses hashed waitqueues
4223 * per zone. 4218 * per zone.
4224 */ 4219 */
4225 zone->wait_table_hash_nr_entries = 4220 zone->wait_table_hash_nr_entries =
4226 wait_table_hash_nr_entries(zone_size_pages); 4221 wait_table_hash_nr_entries(zone_size_pages);
4227 zone->wait_table_bits = 4222 zone->wait_table_bits =
4228 wait_table_bits(zone->wait_table_hash_nr_entries); 4223 wait_table_bits(zone->wait_table_hash_nr_entries);
4229 alloc_size = zone->wait_table_hash_nr_entries 4224 alloc_size = zone->wait_table_hash_nr_entries
4230 * sizeof(wait_queue_head_t); 4225 * sizeof(wait_queue_head_t);
4231 4226
4232 if (!slab_is_available()) { 4227 if (!slab_is_available()) {
4233 zone->wait_table = (wait_queue_head_t *) 4228 zone->wait_table = (wait_queue_head_t *)
4234 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4229 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4235 } else { 4230 } else {
4236 /* 4231 /*
4237 * This case means that a zone whose size was 0 gets new memory 4232 * This case means that a zone whose size was 0 gets new memory
4238 * via memory hot-add. 4233 * via memory hot-add.
4239 * But it may be the case that a new node was hot-added. In 4234 * But it may be the case that a new node was hot-added. In
4240 * this case vmalloc() will not be able to use this new node's 4235 * this case vmalloc() will not be able to use this new node's
4241 * memory - this wait_table must be initialized to use this new 4236 * memory - this wait_table must be initialized to use this new
4242 * node itself as well. 4237 * node itself as well.
4243 * To use this new node's memory, further consideration will be 4238 * To use this new node's memory, further consideration will be
4244 * necessary. 4239 * necessary.
4245 */ 4240 */
4246 zone->wait_table = vmalloc(alloc_size); 4241 zone->wait_table = vmalloc(alloc_size);
4247 } 4242 }
4248 if (!zone->wait_table) 4243 if (!zone->wait_table)
4249 return -ENOMEM; 4244 return -ENOMEM;
4250 4245
4251 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4246 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4252 init_waitqueue_head(zone->wait_table + i); 4247 init_waitqueue_head(zone->wait_table + i);
4253 4248
4254 return 0; 4249 return 0;
4255 } 4250 }
4256 4251
4257 static __meminit void zone_pcp_init(struct zone *zone) 4252 static __meminit void zone_pcp_init(struct zone *zone)
4258 { 4253 {
4259 /* 4254 /*
4260 * per cpu subsystem is not up at this point. The following code 4255 * per cpu subsystem is not up at this point. The following code
4261 * relies on the ability of the linker to provide the 4256 * relies on the ability of the linker to provide the
4262 * offset of a (static) per cpu variable into the per cpu area. 4257 * offset of a (static) per cpu variable into the per cpu area.
4263 */ 4258 */
4264 zone->pageset = &boot_pageset; 4259 zone->pageset = &boot_pageset;
4265 4260
4266 if (populated_zone(zone)) 4261 if (populated_zone(zone))
4267 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4262 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4268 zone->name, zone->present_pages, 4263 zone->name, zone->present_pages,
4269 zone_batchsize(zone)); 4264 zone_batchsize(zone));
4270 } 4265 }
4271 4266
4272 int __meminit init_currently_empty_zone(struct zone *zone, 4267 int __meminit init_currently_empty_zone(struct zone *zone,
4273 unsigned long zone_start_pfn, 4268 unsigned long zone_start_pfn,
4274 unsigned long size, 4269 unsigned long size,
4275 enum memmap_context context) 4270 enum memmap_context context)
4276 { 4271 {
4277 struct pglist_data *pgdat = zone->zone_pgdat; 4272 struct pglist_data *pgdat = zone->zone_pgdat;
4278 int ret; 4273 int ret;
4279 ret = zone_wait_table_init(zone, size); 4274 ret = zone_wait_table_init(zone, size);
4280 if (ret) 4275 if (ret)
4281 return ret; 4276 return ret;
4282 pgdat->nr_zones = zone_idx(zone) + 1; 4277 pgdat->nr_zones = zone_idx(zone) + 1;
4283 4278
4284 zone->zone_start_pfn = zone_start_pfn; 4279 zone->zone_start_pfn = zone_start_pfn;
4285 4280
4286 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4281 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4287 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4282 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4288 pgdat->node_id, 4283 pgdat->node_id,
4289 (unsigned long)zone_idx(zone), 4284 (unsigned long)zone_idx(zone),
4290 zone_start_pfn, (zone_start_pfn + size)); 4285 zone_start_pfn, (zone_start_pfn + size));
4291 4286
4292 zone_init_free_lists(zone); 4287 zone_init_free_lists(zone);
4293 4288
4294 return 0; 4289 return 0;
4295 } 4290 }
4296 4291
4297 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4292 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4298 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4293 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4299 /* 4294 /*
4300 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4295 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4301 * Architectures may implement their own version but if add_active_range() 4296 * Architectures may implement their own version but if add_active_range()
4302 * was used and there are no special requirements, this is a convenient 4297 * was used and there are no special requirements, this is a convenient
4303 * alternative 4298 * alternative
4304 */ 4299 */
4305 int __meminit __early_pfn_to_nid(unsigned long pfn) 4300 int __meminit __early_pfn_to_nid(unsigned long pfn)
4306 { 4301 {
4307 unsigned long start_pfn, end_pfn; 4302 unsigned long start_pfn, end_pfn;
4308 int nid; 4303 int nid;
4309 /* 4304 /*
4310 * NOTE: The following SMP-unsafe globals are only used early in boot 4305 * NOTE: The following SMP-unsafe globals are only used early in boot
4311 * when the kernel is running single-threaded. 4306 * when the kernel is running single-threaded.
4312 */ 4307 */
4313 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4308 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4314 static int __meminitdata last_nid; 4309 static int __meminitdata last_nid;
4315 4310
4316 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4311 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4317 return last_nid; 4312 return last_nid;
4318 4313
4319 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4314 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4320 if (nid != -1) { 4315 if (nid != -1) {
4321 last_start_pfn = start_pfn; 4316 last_start_pfn = start_pfn;
4322 last_end_pfn = end_pfn; 4317 last_end_pfn = end_pfn;
4323 last_nid = nid; 4318 last_nid = nid;
4324 } 4319 }
4325 4320
4326 return nid; 4321 return nid;
4327 } 4322 }
4328 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4323 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4329 4324
4330 int __meminit early_pfn_to_nid(unsigned long pfn) 4325 int __meminit early_pfn_to_nid(unsigned long pfn)
4331 { 4326 {
4332 int nid; 4327 int nid;
4333 4328
4334 nid = __early_pfn_to_nid(pfn); 4329 nid = __early_pfn_to_nid(pfn);
4335 if (nid >= 0) 4330 if (nid >= 0)
4336 return nid; 4331 return nid;
4337 /* just returns 0 */ 4332 /* just returns 0 */
4338 return 0; 4333 return 0;
4339 } 4334 }
4340 4335
4341 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4336 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4342 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4337 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4343 { 4338 {
4344 int nid; 4339 int nid;
4345 4340
4346 nid = __early_pfn_to_nid(pfn); 4341 nid = __early_pfn_to_nid(pfn);
4347 if (nid >= 0 && nid != node) 4342 if (nid >= 0 && nid != node)
4348 return false; 4343 return false;
4349 return true; 4344 return true;
4350 } 4345 }
4351 #endif 4346 #endif
4352 4347
4353 /** 4348 /**
4354 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4349 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4355 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4350 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4356 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4351 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4357 * 4352 *
4358 * If an architecture guarantees that all ranges registered with 4353 * If an architecture guarantees that all ranges registered with
4359 * add_active_ranges() contain no holes and may be freed, this 4354 * add_active_ranges() contain no holes and may be freed, this
4360 * function may be used instead of calling free_bootmem() manually. 4355 * function may be used instead of calling free_bootmem() manually.
4361 */ 4356 */
4362 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4357 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4363 { 4358 {
4364 unsigned long start_pfn, end_pfn; 4359 unsigned long start_pfn, end_pfn;
4365 int i, this_nid; 4360 int i, this_nid;
4366 4361
4367 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4362 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4368 start_pfn = min(start_pfn, max_low_pfn); 4363 start_pfn = min(start_pfn, max_low_pfn);
4369 end_pfn = min(end_pfn, max_low_pfn); 4364 end_pfn = min(end_pfn, max_low_pfn);
4370 4365
4371 if (start_pfn < end_pfn) 4366 if (start_pfn < end_pfn)
4372 free_bootmem_node(NODE_DATA(this_nid), 4367 free_bootmem_node(NODE_DATA(this_nid),
4373 PFN_PHYS(start_pfn), 4368 PFN_PHYS(start_pfn),
4374 (end_pfn - start_pfn) << PAGE_SHIFT); 4369 (end_pfn - start_pfn) << PAGE_SHIFT);
4375 } 4370 }
4376 } 4371 }
4377 4372
4378 /** 4373 /**
4379 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4374 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4380 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4375 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4381 * 4376 *
4382 * If an architecture guarantees that all ranges registered with 4377 * If an architecture guarantees that all ranges registered with
4383 * add_active_ranges() contain no holes and may be freed, this 4378 * add_active_ranges() contain no holes and may be freed, this
4384 * function may be used instead of calling memory_present() manually. 4379 * function may be used instead of calling memory_present() manually.
4385 */ 4380 */
4386 void __init sparse_memory_present_with_active_regions(int nid) 4381 void __init sparse_memory_present_with_active_regions(int nid)
4387 { 4382 {
4388 unsigned long start_pfn, end_pfn; 4383 unsigned long start_pfn, end_pfn;
4389 int i, this_nid; 4384 int i, this_nid;
4390 4385
4391 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4386 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4392 memory_present(this_nid, start_pfn, end_pfn); 4387 memory_present(this_nid, start_pfn, end_pfn);
4393 } 4388 }
4394 4389
4395 /** 4390 /**
4396 * get_pfn_range_for_nid - Return the start and end page frames for a node 4391 * get_pfn_range_for_nid - Return the start and end page frames for a node
4397 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4392 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4398 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4393 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4399 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4394 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4400 * 4395 *
4401 * It returns the start and end page frame of a node based on information 4396 * It returns the start and end page frame of a node based on information
4402 * provided by an arch calling add_active_range(). If called for a node 4397 * provided by an arch calling add_active_range(). If called for a node
4403 * with no available memory, a warning is printed and the start and end 4398 * with no available memory, a warning is printed and the start and end
4404 * PFNs will be 0. 4399 * PFNs will be 0.
4405 */ 4400 */
4406 void __meminit get_pfn_range_for_nid(unsigned int nid, 4401 void __meminit get_pfn_range_for_nid(unsigned int nid,
4407 unsigned long *start_pfn, unsigned long *end_pfn) 4402 unsigned long *start_pfn, unsigned long *end_pfn)
4408 { 4403 {
4409 unsigned long this_start_pfn, this_end_pfn; 4404 unsigned long this_start_pfn, this_end_pfn;
4410 int i; 4405 int i;
4411 4406
4412 *start_pfn = -1UL; 4407 *start_pfn = -1UL;
4413 *end_pfn = 0; 4408 *end_pfn = 0;
4414 4409
4415 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4410 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4416 *start_pfn = min(*start_pfn, this_start_pfn); 4411 *start_pfn = min(*start_pfn, this_start_pfn);
4417 *end_pfn = max(*end_pfn, this_end_pfn); 4412 *end_pfn = max(*end_pfn, this_end_pfn);
4418 } 4413 }
4419 4414
4420 if (*start_pfn == -1UL) 4415 if (*start_pfn == -1UL)
4421 *start_pfn = 0; 4416 *start_pfn = 0;
4422 } 4417 }
4423 4418
4424 /* 4419 /*
4425 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4420 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4426 * assumption is made that zones within a node are ordered in monotonic 4421 * assumption is made that zones within a node are ordered in monotonic
4427 * increasing memory addresses so that the "highest" populated zone is used 4422 * increasing memory addresses so that the "highest" populated zone is used
4428 */ 4423 */
4429 static void __init find_usable_zone_for_movable(void) 4424 static void __init find_usable_zone_for_movable(void)
4430 { 4425 {
4431 int zone_index; 4426 int zone_index;
4432 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4427 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4433 if (zone_index == ZONE_MOVABLE) 4428 if (zone_index == ZONE_MOVABLE)
4434 continue; 4429 continue;
4435 4430
4436 if (arch_zone_highest_possible_pfn[zone_index] > 4431 if (arch_zone_highest_possible_pfn[zone_index] >
4437 arch_zone_lowest_possible_pfn[zone_index]) 4432 arch_zone_lowest_possible_pfn[zone_index])
4438 break; 4433 break;
4439 } 4434 }
4440 4435
4441 VM_BUG_ON(zone_index == -1); 4436 VM_BUG_ON(zone_index == -1);
4442 movable_zone = zone_index; 4437 movable_zone = zone_index;
4443 } 4438 }
4444 4439
4445 /* 4440 /*
4446 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4441 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4447 * because it is sized independent of architecture. Unlike the other zones, 4442 * because it is sized independent of architecture. Unlike the other zones,
4448 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4443 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4449 * in each node depending on the size of each node and how evenly kernelcore 4444 * in each node depending on the size of each node and how evenly kernelcore
4450 * is distributed. This helper function adjusts the zone ranges 4445 * is distributed. This helper function adjusts the zone ranges
4451 * provided by the architecture for a given node by using the end of the 4446 * provided by the architecture for a given node by using the end of the
4452 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4447 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4453 * zones within a node are in order of monotonically increasing memory addresses 4448 * zones within a node are in order of monotonically increasing memory addresses
4454 */ 4449 */
4455 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4450 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4456 unsigned long zone_type, 4451 unsigned long zone_type,
4457 unsigned long node_start_pfn, 4452 unsigned long node_start_pfn,
4458 unsigned long node_end_pfn, 4453 unsigned long node_end_pfn,
4459 unsigned long *zone_start_pfn, 4454 unsigned long *zone_start_pfn,
4460 unsigned long *zone_end_pfn) 4455 unsigned long *zone_end_pfn)
4461 { 4456 {
4462 /* Only adjust if ZONE_MOVABLE is on this node */ 4457 /* Only adjust if ZONE_MOVABLE is on this node */
4463 if (zone_movable_pfn[nid]) { 4458 if (zone_movable_pfn[nid]) {
4464 /* Size ZONE_MOVABLE */ 4459 /* Size ZONE_MOVABLE */
4465 if (zone_type == ZONE_MOVABLE) { 4460 if (zone_type == ZONE_MOVABLE) {
4466 *zone_start_pfn = zone_movable_pfn[nid]; 4461 *zone_start_pfn = zone_movable_pfn[nid];
4467 *zone_end_pfn = min(node_end_pfn, 4462 *zone_end_pfn = min(node_end_pfn,
4468 arch_zone_highest_possible_pfn[movable_zone]); 4463 arch_zone_highest_possible_pfn[movable_zone]);
4469 4464
4470 /* Adjust for ZONE_MOVABLE starting within this range */ 4465 /* Adjust for ZONE_MOVABLE starting within this range */
4471 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4466 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4472 *zone_end_pfn > zone_movable_pfn[nid]) { 4467 *zone_end_pfn > zone_movable_pfn[nid]) {
4473 *zone_end_pfn = zone_movable_pfn[nid]; 4468 *zone_end_pfn = zone_movable_pfn[nid];
4474 4469
4475 /* Check if this whole range is within ZONE_MOVABLE */ 4470 /* Check if this whole range is within ZONE_MOVABLE */
4476 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4471 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4477 *zone_start_pfn = *zone_end_pfn; 4472 *zone_start_pfn = *zone_end_pfn;
4478 } 4473 }
4479 } 4474 }
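A worked example of the three cases above (all pfns hypothetical): with a node spanning pfns 0..1,048,576 and zone_movable_pfn[nid] = 786,432, ZONE_MOVABLE itself gets 786,432..1,048,576 (further capped by the highest usable zone); a zone originally spanning 524,288..917,504 is truncated to end at 786,432; and a zone starting at or above 786,432 collapses to empty because its start is set equal to its end.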
4480 4475
4481 /* 4476 /*
4482 * Return the number of pages a zone spans in a node, including holes 4477 * Return the number of pages a zone spans in a node, including holes
4483 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4478 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4484 */ 4479 */
4485 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4480 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4486 unsigned long zone_type, 4481 unsigned long zone_type,
4487 unsigned long node_start_pfn, 4482 unsigned long node_start_pfn,
4488 unsigned long node_end_pfn, 4483 unsigned long node_end_pfn,
4489 unsigned long *ignored) 4484 unsigned long *ignored)
4490 { 4485 {
4491 unsigned long zone_start_pfn, zone_end_pfn; 4486 unsigned long zone_start_pfn, zone_end_pfn;
4492 4487
4493 /* Get the start and end of the zone */ 4488 /* Get the start and end of the zone */
4494 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4489 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4495 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4490 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4496 adjust_zone_range_for_zone_movable(nid, zone_type, 4491 adjust_zone_range_for_zone_movable(nid, zone_type,
4497 node_start_pfn, node_end_pfn, 4492 node_start_pfn, node_end_pfn,
4498 &zone_start_pfn, &zone_end_pfn); 4493 &zone_start_pfn, &zone_end_pfn);
4499 4494
4500 /* Check that this node has pages within the zone's required range */ 4495 /* Check that this node has pages within the zone's required range */
4501 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4496 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4502 return 0; 4497 return 0;
4503 4498
4504 /* Move the zone boundaries inside the node if necessary */ 4499 /* Move the zone boundaries inside the node if necessary */
4505 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4500 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4506 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4501 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4507 4502
4508 /* Return the spanned pages */ 4503 /* Return the spanned pages */
4509 return zone_end_pfn - zone_start_pfn; 4504 return zone_end_pfn - zone_start_pfn;
4510 } 4505 }
4511 4506
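A standalone illustration of the clamping above with example PFNs (the ZONE_MOVABLE adjustment is left out here):

#include <stdio.h>

/* Mirrors the overlap check and clipping in zone_spanned_pages_in_node(). */
static unsigned long spanned(unsigned long zone_lo, unsigned long zone_hi,
                             unsigned long node_lo, unsigned long node_hi)
{
    if (zone_hi < node_lo || zone_lo > node_hi)   /* zone and node do not overlap */
        return 0;
    if (zone_hi > node_hi) zone_hi = node_hi;     /* clip the zone to the node */
    if (zone_lo < node_lo) zone_lo = node_lo;
    return zone_hi - zone_lo;
}

int main(void)
{
    printf("%lu\n", spanned(0x1000, 0x100000, 0x80000, 0xc0000));   /* 0x40000 pages */
    printf("%lu\n", spanned(0x1000, 0x100000, 0x200000, 0x240000)); /* 0: node lies above zone */
    return 0;
}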
4512 /* 4507 /*
4513 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4508 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4514 * then all holes in the requested range will be accounted for. 4509 * then all holes in the requested range will be accounted for.
4515 */ 4510 */
4516 unsigned long __meminit __absent_pages_in_range(int nid, 4511 unsigned long __meminit __absent_pages_in_range(int nid,
4517 unsigned long range_start_pfn, 4512 unsigned long range_start_pfn,
4518 unsigned long range_end_pfn) 4513 unsigned long range_end_pfn)
4519 { 4514 {
4520 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4515 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4521 unsigned long start_pfn, end_pfn; 4516 unsigned long start_pfn, end_pfn;
4522 int i; 4517 int i;
4523 4518
4524 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4519 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4525 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4520 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4526 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4521 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4527 nr_absent -= end_pfn - start_pfn; 4522 nr_absent -= end_pfn - start_pfn;
4528 } 4523 }
4529 return nr_absent; 4524 return nr_absent;
4530 } 4525 }
4531 4526
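The subtraction above counts holes by starting from the full span and removing every present range after clamping it to the query window. A standalone illustration with made-up ranges (not the memblock iterator itself):

#include <stdio.h>

struct range { unsigned long start, end; };      /* [start, end) present memory, in PFNs */

static unsigned long clampv(unsigned long v, unsigned long lo, unsigned long hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

/* Mirrors __absent_pages_in_range(): holes = span - sum of clamped present ranges. */
static unsigned long absent_pages(const struct range *r, int n,
                                  unsigned long range_start, unsigned long range_end)
{
    unsigned long nr_absent = range_end - range_start;
    for (int i = 0; i < n; i++) {
        unsigned long s = clampv(r[i].start, range_start, range_end);
        unsigned long e = clampv(r[i].end,   range_start, range_end);
        nr_absent -= e - s;
    }
    return nr_absent;
}

int main(void)
{
    /* Two present ranges with a hole between them (example values). */
    struct range mem[] = { { 0x1000, 0x4000 }, { 0x6000, 0x8000 } };
    printf("holes in [0x1000, 0x8000): %lu pages\n",
           absent_pages(mem, 2, 0x1000, 0x8000));   /* 0x2000 pages */
    return 0;
}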
4532 /** 4527 /**
4533 * absent_pages_in_range - Return number of page frames in holes within a range 4528 * absent_pages_in_range - Return number of page frames in holes within a range
4534 * @start_pfn: The start PFN to start searching for holes 4529 * @start_pfn: The start PFN to start searching for holes
4535 * @end_pfn: The end PFN to stop searching for holes 4530 * @end_pfn: The end PFN to stop searching for holes
4536 * 4531 *
4537 * It returns the number of page frames in memory holes within a range. 4532 * It returns the number of page frames in memory holes within a range.
4538 */ 4533 */
4539 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4534 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4540 unsigned long end_pfn) 4535 unsigned long end_pfn)
4541 { 4536 {
4542 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4537 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4543 } 4538 }
4544 4539
4545 /* Return the number of page frames in holes in a zone on a node */ 4540 /* Return the number of page frames in holes in a zone on a node */
4546 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4541 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4547 unsigned long zone_type, 4542 unsigned long zone_type,
4548 unsigned long node_start_pfn, 4543 unsigned long node_start_pfn,
4549 unsigned long node_end_pfn, 4544 unsigned long node_end_pfn,
4550 unsigned long *ignored) 4545 unsigned long *ignored)
4551 { 4546 {
4552 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4547 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4553 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4548 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4554 unsigned long zone_start_pfn, zone_end_pfn; 4549 unsigned long zone_start_pfn, zone_end_pfn;
4555 4550
4556 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4551 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4557 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4552 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4558 4553
4559 adjust_zone_range_for_zone_movable(nid, zone_type, 4554 adjust_zone_range_for_zone_movable(nid, zone_type,
4560 node_start_pfn, node_end_pfn, 4555 node_start_pfn, node_end_pfn,
4561 &zone_start_pfn, &zone_end_pfn); 4556 &zone_start_pfn, &zone_end_pfn);
4562 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4557 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4563 } 4558 }
4564 4559
4565 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4560 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4566 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4561 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4567 unsigned long zone_type, 4562 unsigned long zone_type,
4568 unsigned long node_start_pfn, 4563 unsigned long node_start_pfn,
4569 unsigned long node_end_pfn, 4564 unsigned long node_end_pfn,
4570 unsigned long *zones_size) 4565 unsigned long *zones_size)
4571 { 4566 {
4572 return zones_size[zone_type]; 4567 return zones_size[zone_type];
4573 } 4568 }
4574 4569
4575 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4570 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4576 unsigned long zone_type, 4571 unsigned long zone_type,
4577 unsigned long node_start_pfn, 4572 unsigned long node_start_pfn,
4578 unsigned long node_end_pfn, 4573 unsigned long node_end_pfn,
4579 unsigned long *zholes_size) 4574 unsigned long *zholes_size)
4580 { 4575 {
4581 if (!zholes_size) 4576 if (!zholes_size)
4582 return 0; 4577 return 0;
4583 4578
4584 return zholes_size[zone_type]; 4579 return zholes_size[zone_type];
4585 } 4580 }
4586 4581
4587 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4582 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4588 4583
4589 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4584 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4590 unsigned long node_start_pfn, 4585 unsigned long node_start_pfn,
4591 unsigned long node_end_pfn, 4586 unsigned long node_end_pfn,
4592 unsigned long *zones_size, 4587 unsigned long *zones_size,
4593 unsigned long *zholes_size) 4588 unsigned long *zholes_size)
4594 { 4589 {
4595 unsigned long realtotalpages, totalpages = 0; 4590 unsigned long realtotalpages, totalpages = 0;
4596 enum zone_type i; 4591 enum zone_type i;
4597 4592
4598 for (i = 0; i < MAX_NR_ZONES; i++) 4593 for (i = 0; i < MAX_NR_ZONES; i++)
4599 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4594 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4600 node_start_pfn, 4595 node_start_pfn,
4601 node_end_pfn, 4596 node_end_pfn,
4602 zones_size); 4597 zones_size);
4603 pgdat->node_spanned_pages = totalpages; 4598 pgdat->node_spanned_pages = totalpages;
4604 4599
4605 realtotalpages = totalpages; 4600 realtotalpages = totalpages;
4606 for (i = 0; i < MAX_NR_ZONES; i++) 4601 for (i = 0; i < MAX_NR_ZONES; i++)
4607 realtotalpages -= 4602 realtotalpages -=
4608 zone_absent_pages_in_node(pgdat->node_id, i, 4603 zone_absent_pages_in_node(pgdat->node_id, i,
4609 node_start_pfn, node_end_pfn, 4604 node_start_pfn, node_end_pfn,
4610 zholes_size); 4605 zholes_size);
4611 pgdat->node_present_pages = realtotalpages; 4606 pgdat->node_present_pages = realtotalpages;
4612 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4607 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4613 realtotalpages); 4608 realtotalpages);
4614 } 4609 }
4615 4610
4616 #ifndef CONFIG_SPARSEMEM 4611 #ifndef CONFIG_SPARSEMEM
4617 /* 4612 /*
4618 * Calculate the size of the zone->blockflags rounded to an unsigned long 4613 * Calculate the size of the zone->blockflags rounded to an unsigned long
4619 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4614 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4620 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4615 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4621 * round what is now in bits to nearest long in bits, then return it in 4616 * round what is now in bits to nearest long in bits, then return it in
4622 * bytes. 4617 * bytes.
4623 */ 4618 */
4624 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4619 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4625 { 4620 {
4626 unsigned long usemapsize; 4621 unsigned long usemapsize;
4627 4622
4628 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4623 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4629 usemapsize = roundup(zonesize, pageblock_nr_pages); 4624 usemapsize = roundup(zonesize, pageblock_nr_pages);
4630 usemapsize = usemapsize >> pageblock_order; 4625 usemapsize = usemapsize >> pageblock_order;
4631 usemapsize *= NR_PAGEBLOCK_BITS; 4626 usemapsize *= NR_PAGEBLOCK_BITS;
4632 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4627 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4633 4628
4634 return usemapsize / 8; 4629 return usemapsize / 8;
4635 } 4630 }
4636 4631
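To sanity-check the arithmetic above, here is a standalone version with assumed values (pageblock_order = 9, so 512 pages per pageblock, and NR_PAGEBLOCK_BITS = 4); the real constants come from the kernel configuration:

#include <stdio.h>

/* Example values only; the real ones come from the kernel configuration. */
#define PAGEBLOCK_ORDER     9
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)   /* 512 */
#define NR_PAGEBLOCK_BITS   4
#define BITS_PER_LONG_EX    (8 * sizeof(unsigned long))

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
    return ((x + to - 1) / to) * to;
}

static unsigned long usemap_size_example(unsigned long zone_start_pfn, unsigned long zonesize)
{
    unsigned long usemapsize;

    zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);   /* account for unaligned start */
    usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);   /* whole pageblocks */
    usemapsize >>= PAGEBLOCK_ORDER;                          /* number of pageblocks */
    usemapsize *= NR_PAGEBLOCK_BITS;                         /* bits needed */
    usemapsize = roundup_ul(usemapsize, BITS_PER_LONG_EX);   /* round to whole longs */
    return usemapsize / 8;                                   /* bytes */
}

int main(void)
{
    /* 1 GiB zone of 4 KiB pages starting on a pageblock boundary. */
    printf("%lu bytes\n", usemap_size_example(0, 262144));   /* 512 pageblocks * 4 bits = 256 bytes */
    return 0;
}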
4637 static void __init setup_usemap(struct pglist_data *pgdat, 4632 static void __init setup_usemap(struct pglist_data *pgdat,
4638 struct zone *zone, 4633 struct zone *zone,
4639 unsigned long zone_start_pfn, 4634 unsigned long zone_start_pfn,
4640 unsigned long zonesize) 4635 unsigned long zonesize)
4641 { 4636 {
4642 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4637 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4643 zone->pageblock_flags = NULL; 4638 zone->pageblock_flags = NULL;
4644 if (usemapsize) 4639 if (usemapsize)
4645 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4640 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4646 usemapsize); 4641 usemapsize);
4647 } 4642 }
4648 #else 4643 #else
4649 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4644 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4650 unsigned long zone_start_pfn, unsigned long zonesize) {} 4645 unsigned long zone_start_pfn, unsigned long zonesize) {}
4651 #endif /* CONFIG_SPARSEMEM */ 4646 #endif /* CONFIG_SPARSEMEM */
4652 4647
4653 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4648 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4654 4649
4655 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4650 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4656 void __paginginit set_pageblock_order(void) 4651 void __paginginit set_pageblock_order(void)
4657 { 4652 {
4658 unsigned int order; 4653 unsigned int order;
4659 4654
4660 /* Check that pageblock_nr_pages has not already been set up */ 4655 /* Check that pageblock_nr_pages has not already been set up */
4661 if (pageblock_order) 4656 if (pageblock_order)
4662 return; 4657 return;
4663 4658
4664 if (HPAGE_SHIFT > PAGE_SHIFT) 4659 if (HPAGE_SHIFT > PAGE_SHIFT)
4665 order = HUGETLB_PAGE_ORDER; 4660 order = HUGETLB_PAGE_ORDER;
4666 else 4661 else
4667 order = MAX_ORDER - 1; 4662 order = MAX_ORDER - 1;
4668 4663
4669 /* 4664 /*
4670 * Assume the largest contiguous order of interest is a huge page. 4665 * Assume the largest contiguous order of interest is a huge page.
4671 * This value may be variable depending on boot parameters on IA64 and 4666 * This value may be variable depending on boot parameters on IA64 and
4672 * powerpc. 4667 * powerpc.
4673 */ 4668 */
4674 pageblock_order = order; 4669 pageblock_order = order;
4675 } 4670 }
4676 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4671 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4677 4672
4678 /* 4673 /*
4679 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4674 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4680 * is unused as pageblock_order is set at compile-time. See 4675 * is unused as pageblock_order is set at compile-time. See
4681 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4676 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4682 * the kernel config 4677 * the kernel config
4683 */ 4678 */
4684 void __paginginit set_pageblock_order(void) 4679 void __paginginit set_pageblock_order(void)
4685 { 4680 {
4686 } 4681 }
4687 4682
4688 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4683 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4689 4684
4690 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4685 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4691 unsigned long present_pages) 4686 unsigned long present_pages)
4692 { 4687 {
4693 unsigned long pages = spanned_pages; 4688 unsigned long pages = spanned_pages;
4694 4689
4695 /* 4690 /*
4696 * Provide a more accurate estimation if there are holes within 4691 * Provide a more accurate estimation if there are holes within
4697 * the zone and SPARSEMEM is in use. If there are holes within the 4692 * the zone and SPARSEMEM is in use. If there are holes within the
4698 * zone, each populated memory region may cost us one or two extra 4693 * zone, each populated memory region may cost us one or two extra
4699 * memmap pages due to alignment because memmap pages for each 4694 * memmap pages due to alignment because memmap pages for each
4700 * populated region may not be naturally aligned on a page boundary. 4695 * populated region may not be naturally aligned on a page boundary.
4701 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4696 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4702 */ 4697 */
4703 if (spanned_pages > present_pages + (present_pages >> 4) && 4698 if (spanned_pages > present_pages + (present_pages >> 4) &&
4704 IS_ENABLED(CONFIG_SPARSEMEM)) 4699 IS_ENABLED(CONFIG_SPARSEMEM))
4705 pages = present_pages; 4700 pages = present_pages;
4706 4701
4707 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4702 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4708 } 4703 }
4709 4704
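A standalone sketch of the heuristic above, assuming 4 KiB pages and a 64-byte struct page (illustrative values only): when the holes exceed roughly present_pages/16 under SPARSEMEM, the estimate is based on present_pages instead of spanned_pages.

#include <stdio.h>

#define PAGE_SHIFT_EX  12                      /* assumed 4 KiB pages */
#define PAGE_SIZE_EX   (1UL << PAGE_SHIFT_EX)
#define STRUCT_PAGE_SZ 64UL                    /* illustrative sizeof(struct page) */

/* Mirrors calc_memmap_size(): fall back to present_pages when the zone is very sparse. */
static unsigned long memmap_size(unsigned long spanned, unsigned long present, int sparsemem)
{
    unsigned long pages = spanned;

    if (sparsemem && spanned > present + (present >> 4))
        pages = present;                       /* holes > present/16: count only populated memory */

    /* PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT */
    return (pages * STRUCT_PAGE_SZ + PAGE_SIZE_EX - 1) / PAGE_SIZE_EX;
}

int main(void)
{
    /* A zone spanning 1M pages but with only half of them actually present. */
    printf("dense estimate:  %lu memmap pages\n", memmap_size(1 << 20, 1 << 19, 0));  /* 16384 */
    printf("sparse estimate: %lu memmap pages\n", memmap_size(1 << 20, 1 << 19, 1));  /* 8192 */
    return 0;
}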
4710 /* 4705 /*
4711 * Set up the zone data structures: 4706 * Set up the zone data structures:
4712 * - mark all pages reserved 4707 * - mark all pages reserved
4713 * - mark all memory queues empty 4708 * - mark all memory queues empty
4714 * - clear the memory bitmaps 4709 * - clear the memory bitmaps
4715 * 4710 *
4716 * NOTE: pgdat should get zeroed by caller. 4711 * NOTE: pgdat should get zeroed by caller.
4717 */ 4712 */
4718 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4713 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4719 unsigned long node_start_pfn, unsigned long node_end_pfn, 4714 unsigned long node_start_pfn, unsigned long node_end_pfn,
4720 unsigned long *zones_size, unsigned long *zholes_size) 4715 unsigned long *zones_size, unsigned long *zholes_size)
4721 { 4716 {
4722 enum zone_type j; 4717 enum zone_type j;
4723 int nid = pgdat->node_id; 4718 int nid = pgdat->node_id;
4724 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4719 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4725 int ret; 4720 int ret;
4726 4721
4727 pgdat_resize_init(pgdat); 4722 pgdat_resize_init(pgdat);
4728 #ifdef CONFIG_NUMA_BALANCING 4723 #ifdef CONFIG_NUMA_BALANCING
4729 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4724 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4730 pgdat->numabalancing_migrate_nr_pages = 0; 4725 pgdat->numabalancing_migrate_nr_pages = 0;
4731 pgdat->numabalancing_migrate_next_window = jiffies; 4726 pgdat->numabalancing_migrate_next_window = jiffies;
4732 #endif 4727 #endif
4733 init_waitqueue_head(&pgdat->kswapd_wait); 4728 init_waitqueue_head(&pgdat->kswapd_wait);
4734 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4729 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4735 pgdat_page_cgroup_init(pgdat); 4730 pgdat_page_cgroup_init(pgdat);
4736 4731
4737 for (j = 0; j < MAX_NR_ZONES; j++) { 4732 for (j = 0; j < MAX_NR_ZONES; j++) {
4738 struct zone *zone = pgdat->node_zones + j; 4733 struct zone *zone = pgdat->node_zones + j;
4739 unsigned long size, realsize, freesize, memmap_pages; 4734 unsigned long size, realsize, freesize, memmap_pages;
4740 4735
4741 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4736 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4742 node_end_pfn, zones_size); 4737 node_end_pfn, zones_size);
4743 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4738 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4744 node_start_pfn, 4739 node_start_pfn,
4745 node_end_pfn, 4740 node_end_pfn,
4746 zholes_size); 4741 zholes_size);
4747 4742
4748 /* 4743 /*
4749 * Adjust freesize so that it accounts for how much memory 4744 * Adjust freesize so that it accounts for how much memory
4750 * is used by this zone for memmap. This affects the watermark 4745 * is used by this zone for memmap. This affects the watermark
4751 * and per-cpu initialisations 4746 * and per-cpu initialisations
4752 */ 4747 */
4753 memmap_pages = calc_memmap_size(size, realsize); 4748 memmap_pages = calc_memmap_size(size, realsize);
4754 if (freesize >= memmap_pages) { 4749 if (freesize >= memmap_pages) {
4755 freesize -= memmap_pages; 4750 freesize -= memmap_pages;
4756 if (memmap_pages) 4751 if (memmap_pages)
4757 printk(KERN_DEBUG 4752 printk(KERN_DEBUG
4758 " %s zone: %lu pages used for memmap\n", 4753 " %s zone: %lu pages used for memmap\n",
4759 zone_names[j], memmap_pages); 4754 zone_names[j], memmap_pages);
4760 } else 4755 } else
4761 printk(KERN_WARNING 4756 printk(KERN_WARNING
4762 " %s zone: %lu pages exceeds freesize %lu\n", 4757 " %s zone: %lu pages exceeds freesize %lu\n",
4763 zone_names[j], memmap_pages, freesize); 4758 zone_names[j], memmap_pages, freesize);
4764 4759
4765 /* Account for reserved pages */ 4760 /* Account for reserved pages */
4766 if (j == 0 && freesize > dma_reserve) { 4761 if (j == 0 && freesize > dma_reserve) {
4767 freesize -= dma_reserve; 4762 freesize -= dma_reserve;
4768 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4763 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4769 zone_names[0], dma_reserve); 4764 zone_names[0], dma_reserve);
4770 } 4765 }
4771 4766
4772 if (!is_highmem_idx(j)) 4767 if (!is_highmem_idx(j))
4773 nr_kernel_pages += freesize; 4768 nr_kernel_pages += freesize;
4774 /* Charge for highmem memmap if there are enough kernel pages */ 4769 /* Charge for highmem memmap if there are enough kernel pages */
4775 else if (nr_kernel_pages > memmap_pages * 2) 4770 else if (nr_kernel_pages > memmap_pages * 2)
4776 nr_kernel_pages -= memmap_pages; 4771 nr_kernel_pages -= memmap_pages;
4777 nr_all_pages += freesize; 4772 nr_all_pages += freesize;
4778 4773
4779 zone->spanned_pages = size; 4774 zone->spanned_pages = size;
4780 zone->present_pages = realsize; 4775 zone->present_pages = realsize;
4781 /* 4776 /*
4782 * Set an approximate value for lowmem here; it will be adjusted 4777 * Set an approximate value for lowmem here; it will be adjusted
4783 * when the bootmem allocator frees pages into the buddy system. 4778 * when the bootmem allocator frees pages into the buddy system.
4784 * And all highmem pages will be managed by the buddy system. 4779 * And all highmem pages will be managed by the buddy system.
4785 */ 4780 */
4786 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4781 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4787 #ifdef CONFIG_NUMA 4782 #ifdef CONFIG_NUMA
4788 zone->node = nid; 4783 zone->node = nid;
4789 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4784 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4790 / 100; 4785 / 100;
4791 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4786 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4792 #endif 4787 #endif
4793 zone->name = zone_names[j]; 4788 zone->name = zone_names[j];
4794 spin_lock_init(&zone->lock); 4789 spin_lock_init(&zone->lock);
4795 spin_lock_init(&zone->lru_lock); 4790 spin_lock_init(&zone->lru_lock);
4796 zone_seqlock_init(zone); 4791 zone_seqlock_init(zone);
4797 zone->zone_pgdat = pgdat; 4792 zone->zone_pgdat = pgdat;
4798 zone_pcp_init(zone); 4793 zone_pcp_init(zone);
4799 4794
4800 /* For bootup, initialized properly in watermark setup */ 4795 /* For bootup, initialized properly in watermark setup */
4801 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4796 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4802 4797
4803 lruvec_init(&zone->lruvec); 4798 lruvec_init(&zone->lruvec);
4804 if (!size) 4799 if (!size)
4805 continue; 4800 continue;
4806 4801
4807 set_pageblock_order(); 4802 set_pageblock_order();
4808 setup_usemap(pgdat, zone, zone_start_pfn, size); 4803 setup_usemap(pgdat, zone, zone_start_pfn, size);
4809 ret = init_currently_empty_zone(zone, zone_start_pfn, 4804 ret = init_currently_empty_zone(zone, zone_start_pfn,
4810 size, MEMMAP_EARLY); 4805 size, MEMMAP_EARLY);
4811 BUG_ON(ret); 4806 BUG_ON(ret);
4812 memmap_init(size, nid, j, zone_start_pfn); 4807 memmap_init(size, nid, j, zone_start_pfn);
4813 zone_start_pfn += size; 4808 zone_start_pfn += size;
4814 } 4809 }
4815 } 4810 }
4816 4811
4817 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4812 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4818 { 4813 {
4819 /* Skip empty nodes */ 4814 /* Skip empty nodes */
4820 if (!pgdat->node_spanned_pages) 4815 if (!pgdat->node_spanned_pages)
4821 return; 4816 return;
4822 4817
4823 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4818 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4824 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4819 /* ia64 gets its own node_mem_map, before this, without bootmem */
4825 if (!pgdat->node_mem_map) { 4820 if (!pgdat->node_mem_map) {
4826 unsigned long size, start, end; 4821 unsigned long size, start, end;
4827 struct page *map; 4822 struct page *map;
4828 4823
4829 /* 4824 /*
4830 * The zone's endpoints aren't required to be MAX_ORDER 4825 * The zone's endpoints aren't required to be MAX_ORDER
4831 * aligned but the node_mem_map endpoints must be in order 4826 * aligned but the node_mem_map endpoints must be in order
4832 * for the buddy allocator to function correctly. 4827 * for the buddy allocator to function correctly.
4833 */ 4828 */
4834 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4829 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4835 end = pgdat_end_pfn(pgdat); 4830 end = pgdat_end_pfn(pgdat);
4836 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4831 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4837 size = (end - start) * sizeof(struct page); 4832 size = (end - start) * sizeof(struct page);
4838 map = alloc_remap(pgdat->node_id, size); 4833 map = alloc_remap(pgdat->node_id, size);
4839 if (!map) 4834 if (!map)
4840 map = alloc_bootmem_node_nopanic(pgdat, size); 4835 map = alloc_bootmem_node_nopanic(pgdat, size);
4841 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4836 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4842 } 4837 }
4843 #ifndef CONFIG_NEED_MULTIPLE_NODES 4838 #ifndef CONFIG_NEED_MULTIPLE_NODES
4844 /* 4839 /*
4845 * With no DISCONTIG, the global mem_map is just set as node 0's 4840 * With no DISCONTIG, the global mem_map is just set as node 0's
4846 */ 4841 */
4847 if (pgdat == NODE_DATA(0)) { 4842 if (pgdat == NODE_DATA(0)) {
4848 mem_map = NODE_DATA(0)->node_mem_map; 4843 mem_map = NODE_DATA(0)->node_mem_map;
4849 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4844 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4850 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4845 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4851 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4846 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4852 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4847 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4853 } 4848 }
4854 #endif 4849 #endif
4855 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4850 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4856 } 4851 }
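The start/end rounding above can be checked with ordinary arithmetic. The sketch below is standalone and assumes MAX_ORDER_NR_PAGES of 1024 and a 64-byte struct page purely for illustration:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES_EX 1024UL   /* example value only */
#define STRUCT_PAGE_SZ        64UL     /* illustrative sizeof(struct page) */

int main(void)
{
    /* A node whose span is not MAX_ORDER aligned (example PFNs). */
    unsigned long node_start_pfn = 0x10203, node_end_pfn = 0x20801;

    unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES_EX - 1);  /* round down */
    unsigned long end   = (node_end_pfn + MAX_ORDER_NR_PAGES_EX - 1) &
                          ~(MAX_ORDER_NR_PAGES_EX - 1);                   /* ALIGN() up */
    unsigned long size  = (end - start) * STRUCT_PAGE_SZ;

    printf("mem_map covers [%#lx, %#lx), %lu bytes\n", start, end, size);
    /* node_mem_map then points at map + (node_start_pfn - start). */
    return 0;
}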
4857 4852
4858 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4853 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4859 unsigned long node_start_pfn, unsigned long *zholes_size) 4854 unsigned long node_start_pfn, unsigned long *zholes_size)
4860 { 4855 {
4861 pg_data_t *pgdat = NODE_DATA(nid); 4856 pg_data_t *pgdat = NODE_DATA(nid);
4862 unsigned long start_pfn = 0; 4857 unsigned long start_pfn = 0;
4863 unsigned long end_pfn = 0; 4858 unsigned long end_pfn = 0;
4864 4859
4865 /* pg_data_t should be reset to zero when it's allocated */ 4860 /* pg_data_t should be reset to zero when it's allocated */
4866 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4861 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4867 4862
4868 pgdat->node_id = nid; 4863 pgdat->node_id = nid;
4869 pgdat->node_start_pfn = node_start_pfn; 4864 pgdat->node_start_pfn = node_start_pfn;
4870 init_zone_allows_reclaim(nid); 4865 init_zone_allows_reclaim(nid);
4871 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4866 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4872 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4867 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4873 #endif 4868 #endif
4874 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4869 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4875 zones_size, zholes_size); 4870 zones_size, zholes_size);
4876 4871
4877 alloc_node_mem_map(pgdat); 4872 alloc_node_mem_map(pgdat);
4878 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4873 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4879 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4874 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4880 nid, (unsigned long)pgdat, 4875 nid, (unsigned long)pgdat,
4881 (unsigned long)pgdat->node_mem_map); 4876 (unsigned long)pgdat->node_mem_map);
4882 #endif 4877 #endif
4883 4878
4884 free_area_init_core(pgdat, start_pfn, end_pfn, 4879 free_area_init_core(pgdat, start_pfn, end_pfn,
4885 zones_size, zholes_size); 4880 zones_size, zholes_size);
4886 } 4881 }
4887 4882
4888 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4883 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4889 4884
4890 #if MAX_NUMNODES > 1 4885 #if MAX_NUMNODES > 1
4891 /* 4886 /*
4892 * Figure out the number of possible node ids. 4887 * Figure out the number of possible node ids.
4893 */ 4888 */
4894 void __init setup_nr_node_ids(void) 4889 void __init setup_nr_node_ids(void)
4895 { 4890 {
4896 unsigned int node; 4891 unsigned int node;
4897 unsigned int highest = 0; 4892 unsigned int highest = 0;
4898 4893
4899 for_each_node_mask(node, node_possible_map) 4894 for_each_node_mask(node, node_possible_map)
4900 highest = node; 4895 highest = node;
4901 nr_node_ids = highest + 1; 4896 nr_node_ids = highest + 1;
4902 } 4897 }
4903 #endif 4898 #endif
4904 4899
4905 /** 4900 /**
4906 * node_map_pfn_alignment - determine the maximum internode alignment 4901 * node_map_pfn_alignment - determine the maximum internode alignment
4907 * 4902 *
4908 * This function should be called after node map is populated and sorted. 4903 * This function should be called after node map is populated and sorted.
4909 * It calculates the maximum power of two alignment which can distinguish 4904 * It calculates the maximum power of two alignment which can distinguish
4910 * all the nodes. 4905 * all the nodes.
4911 * 4906 *
4912 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4907 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4913 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4908 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4914 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4909 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4915 * shifted, 1GiB is enough and this function will indicate so. 4910 * shifted, 1GiB is enough and this function will indicate so.
4916 * 4911 *
4917 * This is used to test whether pfn -> nid mapping of the chosen memory 4912 * This is used to test whether pfn -> nid mapping of the chosen memory
4918 * model has fine enough granularity to avoid incorrect mapping for the 4913 * model has fine enough granularity to avoid incorrect mapping for the
4919 * populated node map. 4914 * populated node map.
4920 * 4915 *
4921 * Returns the determined alignment in PFNs. 0 if there is no alignment 4916 * Returns the determined alignment in PFNs. 0 if there is no alignment
4922 * requirement (single node). 4917 * requirement (single node).
4923 */ 4918 */
4924 unsigned long __init node_map_pfn_alignment(void) 4919 unsigned long __init node_map_pfn_alignment(void)
4925 { 4920 {
4926 unsigned long accl_mask = 0, last_end = 0; 4921 unsigned long accl_mask = 0, last_end = 0;
4927 unsigned long start, end, mask; 4922 unsigned long start, end, mask;
4928 int last_nid = -1; 4923 int last_nid = -1;
4929 int i, nid; 4924 int i, nid;
4930 4925
4931 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4926 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4932 if (!start || last_nid < 0 || last_nid == nid) { 4927 if (!start || last_nid < 0 || last_nid == nid) {
4933 last_nid = nid; 4928 last_nid = nid;
4934 last_end = end; 4929 last_end = end;
4935 continue; 4930 continue;
4936 } 4931 }
4937 4932
4938 /* 4933 /*
4939 * Start with a mask granular enough to pin-point to the 4934 * Start with a mask granular enough to pin-point to the
4940 * start pfn and tick off bits one-by-one until it becomes 4935 * start pfn and tick off bits one-by-one until it becomes
4941 * too coarse to separate the current node from the last. 4936 * too coarse to separate the current node from the last.
4942 */ 4937 */
4943 mask = ~((1 << __ffs(start)) - 1); 4938 mask = ~((1 << __ffs(start)) - 1);
4944 while (mask && last_end <= (start & (mask << 1))) 4939 while (mask && last_end <= (start & (mask << 1)))
4945 mask <<= 1; 4940 mask <<= 1;
4946 4941
4947 /* accumulate all internode masks */ 4942 /* accumulate all internode masks */
4948 accl_mask |= mask; 4943 accl_mask |= mask;
4949 } 4944 }
4950 4945
4951 /* convert mask to number of pages */ 4946 /* convert mask to number of pages */
4952 return ~accl_mask + 1; 4947 return ~accl_mask + 1;
4953 } 4948 }
4954 4949
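Putting numbers on the example in the comment above, with an assumed PAGE_SHIFT of 12 (4 KiB pages):

#include <stdio.h>

#define PAGE_SHIFT_EX 12   /* assumed 4 KiB pages */

int main(void)
{
    /* All nodes 1 GiB-sized and 1 GiB-aligned: 1 GiB granularity is enough. */
    printf("1 GiB alignment   = %lu pfns\n", 1UL << (30 - PAGE_SHIFT_EX));   /* 262144 */

    /* Nodes shifted by 256 MiB: pfn -> nid mapping must resolve 256 MiB granules. */
    printf("256 MiB alignment = %lu pfns\n", 1UL << (28 - PAGE_SHIFT_EX));   /* 65536 */
    return 0;
}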
4955 /* Find the lowest pfn for a node */ 4950 /* Find the lowest pfn for a node */
4956 static unsigned long __init find_min_pfn_for_node(int nid) 4951 static unsigned long __init find_min_pfn_for_node(int nid)
4957 { 4952 {
4958 unsigned long min_pfn = ULONG_MAX; 4953 unsigned long min_pfn = ULONG_MAX;
4959 unsigned long start_pfn; 4954 unsigned long start_pfn;
4960 int i; 4955 int i;
4961 4956
4962 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 4957 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4963 min_pfn = min(min_pfn, start_pfn); 4958 min_pfn = min(min_pfn, start_pfn);
4964 4959
4965 if (min_pfn == ULONG_MAX) { 4960 if (min_pfn == ULONG_MAX) {
4966 printk(KERN_WARNING 4961 printk(KERN_WARNING
4967 "Could not find start_pfn for node %d\n", nid); 4962 "Could not find start_pfn for node %d\n", nid);
4968 return 0; 4963 return 0;
4969 } 4964 }
4970 4965
4971 return min_pfn; 4966 return min_pfn;
4972 } 4967 }
4973 4968
4974 /** 4969 /**
4975 * find_min_pfn_with_active_regions - Find the minimum PFN registered 4970 * find_min_pfn_with_active_regions - Find the minimum PFN registered
4976 * 4971 *
4977 * It returns the minimum PFN based on information provided via 4972 * It returns the minimum PFN based on information provided via
4978 * add_active_range(). 4973 * add_active_range().
4979 */ 4974 */
4980 unsigned long __init find_min_pfn_with_active_regions(void) 4975 unsigned long __init find_min_pfn_with_active_regions(void)
4981 { 4976 {
4982 return find_min_pfn_for_node(MAX_NUMNODES); 4977 return find_min_pfn_for_node(MAX_NUMNODES);
4983 } 4978 }
4984 4979
4985 /* 4980 /*
4986 * early_calculate_totalpages() 4981 * early_calculate_totalpages()
4987 * Sum pages in active regions for movable zone. 4982 * Sum pages in active regions for movable zone.
4988 * Populate N_MEMORY for calculating usable_nodes. 4983 * Populate N_MEMORY for calculating usable_nodes.
4989 */ 4984 */
4990 static unsigned long __init early_calculate_totalpages(void) 4985 static unsigned long __init early_calculate_totalpages(void)
4991 { 4986 {
4992 unsigned long totalpages = 0; 4987 unsigned long totalpages = 0;
4993 unsigned long start_pfn, end_pfn; 4988 unsigned long start_pfn, end_pfn;
4994 int i, nid; 4989 int i, nid;
4995 4990
4996 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 4991 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4997 unsigned long pages = end_pfn - start_pfn; 4992 unsigned long pages = end_pfn - start_pfn;
4998 4993
4999 totalpages += pages; 4994 totalpages += pages;
5000 if (pages) 4995 if (pages)
5001 node_set_state(nid, N_MEMORY); 4996 node_set_state(nid, N_MEMORY);
5002 } 4997 }
5003 return totalpages; 4998 return totalpages;
5004 } 4999 }
5005 5000
5006 /* 5001 /*
5007 * Find the PFN the Movable zone begins in each node. Kernel memory 5002 * Find the PFN the Movable zone begins in each node. Kernel memory
5008 * is spread evenly between nodes as long as the nodes have enough 5003 * is spread evenly between nodes as long as the nodes have enough
5009 * memory. When they don't, some nodes will have more kernelcore than 5004 * memory. When they don't, some nodes will have more kernelcore than
5010 * others 5005 * others
5011 */ 5006 */
5012 static void __init find_zone_movable_pfns_for_nodes(void) 5007 static void __init find_zone_movable_pfns_for_nodes(void)
5013 { 5008 {
5014 int i, nid; 5009 int i, nid;
5015 unsigned long usable_startpfn; 5010 unsigned long usable_startpfn;
5016 unsigned long kernelcore_node, kernelcore_remaining; 5011 unsigned long kernelcore_node, kernelcore_remaining;
5017 /* save the state before borrowing the nodemask */ 5012 /* save the state before borrowing the nodemask */
5018 nodemask_t saved_node_state = node_states[N_MEMORY]; 5013 nodemask_t saved_node_state = node_states[N_MEMORY];
5019 unsigned long totalpages = early_calculate_totalpages(); 5014 unsigned long totalpages = early_calculate_totalpages();
5020 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5015 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5021 5016
5022 /* 5017 /*
5023 * If movablecore was specified, calculate the size of 5018 * If movablecore was specified, calculate the size of
5024 * kernelcore that corresponds so that memory usable for 5019 * kernelcore that corresponds so that memory usable for
5025 * any allocation type is evenly spread. If both kernelcore 5020 * any allocation type is evenly spread. If both kernelcore
5026 * and movablecore are specified, then the value of kernelcore 5021 * and movablecore are specified, then the value of kernelcore
5027 * will be used for required_kernelcore if it's greater than 5022 * will be used for required_kernelcore if it's greater than
5028 * what movablecore would have allowed. 5023 * what movablecore would have allowed.
5029 */ 5024 */
5030 if (required_movablecore) { 5025 if (required_movablecore) {
5031 unsigned long corepages; 5026 unsigned long corepages;
5032 5027
5033 /* 5028 /*
5034 * Round-up so that ZONE_MOVABLE is at least as large as what 5029 * Round-up so that ZONE_MOVABLE is at least as large as what
5035 * was requested by the user 5030 * was requested by the user
5036 */ 5031 */
5037 required_movablecore = 5032 required_movablecore =
5038 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5033 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5039 corepages = totalpages - required_movablecore; 5034 corepages = totalpages - required_movablecore;
5040 5035
5041 required_kernelcore = max(required_kernelcore, corepages); 5036 required_kernelcore = max(required_kernelcore, corepages);
5042 } 5037 }
5043 5038
5044 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5039 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5045 if (!required_kernelcore) 5040 if (!required_kernelcore)
5046 goto out; 5041 goto out;
5047 5042
5048 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5043 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5049 find_usable_zone_for_movable(); 5044 find_usable_zone_for_movable();
5050 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5045 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5051 5046
5052 restart: 5047 restart:
5053 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5048 /* Spread kernelcore memory as evenly as possible throughout nodes */
5054 kernelcore_node = required_kernelcore / usable_nodes; 5049 kernelcore_node = required_kernelcore / usable_nodes;
5055 for_each_node_state(nid, N_MEMORY) { 5050 for_each_node_state(nid, N_MEMORY) {
5056 unsigned long start_pfn, end_pfn; 5051 unsigned long start_pfn, end_pfn;
5057 5052
5058 /* 5053 /*
5059 * Recalculate kernelcore_node if the division per node 5054 * Recalculate kernelcore_node if the division per node
5060 * now exceeds what is necessary to satisfy the requested 5055 * now exceeds what is necessary to satisfy the requested
5061 * amount of memory for the kernel 5056 * amount of memory for the kernel
5062 */ 5057 */
5063 if (required_kernelcore < kernelcore_node) 5058 if (required_kernelcore < kernelcore_node)
5064 kernelcore_node = required_kernelcore / usable_nodes; 5059 kernelcore_node = required_kernelcore / usable_nodes;
5065 5060
5066 /* 5061 /*
5067 * As the map is walked, we track how much memory is usable 5062 * As the map is walked, we track how much memory is usable
5068 * by the kernel using kernelcore_remaining. When it is 5063 * by the kernel using kernelcore_remaining. When it is
5069 * 0, the rest of the node is usable by ZONE_MOVABLE 5064 * 0, the rest of the node is usable by ZONE_MOVABLE
5070 */ 5065 */
5071 kernelcore_remaining = kernelcore_node; 5066 kernelcore_remaining = kernelcore_node;
5072 5067
5073 /* Go through each range of PFNs within this node */ 5068 /* Go through each range of PFNs within this node */
5074 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5069 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5075 unsigned long size_pages; 5070 unsigned long size_pages;
5076 5071
5077 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5072 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5078 if (start_pfn >= end_pfn) 5073 if (start_pfn >= end_pfn)
5079 continue; 5074 continue;
5080 5075
5081 /* Account for what is only usable for kernelcore */ 5076 /* Account for what is only usable for kernelcore */
5082 if (start_pfn < usable_startpfn) { 5077 if (start_pfn < usable_startpfn) {
5083 unsigned long kernel_pages; 5078 unsigned long kernel_pages;
5084 kernel_pages = min(end_pfn, usable_startpfn) 5079 kernel_pages = min(end_pfn, usable_startpfn)
5085 - start_pfn; 5080 - start_pfn;
5086 5081
5087 kernelcore_remaining -= min(kernel_pages, 5082 kernelcore_remaining -= min(kernel_pages,
5088 kernelcore_remaining); 5083 kernelcore_remaining);
5089 required_kernelcore -= min(kernel_pages, 5084 required_kernelcore -= min(kernel_pages,
5090 required_kernelcore); 5085 required_kernelcore);
5091 5086
5092 /* Continue if range is now fully accounted */ 5087 /* Continue if range is now fully accounted */
5093 if (end_pfn <= usable_startpfn) { 5088 if (end_pfn <= usable_startpfn) {
5094 5089
5095 /* 5090 /*
5096 * Push zone_movable_pfn to the end so 5091 * Push zone_movable_pfn to the end so
5097 * that if we have to rebalance 5092 * that if we have to rebalance
5098 * kernelcore across nodes, we will 5093 * kernelcore across nodes, we will
5099 * not double account here 5094 * not double account here
5100 */ 5095 */
5101 zone_movable_pfn[nid] = end_pfn; 5096 zone_movable_pfn[nid] = end_pfn;
5102 continue; 5097 continue;
5103 } 5098 }
5104 start_pfn = usable_startpfn; 5099 start_pfn = usable_startpfn;
5105 } 5100 }
5106 5101
5107 /* 5102 /*
5108 * The usable PFN range for ZONE_MOVABLE is from 5103 * The usable PFN range for ZONE_MOVABLE is from
5109 * start_pfn->end_pfn. Calculate size_pages as the 5104 * start_pfn->end_pfn. Calculate size_pages as the
5110 * number of pages used as kernelcore 5105 * number of pages used as kernelcore
5111 */ 5106 */
5112 size_pages = end_pfn - start_pfn; 5107 size_pages = end_pfn - start_pfn;
5113 if (size_pages > kernelcore_remaining) 5108 if (size_pages > kernelcore_remaining)
5114 size_pages = kernelcore_remaining; 5109 size_pages = kernelcore_remaining;
5115 zone_movable_pfn[nid] = start_pfn + size_pages; 5110 zone_movable_pfn[nid] = start_pfn + size_pages;
5116 5111
5117 /* 5112 /*
5118 * Some kernelcore has been met, update counts and 5113 * Some kernelcore has been met, update counts and
5119 * break if the kernelcore for this node has been 5114 * break if the kernelcore for this node has been
5120 * satisfied 5115 * satisfied
5121 */ 5116 */
5122 required_kernelcore -= min(required_kernelcore, 5117 required_kernelcore -= min(required_kernelcore,
5123 size_pages); 5118 size_pages);
5124 kernelcore_remaining -= size_pages; 5119 kernelcore_remaining -= size_pages;
5125 if (!kernelcore_remaining) 5120 if (!kernelcore_remaining)
5126 break; 5121 break;
5127 } 5122 }
5128 } 5123 }
5129 5124
5130 /* 5125 /*
5131 * If there is still required_kernelcore, we do another pass with one 5126 * If there is still required_kernelcore, we do another pass with one
5132 * less node in the count. This will push zone_movable_pfn[nid] further 5127 * less node in the count. This will push zone_movable_pfn[nid] further
5133 * along on the nodes that still have memory until kernelcore is 5128 * along on the nodes that still have memory until kernelcore is
5134 * satisfied 5129 * satisfied
5135 */ 5130 */
5136 usable_nodes--; 5131 usable_nodes--;
5137 if (usable_nodes && required_kernelcore > usable_nodes) 5132 if (usable_nodes && required_kernelcore > usable_nodes)
5138 goto restart; 5133 goto restart;
5139 5134
5140 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5135 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5141 for (nid = 0; nid < MAX_NUMNODES; nid++) 5136 for (nid = 0; nid < MAX_NUMNODES; nid++)
5142 zone_movable_pfn[nid] = 5137 zone_movable_pfn[nid] =
5143 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5138 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5144 5139
5145 out: 5140 out:
5146 /* restore the node_state */ 5141 /* restore the node_state */
5147 node_states[N_MEMORY] = saved_node_state; 5142 node_states[N_MEMORY] = saved_node_state;
5148 } 5143 }
5149 5144
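The kernelcore spreading above involves several passes and corner cases; the standalone sketch below (example values, one contiguous range per node, every node large enough, no restart pass, no MAX_ORDER rounding) only illustrates the basic even split that determines where ZONE_MOVABLE begins on each node:

#include <stdio.h>

/* Greatly simplified: each node is one contiguous range, all of it above the
 * lowest PFN ZONE_MOVABLE may use, and every node has enough memory, so no
 * restart pass is needed.  Example values only. */
struct node_range { unsigned long start_pfn, end_pfn; };

int main(void)
{
    struct node_range nodes[2] = { { 0x00000, 0x40000 },    /* node 0: 1 GiB of 4 KiB pages */
                                   { 0x40000, 0x80000 } };  /* node 1: 1 GiB */
    unsigned long required_kernelcore = 0x20000;            /* 512 MiB requested as kernelcore */
    unsigned long kernelcore_node = required_kernelcore / 2;/* spread evenly over 2 nodes */

    for (int nid = 0; nid < 2; nid++) {
        unsigned long zone_movable_pfn = nodes[nid].start_pfn + kernelcore_node;
        printf("node %d: kernel pages [%#lx, %#lx), ZONE_MOVABLE from %#lx\n",
               nid, nodes[nid].start_pfn, zone_movable_pfn, zone_movable_pfn);
    }
    return 0;
}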
5150 /* Any regular or high memory on that node? */ 5145 /* Any regular or high memory on that node? */
5151 static void check_for_memory(pg_data_t *pgdat, int nid) 5146 static void check_for_memory(pg_data_t *pgdat, int nid)
5152 { 5147 {
5153 enum zone_type zone_type; 5148 enum zone_type zone_type;
5154 5149
5155 if (N_MEMORY == N_NORMAL_MEMORY) 5150 if (N_MEMORY == N_NORMAL_MEMORY)
5156 return; 5151 return;
5157 5152
5158 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5153 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5159 struct zone *zone = &pgdat->node_zones[zone_type]; 5154 struct zone *zone = &pgdat->node_zones[zone_type];
5160 if (populated_zone(zone)) { 5155 if (populated_zone(zone)) {
5161 node_set_state(nid, N_HIGH_MEMORY); 5156 node_set_state(nid, N_HIGH_MEMORY);
5162 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5157 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5163 zone_type <= ZONE_NORMAL) 5158 zone_type <= ZONE_NORMAL)
5164 node_set_state(nid, N_NORMAL_MEMORY); 5159 node_set_state(nid, N_NORMAL_MEMORY);
5165 break; 5160 break;
5166 } 5161 }
5167 } 5162 }
5168 } 5163 }
5169 5164
5170 /** 5165 /**
5171 * free_area_init_nodes - Initialise all pg_data_t and zone data 5166 * free_area_init_nodes - Initialise all pg_data_t and zone data
5172 * @max_zone_pfn: an array of max PFNs for each zone 5167 * @max_zone_pfn: an array of max PFNs for each zone
5173 * 5168 *
5174 * This will call free_area_init_node() for each active node in the system. 5169 * This will call free_area_init_node() for each active node in the system.
5175 * Using the page ranges provided by add_active_range(), the size of each 5170 * Using the page ranges provided by add_active_range(), the size of each
5176 * zone in each node and their holes are calculated. If the maximum PFNs 5171 * zone in each node and their holes are calculated. If the maximum PFNs
5177 * of two adjacent zones match, it is assumed that the zone is empty. 5172 * of two adjacent zones match, it is assumed that the zone is empty.
5178 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5173 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5179 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5174 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5180 * starts where the previous one ended. For example, ZONE_DMA32 starts 5175 * starts where the previous one ended. For example, ZONE_DMA32 starts
5181 * at arch_max_dma_pfn. 5176 * at arch_max_dma_pfn.
5182 */ 5177 */
5183 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5178 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5184 { 5179 {
5185 unsigned long start_pfn, end_pfn; 5180 unsigned long start_pfn, end_pfn;
5186 int i, nid; 5181 int i, nid;
5187 5182
5188 /* Record where the zone boundaries are */ 5183 /* Record where the zone boundaries are */
5189 memset(arch_zone_lowest_possible_pfn, 0, 5184 memset(arch_zone_lowest_possible_pfn, 0,
5190 sizeof(arch_zone_lowest_possible_pfn)); 5185 sizeof(arch_zone_lowest_possible_pfn));
5191 memset(arch_zone_highest_possible_pfn, 0, 5186 memset(arch_zone_highest_possible_pfn, 0,
5192 sizeof(arch_zone_highest_possible_pfn)); 5187 sizeof(arch_zone_highest_possible_pfn));
5193 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5188 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5194 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5189 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5195 for (i = 1; i < MAX_NR_ZONES; i++) { 5190 for (i = 1; i < MAX_NR_ZONES; i++) {
5196 if (i == ZONE_MOVABLE) 5191 if (i == ZONE_MOVABLE)
5197 continue; 5192 continue;
5198 arch_zone_lowest_possible_pfn[i] = 5193 arch_zone_lowest_possible_pfn[i] =
5199 arch_zone_highest_possible_pfn[i-1]; 5194 arch_zone_highest_possible_pfn[i-1];
5200 arch_zone_highest_possible_pfn[i] = 5195 arch_zone_highest_possible_pfn[i] =
5201 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5196 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5202 } 5197 }
5203 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5198 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5204 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5199 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5205 5200
5206 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5201 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5207 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5202 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5208 find_zone_movable_pfns_for_nodes(); 5203 find_zone_movable_pfns_for_nodes();
5209 5204
5210 /* Print out the zone ranges */ 5205 /* Print out the zone ranges */
5211 printk("Zone ranges:\n"); 5206 printk("Zone ranges:\n");
5212 for (i = 0; i < MAX_NR_ZONES; i++) { 5207 for (i = 0; i < MAX_NR_ZONES; i++) {
5213 if (i == ZONE_MOVABLE) 5208 if (i == ZONE_MOVABLE)
5214 continue; 5209 continue;
5215 printk(KERN_CONT " %-8s ", zone_names[i]); 5210 printk(KERN_CONT " %-8s ", zone_names[i]);
5216 if (arch_zone_lowest_possible_pfn[i] == 5211 if (arch_zone_lowest_possible_pfn[i] ==
5217 arch_zone_highest_possible_pfn[i]) 5212 arch_zone_highest_possible_pfn[i])
5218 printk(KERN_CONT "empty\n"); 5213 printk(KERN_CONT "empty\n");
5219 else 5214 else
5220 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5215 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5221 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5216 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5222 (arch_zone_highest_possible_pfn[i] 5217 (arch_zone_highest_possible_pfn[i]
5223 << PAGE_SHIFT) - 1); 5218 << PAGE_SHIFT) - 1);
5224 } 5219 }
5225 5220
5226 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5221 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5227 printk("Movable zone start for each node\n"); 5222 printk("Movable zone start for each node\n");
5228 for (i = 0; i < MAX_NUMNODES; i++) { 5223 for (i = 0; i < MAX_NUMNODES; i++) {
5229 if (zone_movable_pfn[i]) 5224 if (zone_movable_pfn[i])
5230 printk(" Node %d: %#010lx\n", i, 5225 printk(" Node %d: %#010lx\n", i,
5231 zone_movable_pfn[i] << PAGE_SHIFT); 5226 zone_movable_pfn[i] << PAGE_SHIFT);
5232 } 5227 }
5233 5228
5234 /* Print out the early node map */ 5229 /* Print out the early node map */
5235 printk("Early memory node ranges\n"); 5230 printk("Early memory node ranges\n");
5236 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5231 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5237 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5232 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5238 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5233 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5239 5234
5240 /* Initialise every node */ 5235 /* Initialise every node */
5241 mminit_verify_pageflags_layout(); 5236 mminit_verify_pageflags_layout();
5242 setup_nr_node_ids(); 5237 setup_nr_node_ids();
5243 for_each_online_node(nid) { 5238 for_each_online_node(nid) {
5244 pg_data_t *pgdat = NODE_DATA(nid); 5239 pg_data_t *pgdat = NODE_DATA(nid);
5245 free_area_init_node(nid, NULL, 5240 free_area_init_node(nid, NULL,
5246 find_min_pfn_for_node(nid), NULL); 5241 find_min_pfn_for_node(nid), NULL);
5247 5242
5248 /* Any memory on that node */ 5243 /* Any memory on that node */
5249 if (pgdat->node_present_pages) 5244 if (pgdat->node_present_pages)
5250 node_set_state(nid, N_MEMORY); 5245 node_set_state(nid, N_MEMORY);
5251 check_for_memory(pgdat, nid); 5246 check_for_memory(pgdat, nid);
5252 } 5247 }
5253 } 5248 }
5254 5249
5255 static int __init cmdline_parse_core(char *p, unsigned long *core) 5250 static int __init cmdline_parse_core(char *p, unsigned long *core)
5256 { 5251 {
5257 unsigned long long coremem; 5252 unsigned long long coremem;
5258 if (!p) 5253 if (!p)
5259 return -EINVAL; 5254 return -EINVAL;
5260 5255
5261 coremem = memparse(p, &p); 5256 coremem = memparse(p, &p);
5262 *core = coremem >> PAGE_SHIFT; 5257 *core = coremem >> PAGE_SHIFT;
5263 5258
5264 /* Paranoid check that UL is enough for the coremem value */ 5259 /* Paranoid check that UL is enough for the coremem value */
5265 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5260 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5266 5261
5267 return 0; 5262 return 0;
5268 } 5263 }
5269 5264
5270 /* 5265 /*
5271 * kernelcore=size sets the amount of memory for use for allocations that 5266 * kernelcore=size sets the amount of memory for use for allocations that
5272 * cannot be reclaimed or migrated. 5267 * cannot be reclaimed or migrated.
5273 */ 5268 */
5274 static int __init cmdline_parse_kernelcore(char *p) 5269 static int __init cmdline_parse_kernelcore(char *p)
5275 { 5270 {
5276 return cmdline_parse_core(p, &required_kernelcore); 5271 return cmdline_parse_core(p, &required_kernelcore);
5277 } 5272 }
5278 5273
5279 /* 5274 /*
5280 * movablecore=size sets the amount of memory for use for allocations that 5275 * movablecore=size sets the amount of memory for use for allocations that
5281 * can be reclaimed or migrated. 5276 * can be reclaimed or migrated.
5282 */ 5277 */
5283 static int __init cmdline_parse_movablecore(char *p) 5278 static int __init cmdline_parse_movablecore(char *p)
5284 { 5279 {
5285 return cmdline_parse_core(p, &required_movablecore); 5280 return cmdline_parse_core(p, &required_movablecore);
5286 } 5281 }
5287 5282
5288 early_param("kernelcore", cmdline_parse_kernelcore); 5283 early_param("kernelcore", cmdline_parse_kernelcore);
5289 early_param("movablecore", cmdline_parse_movablecore); 5284 early_param("movablecore", cmdline_parse_movablecore);
5290 5285
5291 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5286 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
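Both parameters above take a memparse()-style size on the kernel command line (plain bytes or a K/M/G suffix), which cmdline_parse_core() converts to pages by shifting right by PAGE_SHIFT. The sizes below are arbitrary examples, not recommendations:

    kernelcore=512M    reserve roughly 512 MiB, spread across nodes, for unmovable allocations
    movablecore=4G     request roughly 4 GiB of ZONE_MOVABLE; the corresponding kernelcore is derived from it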
5292 5287
5293 void adjust_managed_page_count(struct page *page, long count) 5288 void adjust_managed_page_count(struct page *page, long count)
5294 { 5289 {
5295 spin_lock(&managed_page_count_lock); 5290 spin_lock(&managed_page_count_lock);
5296 page_zone(page)->managed_pages += count; 5291 page_zone(page)->managed_pages += count;
5297 totalram_pages += count; 5292 totalram_pages += count;
5298 #ifdef CONFIG_HIGHMEM 5293 #ifdef CONFIG_HIGHMEM
5299 if (PageHighMem(page)) 5294 if (PageHighMem(page))
5300 totalhigh_pages += count; 5295 totalhigh_pages += count;
5301 #endif 5296 #endif
5302 spin_unlock(&managed_page_count_lock); 5297 spin_unlock(&managed_page_count_lock);
5303 } 5298 }
5304 EXPORT_SYMBOL(adjust_managed_page_count); 5299 EXPORT_SYMBOL(adjust_managed_page_count);
5305 5300
5306 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5301 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5307 { 5302 {
5308 void *pos; 5303 void *pos;
5309 unsigned long pages = 0; 5304 unsigned long pages = 0;
5310 5305
5311 start = (void *)PAGE_ALIGN((unsigned long)start); 5306 start = (void *)PAGE_ALIGN((unsigned long)start);
5312 end = (void *)((unsigned long)end & PAGE_MASK); 5307 end = (void *)((unsigned long)end & PAGE_MASK);
5313 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5308 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5314 if ((unsigned int)poison <= 0xFF) 5309 if ((unsigned int)poison <= 0xFF)
5315 memset(pos, poison, PAGE_SIZE); 5310 memset(pos, poison, PAGE_SIZE);
5316 free_reserved_page(virt_to_page(pos)); 5311 free_reserved_page(virt_to_page(pos));
5317 } 5312 }
5318 5313
5319 if (pages && s) 5314 if (pages && s)
5320 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5315 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5321 s, pages << (PAGE_SHIFT - 10), start, end); 5316 s, pages << (PAGE_SHIFT - 10), start, end);
5322 5317
5323 return pages; 5318 return pages;
5324 } 5319 }
5325 EXPORT_SYMBOL(free_reserved_area); 5320 EXPORT_SYMBOL(free_reserved_area);
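As a usage sketch (call shape only, not a standalone program): a caller hands free_reserved_area() the virtual bounds of a reserved region. Per the code above, a poison value in 0x00-0xFF memsets each page first, a negative value (e.g. -1) skips the poisoning, and the label shows up in the "Freeing %s memory" line. The region symbols below are hypothetical.

/* Hypothetical region symbols; the call shape matches the signature above. */
extern char region_start[], region_end[];

static void release_boot_scratch(void)
{
    unsigned long freed;

    /* Poison with 0xcc, return the pages to the buddy allocator, and log
     * them under the "boot scratch" label. */
    freed = free_reserved_area(region_start, region_end, 0xcc, "boot scratch");
    (void)freed;    /* number of pages actually freed */
}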
5326 5321
5327 #ifdef CONFIG_HIGHMEM 5322 #ifdef CONFIG_HIGHMEM
5328 void free_highmem_page(struct page *page) 5323 void free_highmem_page(struct page *page)
5329 { 5324 {
5330 __free_reserved_page(page); 5325 __free_reserved_page(page);
5331 totalram_pages++; 5326 totalram_pages++;
5332 page_zone(page)->managed_pages++; 5327 page_zone(page)->managed_pages++;
5333 totalhigh_pages++; 5328 totalhigh_pages++;
5334 } 5329 }
5335 #endif 5330 #endif
5336 5331
5337 5332
5338 void __init mem_init_print_info(const char *str) 5333 void __init mem_init_print_info(const char *str)
5339 { 5334 {
5340 unsigned long physpages, codesize, datasize, rosize, bss_size; 5335 unsigned long physpages, codesize, datasize, rosize, bss_size;
5341 unsigned long init_code_size, init_data_size; 5336 unsigned long init_code_size, init_data_size;
5342 5337
5343 physpages = get_num_physpages(); 5338 physpages = get_num_physpages();
5344 codesize = _etext - _stext; 5339 codesize = _etext - _stext;
5345 datasize = _edata - _sdata; 5340 datasize = _edata - _sdata;
5346 rosize = __end_rodata - __start_rodata; 5341 rosize = __end_rodata - __start_rodata;
5347 bss_size = __bss_stop - __bss_start; 5342 bss_size = __bss_stop - __bss_start;
5348 init_data_size = __init_end - __init_begin; 5343 init_data_size = __init_end - __init_begin;
5349 init_code_size = _einittext - _sinittext; 5344 init_code_size = _einittext - _sinittext;
5350 5345
5351 /* 5346 /*
5352 * Detect special cases and adjust section sizes accordingly: 5347 * Detect special cases and adjust section sizes accordingly:
5353 * 1) .init.* may be embedded into .data sections 5348 * 1) .init.* may be embedded into .data sections
5354 * 2) .init.text.* may be out of [__init_begin, __init_end], 5349 * 2) .init.text.* may be out of [__init_begin, __init_end],
5355 * please refer to arch/tile/kernel/vmlinux.lds.S. 5350 * please refer to arch/tile/kernel/vmlinux.lds.S.
5356 * 3) .rodata.* may be embedded into .text or .data sections. 5351 * 3) .rodata.* may be embedded into .text or .data sections.
5357 */ 5352 */
5358 #define adj_init_size(start, end, size, pos, adj) \ 5353 #define adj_init_size(start, end, size, pos, adj) \
5359 do { \ 5354 do { \
5360 if (start <= pos && pos < end && size > adj) \ 5355 if (start <= pos && pos < end && size > adj) \
5361 size -= adj; \ 5356 size -= adj; \
5362 } while (0) 5357 } while (0)
5363 5358
5364 adj_init_size(__init_begin, __init_end, init_data_size, 5359 adj_init_size(__init_begin, __init_end, init_data_size,
5365 _sinittext, init_code_size); 5360 _sinittext, init_code_size);
5366 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5361 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5367 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5362 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5368 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5363 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5369 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5364 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5370 5365
5371 #undef adj_init_size 5366 #undef adj_init_size
5372 5367
5373 printk("Memory: %luK/%luK available " 5368 printk("Memory: %luK/%luK available "
5374 "(%luK kernel code, %luK rwdata, %luK rodata, " 5369 "(%luK kernel code, %luK rwdata, %luK rodata, "
5375 "%luK init, %luK bss, %luK reserved" 5370 "%luK init, %luK bss, %luK reserved"
5376 #ifdef CONFIG_HIGHMEM 5371 #ifdef CONFIG_HIGHMEM
5377 ", %luK highmem" 5372 ", %luK highmem"
5378 #endif 5373 #endif
5379 "%s%s)\n", 5374 "%s%s)\n",
5380 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5375 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5381 codesize >> 10, datasize >> 10, rosize >> 10, 5376 codesize >> 10, datasize >> 10, rosize >> 10,
5382 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5377 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5383 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5378 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5384 #ifdef CONFIG_HIGHMEM 5379 #ifdef CONFIG_HIGHMEM
5385 totalhigh_pages << (PAGE_SHIFT-10), 5380 totalhigh_pages << (PAGE_SHIFT-10),
5386 #endif 5381 #endif
5387 str ? ", " : "", str ? str : ""); 5382 str ? ", " : "", str ? str : "");
5388 } 5383 }
5389 5384
5390 /** 5385 /**
5391 * set_dma_reserve - set the specified number of pages reserved in the first zone 5386 * set_dma_reserve - set the specified number of pages reserved in the first zone
5392 * @new_dma_reserve: The number of pages to mark reserved 5387 * @new_dma_reserve: The number of pages to mark reserved
5393 * 5388 *
5394 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5389 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5395 * In the DMA zone, a significant percentage may be consumed by kernel image 5390 * In the DMA zone, a significant percentage may be consumed by kernel image
5396 * and other unfreeable allocations which can skew the watermarks badly. This 5391 * and other unfreeable allocations which can skew the watermarks badly. This
5397 * function may optionally be used to account for unfreeable pages in the 5392 * function may optionally be used to account for unfreeable pages in the
5398 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5393 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5399 * smaller per-cpu batchsize. 5394 * smaller per-cpu batchsize.
5400 */ 5395 */
5401 void __init set_dma_reserve(unsigned long new_dma_reserve) 5396 void __init set_dma_reserve(unsigned long new_dma_reserve)
5402 { 5397 {
5403 dma_reserve = new_dma_reserve; 5398 dma_reserve = new_dma_reserve;
5404 } 5399 }
5405 5400
5406 void __init free_area_init(unsigned long *zones_size) 5401 void __init free_area_init(unsigned long *zones_size)
5407 { 5402 {
5408 free_area_init_node(0, zones_size, 5403 free_area_init_node(0, zones_size,
5409 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5404 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5410 } 5405 }
5411 5406
5412 static int page_alloc_cpu_notify(struct notifier_block *self, 5407 static int page_alloc_cpu_notify(struct notifier_block *self,
5413 unsigned long action, void *hcpu) 5408 unsigned long action, void *hcpu)
5414 { 5409 {
5415 int cpu = (unsigned long)hcpu; 5410 int cpu = (unsigned long)hcpu;
5416 5411
5417 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5412 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5418 lru_add_drain_cpu(cpu); 5413 lru_add_drain_cpu(cpu);
5419 drain_pages(cpu); 5414 drain_pages(cpu);
5420 5415
5421 /* 5416 /*
5422 * Spill the event counters of the dead processor 5417 * Spill the event counters of the dead processor
5423 * into the current processor's event counters. 5418 * into the current processor's event counters.
5424 * This artificially elevates the count of the current 5419 * This artificially elevates the count of the current
5425 * processor. 5420 * processor.
5426 */ 5421 */
5427 vm_events_fold_cpu(cpu); 5422 vm_events_fold_cpu(cpu);
5428 5423
5429 /* 5424 /*
5430 * Zero the differential counters of the dead processor 5425 * Zero the differential counters of the dead processor
5431 * so that the vm statistics are consistent. 5426 * so that the vm statistics are consistent.
5432 * 5427 *
5433 * This is only okay since the processor is dead and cannot 5428 * This is only okay since the processor is dead and cannot
5434 * race with what we are doing. 5429 * race with what we are doing.
5435 */ 5430 */
5436 cpu_vm_stats_fold(cpu); 5431 cpu_vm_stats_fold(cpu);
5437 } 5432 }
5438 return NOTIFY_OK; 5433 return NOTIFY_OK;
5439 } 5434 }
5440 5435
5441 void __init page_alloc_init(void) 5436 void __init page_alloc_init(void)
5442 { 5437 {
5443 hotcpu_notifier(page_alloc_cpu_notify, 0); 5438 hotcpu_notifier(page_alloc_cpu_notify, 0);
5444 } 5439 }
5445 5440
5446 /* 5441 /*
5447 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 5442 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
5448 * or min_free_kbytes changes. 5443 * or min_free_kbytes changes.
5449 */ 5444 */
5450 static void calculate_totalreserve_pages(void) 5445 static void calculate_totalreserve_pages(void)
5451 { 5446 {
5452 struct pglist_data *pgdat; 5447 struct pglist_data *pgdat;
5453 unsigned long reserve_pages = 0; 5448 unsigned long reserve_pages = 0;
5454 enum zone_type i, j; 5449 enum zone_type i, j;
5455 5450
5456 for_each_online_pgdat(pgdat) { 5451 for_each_online_pgdat(pgdat) {
5457 for (i = 0; i < MAX_NR_ZONES; i++) { 5452 for (i = 0; i < MAX_NR_ZONES; i++) {
5458 struct zone *zone = pgdat->node_zones + i; 5453 struct zone *zone = pgdat->node_zones + i;
5459 unsigned long max = 0; 5454 unsigned long max = 0;
5460 5455
5461 /* Find valid and maximum lowmem_reserve in the zone */ 5456 /* Find valid and maximum lowmem_reserve in the zone */
5462 for (j = i; j < MAX_NR_ZONES; j++) { 5457 for (j = i; j < MAX_NR_ZONES; j++) {
5463 if (zone->lowmem_reserve[j] > max) 5458 if (zone->lowmem_reserve[j] > max)
5464 max = zone->lowmem_reserve[j]; 5459 max = zone->lowmem_reserve[j];
5465 } 5460 }
5466 5461
5467 /* we treat the high watermark as reserved pages. */ 5462 /* we treat the high watermark as reserved pages. */
5468 max += high_wmark_pages(zone); 5463 max += high_wmark_pages(zone);
5469 5464
5470 if (max > zone->managed_pages) 5465 if (max > zone->managed_pages)
5471 max = zone->managed_pages; 5466 max = zone->managed_pages;
5472 reserve_pages += max; 5467 reserve_pages += max;
5473 /* 5468 /*
5474 * Lowmem reserves are not available to 5469 * Lowmem reserves are not available to
5475 * GFP_HIGHUSER page cache allocations and 5470 * GFP_HIGHUSER page cache allocations and
5476 * kswapd tries to balance zones to their high 5471 * kswapd tries to balance zones to their high
5477 * watermark. As a result, neither should be 5472 * watermark. As a result, neither should be
5478 * regarded as dirtyable memory, to prevent a 5473 * regarded as dirtyable memory, to prevent a
5479 * situation where reclaim has to clean pages 5474 * situation where reclaim has to clean pages
5480 * in order to balance the zones. 5475 * in order to balance the zones.
5481 */ 5476 */
5482 zone->dirty_balance_reserve = max; 5477 zone->dirty_balance_reserve = max;
5483 } 5478 }
5484 } 5479 }
5485 dirty_balance_reserve = reserve_pages; 5480 dirty_balance_reserve = reserve_pages;
5486 totalreserve_pages = reserve_pages; 5481 totalreserve_pages = reserve_pages;
5487 } 5482 }
5488 5483
5489 /* 5484 /*
5490 * setup_per_zone_lowmem_reserve - called whenever 5485 * setup_per_zone_lowmem_reserve - called whenever
5491 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 5486 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
5492 * has a correct pages reserved value, so an adequate number of 5487 * has a correct pages reserved value, so an adequate number of
5493 * pages are left in the zone after a successful __alloc_pages(). 5488 * pages are left in the zone after a successful __alloc_pages().
5494 */ 5489 */
5495 static void setup_per_zone_lowmem_reserve(void) 5490 static void setup_per_zone_lowmem_reserve(void)
5496 { 5491 {
5497 struct pglist_data *pgdat; 5492 struct pglist_data *pgdat;
5498 enum zone_type j, idx; 5493 enum zone_type j, idx;
5499 5494
5500 for_each_online_pgdat(pgdat) { 5495 for_each_online_pgdat(pgdat) {
5501 for (j = 0; j < MAX_NR_ZONES; j++) { 5496 for (j = 0; j < MAX_NR_ZONES; j++) {
5502 struct zone *zone = pgdat->node_zones + j; 5497 struct zone *zone = pgdat->node_zones + j;
5503 unsigned long managed_pages = zone->managed_pages; 5498 unsigned long managed_pages = zone->managed_pages;
5504 5499
5505 zone->lowmem_reserve[j] = 0; 5500 zone->lowmem_reserve[j] = 0;
5506 5501
5507 idx = j; 5502 idx = j;
5508 while (idx) { 5503 while (idx) {
5509 struct zone *lower_zone; 5504 struct zone *lower_zone;
5510 5505
5511 idx--; 5506 idx--;
5512 5507
5513 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5508 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5514 sysctl_lowmem_reserve_ratio[idx] = 1; 5509 sysctl_lowmem_reserve_ratio[idx] = 1;
5515 5510
5516 lower_zone = pgdat->node_zones + idx; 5511 lower_zone = pgdat->node_zones + idx;
5517 lower_zone->lowmem_reserve[j] = managed_pages / 5512 lower_zone->lowmem_reserve[j] = managed_pages /
5518 sysctl_lowmem_reserve_ratio[idx]; 5513 sysctl_lowmem_reserve_ratio[idx];
5519 managed_pages += lower_zone->managed_pages; 5514 managed_pages += lower_zone->managed_pages;
5520 } 5515 }
5521 } 5516 }
5522 } 5517 }
5523 5518
5524 /* update totalreserve_pages */ 5519 /* update totalreserve_pages */
5525 calculate_totalreserve_pages(); 5520 calculate_totalreserve_pages();
5526 } 5521 }
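/*
 * Worked example of the loop above, with assumed (not default) numbers:
 * suppose sysctl_lowmem_reserve_ratio is { 256, 32 } for ZONE_DMA and
 * ZONE_NORMAL, ZONE_HIGHMEM has 800000 managed pages and ZONE_NORMAL has
 * 200000. For j = ZONE_HIGHMEM the downward walk produces:
 *
 *   ZONE_NORMAL->lowmem_reserve[ZONE_HIGHMEM] = 800000 / 32             = 25000 pages
 *   ZONE_DMA->lowmem_reserve[ZONE_HIGHMEM]    = (800000 + 200000) / 256 = 3906 pages
 *
 * i.e. each lower zone holds back progressively more pages from
 * allocations that could have been satisfied by the higher zone.
 */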
5527 5522
5528 static void __setup_per_zone_wmarks(void) 5523 static void __setup_per_zone_wmarks(void)
5529 { 5524 {
5530 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5525 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5531 unsigned long lowmem_pages = 0; 5526 unsigned long lowmem_pages = 0;
5532 struct zone *zone; 5527 struct zone *zone;
5533 unsigned long flags; 5528 unsigned long flags;
5534 5529
5535 /* Calculate total number of !ZONE_HIGHMEM pages */ 5530 /* Calculate total number of !ZONE_HIGHMEM pages */
5536 for_each_zone(zone) { 5531 for_each_zone(zone) {
5537 if (!is_highmem(zone)) 5532 if (!is_highmem(zone))
5538 lowmem_pages += zone->managed_pages; 5533 lowmem_pages += zone->managed_pages;
5539 } 5534 }
5540 5535
5541 for_each_zone(zone) { 5536 for_each_zone(zone) {
5542 u64 tmp; 5537 u64 tmp;
5543 5538
5544 spin_lock_irqsave(&zone->lock, flags); 5539 spin_lock_irqsave(&zone->lock, flags);
5545 tmp = (u64)pages_min * zone->managed_pages; 5540 tmp = (u64)pages_min * zone->managed_pages;
5546 do_div(tmp, lowmem_pages); 5541 do_div(tmp, lowmem_pages);
5547 if (is_highmem(zone)) { 5542 if (is_highmem(zone)) {
5548 /* 5543 /*
5549 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5544 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5550 * need highmem pages, so cap pages_min to a small 5545 * need highmem pages, so cap pages_min to a small
5551 * value here. 5546 * value here.
5552 * 5547 *
5553 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5548 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5554 * deltas control async page reclaim, and so should 5549 * deltas control async page reclaim, and so should
5555 * not be capped for highmem. 5550 * not be capped for highmem.
5556 */ 5551 */
5557 unsigned long min_pages; 5552 unsigned long min_pages;
5558 5553
5559 min_pages = zone->managed_pages / 1024; 5554 min_pages = zone->managed_pages / 1024;
5560 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5555 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5561 zone->watermark[WMARK_MIN] = min_pages; 5556 zone->watermark[WMARK_MIN] = min_pages;
5562 } else { 5557 } else {
5563 /* 5558 /*
5564 * If it's a lowmem zone, reserve a number of pages 5559 * If it's a lowmem zone, reserve a number of pages
5565 * proportionate to the zone's size. 5560 * proportionate to the zone's size.
5566 */ 5561 */
5567 zone->watermark[WMARK_MIN] = tmp; 5562 zone->watermark[WMARK_MIN] = tmp;
5568 } 5563 }
5569 5564
5570 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5565 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5571 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5566 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5572 5567
5573 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5568 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5574 high_wmark_pages(zone) - 5569 high_wmark_pages(zone) -
5575 low_wmark_pages(zone) - 5570 low_wmark_pages(zone) -
5576 zone_page_state(zone, NR_ALLOC_BATCH)); 5571 zone_page_state(zone, NR_ALLOC_BATCH));
5577 5572
5578 setup_zone_migrate_reserve(zone); 5573 setup_zone_migrate_reserve(zone);
5579 spin_unlock_irqrestore(&zone->lock, flags); 5574 spin_unlock_irqrestore(&zone->lock, flags);
5580 } 5575 }
5581 5576
5582 /* update totalreserve_pages */ 5577 /* update totalreserve_pages */
5583 calculate_totalreserve_pages(); 5578 calculate_totalreserve_pages();
5584 } 5579 }
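/*
 * To make the arithmetic above concrete, a worked example under assumed
 * numbers (4 KB pages, min_free_kbytes = 4096, and a lowmem zone that
 * holds half of all lowmem):
 *
 *   pages_min  = 4096 >> (12 - 10)   = 1024 pages
 *   tmp        = 1024 * 1/2          = 512 pages (this zone's share)
 *   WMARK_MIN  = 512
 *   WMARK_LOW  = 512 + (512 >> 2)    = 640 pages
 *   WMARK_HIGH = 512 + (512 >> 1)    = 768 pages
 */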
5585 5580
5586 /** 5581 /**
5587 * setup_per_zone_wmarks - called when min_free_kbytes changes 5582 * setup_per_zone_wmarks - called when min_free_kbytes changes
5588 * or when memory is hot-{added|removed} 5583 * or when memory is hot-{added|removed}
5589 * 5584 *
5590 * Ensures that the watermark[min,low,high] values for each zone are set 5585 * Ensures that the watermark[min,low,high] values for each zone are set
5591 * correctly with respect to min_free_kbytes. 5586 * correctly with respect to min_free_kbytes.
5592 */ 5587 */
5593 void setup_per_zone_wmarks(void) 5588 void setup_per_zone_wmarks(void)
5594 { 5589 {
5595 mutex_lock(&zonelists_mutex); 5590 mutex_lock(&zonelists_mutex);
5596 __setup_per_zone_wmarks(); 5591 __setup_per_zone_wmarks();
5597 mutex_unlock(&zonelists_mutex); 5592 mutex_unlock(&zonelists_mutex);
5598 } 5593 }
5599 5594
5600 /* 5595 /*
5601 * The inactive anon list should be small enough that the VM never has to 5596 * The inactive anon list should be small enough that the VM never has to
5602 * do too much work, but large enough that each inactive page has a chance 5597 * do too much work, but large enough that each inactive page has a chance
5603 * to be referenced again before it is swapped out. 5598 * to be referenced again before it is swapped out.
5604 * 5599 *
5605 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5600 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5606 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5601 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5607 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5602 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5608 * the anonymous pages are kept on the inactive list. 5603 * the anonymous pages are kept on the inactive list.
5609 * 5604 *
5610 * total target max 5605 * total target max
5611 * memory ratio inactive anon 5606 * memory ratio inactive anon
5612 * ------------------------------------- 5607 * -------------------------------------
5613 * 10MB 1 5MB 5608 * 10MB 1 5MB
5614 * 100MB 1 50MB 5609 * 100MB 1 50MB
5615 * 1GB 3 250MB 5610 * 1GB 3 250MB
5616 * 10GB 10 0.9GB 5611 * 10GB 10 0.9GB
5617 * 100GB 31 3GB 5612 * 100GB 31 3GB
5618 * 1TB 101 10GB 5613 * 1TB 101 10GB
5619 * 10TB 320 32GB 5614 * 10TB 320 32GB
5620 */ 5615 */
5621 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5616 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5622 { 5617 {
5623 unsigned int gb, ratio; 5618 unsigned int gb, ratio;
5624 5619
5625 /* Zone size in gigabytes */ 5620 /* Zone size in gigabytes */
5626 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5621 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5627 if (gb) 5622 if (gb)
5628 ratio = int_sqrt(10 * gb); 5623 ratio = int_sqrt(10 * gb);
5629 else 5624 else
5630 ratio = 1; 5625 ratio = 1;
5631 5626
5632 zone->inactive_ratio = ratio; 5627 zone->inactive_ratio = ratio;
5633 } 5628 }
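/*
 * Quick check of the table above with an assumed zone size: a 16 GB zone
 * gives
 *
 *   gb    = 16
 *   ratio = int_sqrt(10 * 16) = int_sqrt(160) = 12   (12*12 = 144 <= 160 < 169)
 *
 * i.e. roughly a 12:1 active:inactive target, keeping about 1/13 of the
 * anonymous pages on the inactive list.
 */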
5634 5629
5635 static void __meminit setup_per_zone_inactive_ratio(void) 5630 static void __meminit setup_per_zone_inactive_ratio(void)
5636 { 5631 {
5637 struct zone *zone; 5632 struct zone *zone;
5638 5633
5639 for_each_zone(zone) 5634 for_each_zone(zone)
5640 calculate_zone_inactive_ratio(zone); 5635 calculate_zone_inactive_ratio(zone);
5641 } 5636 }
5642 5637
5643 /* 5638 /*
5644 * Initialise min_free_kbytes. 5639 * Initialise min_free_kbytes.
5645 * 5640 *
5646 * For small machines we want it small (128k min). For large machines 5641 * For small machines we want it small (128k min). For large machines
5647 * we want it large (64MB max). But it is not linear, because network 5642 * we want it large (64MB max). But it is not linear, because network
5648 * bandwidth does not increase linearly with machine size. We use 5643 * bandwidth does not increase linearly with machine size. We use
5649 * 5644 *
5650 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5645 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5651 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5646 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5652 * 5647 *
5653 * which yields 5648 * which yields
5654 * 5649 *
5655 * 16MB: 512k 5650 * 16MB: 512k
5656 * 32MB: 724k 5651 * 32MB: 724k
5657 * 64MB: 1024k 5652 * 64MB: 1024k
5658 * 128MB: 1448k 5653 * 128MB: 1448k
5659 * 256MB: 2048k 5654 * 256MB: 2048k
5660 * 512MB: 2896k 5655 * 512MB: 2896k
5661 * 1024MB: 4096k 5656 * 1024MB: 4096k
5662 * 2048MB: 5792k 5657 * 2048MB: 5792k
5663 * 4096MB: 8192k 5658 * 4096MB: 8192k
5664 * 8192MB: 11584k 5659 * 8192MB: 11584k
5665 * 16384MB: 16384k 5660 * 16384MB: 16384k
5666 */ 5661 */
5667 int __meminit init_per_zone_wmark_min(void) 5662 int __meminit init_per_zone_wmark_min(void)
5668 { 5663 {
5669 unsigned long lowmem_kbytes; 5664 unsigned long lowmem_kbytes;
5670 int new_min_free_kbytes; 5665 int new_min_free_kbytes;
5671 5666
5672 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5667 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5673 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5668 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5674 5669
5675 if (new_min_free_kbytes > user_min_free_kbytes) { 5670 if (new_min_free_kbytes > user_min_free_kbytes) {
5676 min_free_kbytes = new_min_free_kbytes; 5671 min_free_kbytes = new_min_free_kbytes;
5677 if (min_free_kbytes < 128) 5672 if (min_free_kbytes < 128)
5678 min_free_kbytes = 128; 5673 min_free_kbytes = 128;
5679 if (min_free_kbytes > 65536) 5674 if (min_free_kbytes > 65536)
5680 min_free_kbytes = 65536; 5675 min_free_kbytes = 65536;
5681 } else { 5676 } else {
5682 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5677 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5683 new_min_free_kbytes, user_min_free_kbytes); 5678 new_min_free_kbytes, user_min_free_kbytes);
5684 } 5679 }
5685 setup_per_zone_wmarks(); 5680 setup_per_zone_wmarks();
5686 refresh_zone_stat_thresholds(); 5681 refresh_zone_stat_thresholds();
5687 setup_per_zone_lowmem_reserve(); 5682 setup_per_zone_lowmem_reserve();
5688 setup_per_zone_inactive_ratio(); 5683 setup_per_zone_inactive_ratio();
5689 return 0; 5684 return 0;
5690 } 5685 }
5691 module_init(init_per_zone_wmark_min) 5686 module_init(init_per_zone_wmark_min)
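/*
 * Checking the formula against the table above with an assumed 2 GB of
 * lowmem:
 *
 *   lowmem_kbytes   = 2048 * 1024              = 2097152
 *   min_free_kbytes = int_sqrt(2097152 * 16)   = int_sqrt(33554432) = 5792
 *
 * which matches the 2048MB row and falls inside the [128, 65536] clamp
 * applied above.
 */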
5692 5687
5693 /* 5688 /*
5694 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5689 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5695 * that we can call two helper functions whenever min_free_kbytes 5690 * that we can call two helper functions whenever min_free_kbytes
5696 * changes. 5691 * changes.
5697 */ 5692 */
5698 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5693 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5699 void __user *buffer, size_t *length, loff_t *ppos) 5694 void __user *buffer, size_t *length, loff_t *ppos)
5700 { 5695 {
5701 proc_dointvec(table, write, buffer, length, ppos); 5696 proc_dointvec(table, write, buffer, length, ppos);
5702 if (write) { 5697 if (write) {
5703 user_min_free_kbytes = min_free_kbytes; 5698 user_min_free_kbytes = min_free_kbytes;
5704 setup_per_zone_wmarks(); 5699 setup_per_zone_wmarks();
5705 } 5700 }
5706 return 0; 5701 return 0;
5707 } 5702 }
5708 5703
5709 #ifdef CONFIG_NUMA 5704 #ifdef CONFIG_NUMA
5710 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5705 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5711 void __user *buffer, size_t *length, loff_t *ppos) 5706 void __user *buffer, size_t *length, loff_t *ppos)
5712 { 5707 {
5713 struct zone *zone; 5708 struct zone *zone;
5714 int rc; 5709 int rc;
5715 5710
5716 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5711 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5717 if (rc) 5712 if (rc)
5718 return rc; 5713 return rc;
5719 5714
5720 for_each_zone(zone) 5715 for_each_zone(zone)
5721 zone->min_unmapped_pages = (zone->managed_pages * 5716 zone->min_unmapped_pages = (zone->managed_pages *
5722 sysctl_min_unmapped_ratio) / 100; 5717 sysctl_min_unmapped_ratio) / 100;
5723 return 0; 5718 return 0;
5724 } 5719 }
5725 5720
5726 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5721 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5727 void __user *buffer, size_t *length, loff_t *ppos) 5722 void __user *buffer, size_t *length, loff_t *ppos)
5728 { 5723 {
5729 struct zone *zone; 5724 struct zone *zone;
5730 int rc; 5725 int rc;
5731 5726
5732 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5727 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5733 if (rc) 5728 if (rc)
5734 return rc; 5729 return rc;
5735 5730
5736 for_each_zone(zone) 5731 for_each_zone(zone)
5737 zone->min_slab_pages = (zone->managed_pages * 5732 zone->min_slab_pages = (zone->managed_pages *
5738 sysctl_min_slab_ratio) / 100; 5733 sysctl_min_slab_ratio) / 100;
5739 return 0; 5734 return 0;
5740 } 5735 }
5741 #endif 5736 #endif
5742 5737
5743 /* 5738 /*
5744 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5739 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5745 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5740 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5746 * whenever sysctl_lowmem_reserve_ratio changes. 5741 * whenever sysctl_lowmem_reserve_ratio changes.
5747 * 5742 *
5748 * The reserve ratio obviously has absolutely no relation to the 5743 * The reserve ratio obviously has absolutely no relation to the
5749 * minimum watermarks. The lowmem reserve ratio can only make sense 5744 * minimum watermarks. The lowmem reserve ratio can only make sense
5750 * as a function of the boot-time zone sizes. 5745 * as a function of the boot-time zone sizes.
5751 */ 5746 */
5752 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5747 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5753 void __user *buffer, size_t *length, loff_t *ppos) 5748 void __user *buffer, size_t *length, loff_t *ppos)
5754 { 5749 {
5755 proc_dointvec_minmax(table, write, buffer, length, ppos); 5750 proc_dointvec_minmax(table, write, buffer, length, ppos);
5756 setup_per_zone_lowmem_reserve(); 5751 setup_per_zone_lowmem_reserve();
5757 return 0; 5752 return 0;
5758 } 5753 }
5759 5754
5760 /* 5755 /*
5761 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5756 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5762 * cpu. It is the fraction of total pages in each zone that a hot per cpu 5757 * cpu. It is the fraction of total pages in each zone that a hot per cpu
5763 * pagelist can hold before it gets flushed back to the buddy allocator. 5758 * pagelist can hold before it gets flushed back to the buddy allocator.
5764 */ 5759 */
5765 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5760 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5766 void __user *buffer, size_t *length, loff_t *ppos) 5761 void __user *buffer, size_t *length, loff_t *ppos)
5767 { 5762 {
5768 struct zone *zone; 5763 struct zone *zone;
5769 unsigned int cpu; 5764 unsigned int cpu;
5770 int ret; 5765 int ret;
5771 5766
5772 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5767 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5773 if (!write || (ret < 0)) 5768 if (!write || (ret < 0))
5774 return ret; 5769 return ret;
5775 5770
5776 mutex_lock(&pcp_batch_high_lock); 5771 mutex_lock(&pcp_batch_high_lock);
5777 for_each_populated_zone(zone) { 5772 for_each_populated_zone(zone) {
5778 unsigned long high; 5773 unsigned long high;
5779 high = zone->managed_pages / percpu_pagelist_fraction; 5774 high = zone->managed_pages / percpu_pagelist_fraction;
5780 for_each_possible_cpu(cpu) 5775 for_each_possible_cpu(cpu)
5781 pageset_set_high(per_cpu_ptr(zone->pageset, cpu), 5776 pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
5782 high); 5777 high);
5783 } 5778 }
5784 mutex_unlock(&pcp_batch_high_lock); 5779 mutex_unlock(&pcp_batch_high_lock);
5785 return 0; 5780 return 0;
5786 } 5781 }
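/*
 * For example (assumed value, not a default): writing 100 to
 * /proc/sys/vm/percpu_pagelist_fraction on a zone with 1048576 managed
 * pages (4 GB of 4 KB pages) gives
 *
 *   pcp->high = 1048576 / 100 = 10485 pages   (~41 MB per CPU before the
 *                                               list is drained to buddy)
 */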
5787 5782
5788 int hashdist = HASHDIST_DEFAULT; 5783 int hashdist = HASHDIST_DEFAULT;
5789 5784
5790 #ifdef CONFIG_NUMA 5785 #ifdef CONFIG_NUMA
5791 static int __init set_hashdist(char *str) 5786 static int __init set_hashdist(char *str)
5792 { 5787 {
5793 if (!str) 5788 if (!str)
5794 return 0; 5789 return 0;
5795 hashdist = simple_strtoul(str, &str, 0); 5790 hashdist = simple_strtoul(str, &str, 0);
5796 return 1; 5791 return 1;
5797 } 5792 }
5798 __setup("hashdist=", set_hashdist); 5793 __setup("hashdist=", set_hashdist);
5799 #endif 5794 #endif
5800 5795
5801 /* 5796 /*
5802 * allocate a large system hash table from bootmem 5797 * allocate a large system hash table from bootmem
5803 * - it is assumed that the hash table must contain an exact power-of-2 5798 * - it is assumed that the hash table must contain an exact power-of-2
5804 * quantity of entries 5799 * quantity of entries
5805 * - limit is the number of hash buckets, not the total allocation size 5800 * - limit is the number of hash buckets, not the total allocation size
5806 */ 5801 */
5807 void *__init alloc_large_system_hash(const char *tablename, 5802 void *__init alloc_large_system_hash(const char *tablename,
5808 unsigned long bucketsize, 5803 unsigned long bucketsize,
5809 unsigned long numentries, 5804 unsigned long numentries,
5810 int scale, 5805 int scale,
5811 int flags, 5806 int flags,
5812 unsigned int *_hash_shift, 5807 unsigned int *_hash_shift,
5813 unsigned int *_hash_mask, 5808 unsigned int *_hash_mask,
5814 unsigned long low_limit, 5809 unsigned long low_limit,
5815 unsigned long high_limit) 5810 unsigned long high_limit)
5816 { 5811 {
5817 unsigned long long max = high_limit; 5812 unsigned long long max = high_limit;
5818 unsigned long log2qty, size; 5813 unsigned long log2qty, size;
5819 void *table = NULL; 5814 void *table = NULL;
5820 5815
5821 /* allow the kernel cmdline to have a say */ 5816 /* allow the kernel cmdline to have a say */
5822 if (!numentries) { 5817 if (!numentries) {
5823 /* round applicable memory size up to nearest megabyte */ 5818 /* round applicable memory size up to nearest megabyte */
5824 numentries = nr_kernel_pages; 5819 numentries = nr_kernel_pages;
5825 5820
5826 /* It isn't necessary when PAGE_SIZE >= 1MB */ 5821 /* It isn't necessary when PAGE_SIZE >= 1MB */
5827 if (PAGE_SHIFT < 20) 5822 if (PAGE_SHIFT < 20)
5828 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5823 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5829 5824
5830 /* limit to 1 bucket per 2^scale bytes of low memory */ 5825 /* limit to 1 bucket per 2^scale bytes of low memory */
5831 if (scale > PAGE_SHIFT) 5826 if (scale > PAGE_SHIFT)
5832 numentries >>= (scale - PAGE_SHIFT); 5827 numentries >>= (scale - PAGE_SHIFT);
5833 else 5828 else
5834 numentries <<= (PAGE_SHIFT - scale); 5829 numentries <<= (PAGE_SHIFT - scale);
5835 5830
5836 /* Make sure we've got at least a 0-order allocation.. */ 5831 /* Make sure we've got at least a 0-order allocation.. */
5837 if (unlikely(flags & HASH_SMALL)) { 5832 if (unlikely(flags & HASH_SMALL)) {
5838 /* Makes no sense without HASH_EARLY */ 5833 /* Makes no sense without HASH_EARLY */
5839 WARN_ON(!(flags & HASH_EARLY)); 5834 WARN_ON(!(flags & HASH_EARLY));
5840 if (!(numentries >> *_hash_shift)) { 5835 if (!(numentries >> *_hash_shift)) {
5841 numentries = 1UL << *_hash_shift; 5836 numentries = 1UL << *_hash_shift;
5842 BUG_ON(!numentries); 5837 BUG_ON(!numentries);
5843 } 5838 }
5844 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5839 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5845 numentries = PAGE_SIZE / bucketsize; 5840 numentries = PAGE_SIZE / bucketsize;
5846 } 5841 }
5847 numentries = roundup_pow_of_two(numentries); 5842 numentries = roundup_pow_of_two(numentries);
5848 5843
5849 /* limit allocation size to 1/16 total memory by default */ 5844 /* limit allocation size to 1/16 total memory by default */
5850 if (max == 0) { 5845 if (max == 0) {
5851 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5846 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5852 do_div(max, bucketsize); 5847 do_div(max, bucketsize);
5853 } 5848 }
5854 max = min(max, 0x80000000ULL); 5849 max = min(max, 0x80000000ULL);
5855 5850
5856 if (numentries < low_limit) 5851 if (numentries < low_limit)
5857 numentries = low_limit; 5852 numentries = low_limit;
5858 if (numentries > max) 5853 if (numentries > max)
5859 numentries = max; 5854 numentries = max;
5860 5855
5861 log2qty = ilog2(numentries); 5856 log2qty = ilog2(numentries);
5862 5857
5863 do { 5858 do {
5864 size = bucketsize << log2qty; 5859 size = bucketsize << log2qty;
5865 if (flags & HASH_EARLY) 5860 if (flags & HASH_EARLY)
5866 table = alloc_bootmem_nopanic(size); 5861 table = alloc_bootmem_nopanic(size);
5867 else if (hashdist) 5862 else if (hashdist)
5868 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5863 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5869 else { 5864 else {
5870 /* 5865 /*
5871 * If bucketsize is not a power-of-two, we may free 5866 * If bucketsize is not a power-of-two, we may free
5872 * some pages at the end of the hash table, which 5867 * some pages at the end of the hash table, which
5873 * alloc_pages_exact() does automatically. 5868 * alloc_pages_exact() does automatically.
5874 */ 5869 */
5875 if (get_order(size) < MAX_ORDER) { 5870 if (get_order(size) < MAX_ORDER) {
5876 table = alloc_pages_exact(size, GFP_ATOMIC); 5871 table = alloc_pages_exact(size, GFP_ATOMIC);
5877 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5872 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5878 } 5873 }
5879 } 5874 }
5880 } while (!table && size > PAGE_SIZE && --log2qty); 5875 } while (!table && size > PAGE_SIZE && --log2qty);
5881 5876
5882 if (!table) 5877 if (!table)
5883 panic("Failed to allocate %s hash table\n", tablename); 5878 panic("Failed to allocate %s hash table\n", tablename);
5884 5879
5885 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5880 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5886 tablename, 5881 tablename,
5887 (1UL << log2qty), 5882 (1UL << log2qty),
5888 ilog2(size) - PAGE_SHIFT, 5883 ilog2(size) - PAGE_SHIFT,
5889 size); 5884 size);
5890 5885
5891 if (_hash_shift) 5886 if (_hash_shift)
5892 *_hash_shift = log2qty; 5887 *_hash_shift = log2qty;
5893 if (_hash_mask) 5888 if (_hash_mask)
5894 *_hash_mask = (1 << log2qty) - 1; 5889 *_hash_mask = (1 << log2qty) - 1;
5895 5890
5896 return table; 5891 return table;
5897 } 5892 }
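/*
 * A minimal sketch of a caller; every name below (the table name, the
 * variables, the scale value) is an assumption made for this example
 * rather than something defined in this file.
 */
static struct hlist_head *example_hashtable;
static unsigned int example_hash_shift;
static unsigned int example_hash_mask;

static void __init example_hash_init(void)
{
	/*
	 * scale = 14: at most one bucket per 16 KB of low memory;
	 * numentries = 0 lets the function size the table from RAM.
	 */
	example_hashtable = alloc_large_system_hash("Example-cache",
					sizeof(struct hlist_head),
					0, 14, 0,
					&example_hash_shift,
					&example_hash_mask,
					0, 0);
}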
5898 5893
5899 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5894 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5900 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5895 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5901 unsigned long pfn) 5896 unsigned long pfn)
5902 { 5897 {
5903 #ifdef CONFIG_SPARSEMEM 5898 #ifdef CONFIG_SPARSEMEM
5904 return __pfn_to_section(pfn)->pageblock_flags; 5899 return __pfn_to_section(pfn)->pageblock_flags;
5905 #else 5900 #else
5906 return zone->pageblock_flags; 5901 return zone->pageblock_flags;
5907 #endif /* CONFIG_SPARSEMEM */ 5902 #endif /* CONFIG_SPARSEMEM */
5908 } 5903 }
5909 5904
5910 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5905 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5911 { 5906 {
5912 #ifdef CONFIG_SPARSEMEM 5907 #ifdef CONFIG_SPARSEMEM
5913 pfn &= (PAGES_PER_SECTION-1); 5908 pfn &= (PAGES_PER_SECTION-1);
5914 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5909 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5915 #else 5910 #else
5916 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 5911 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5917 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5912 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5918 #endif /* CONFIG_SPARSEMEM */ 5913 #endif /* CONFIG_SPARSEMEM */
5919 } 5914 }
5920 5915
5921 /** 5916 /**
5922 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 5917 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
5923 * @page: The page within the block of interest 5918 * @page: The page within the block of interest
5924 * @start_bitidx: The first bit of interest to retrieve 5919 * @start_bitidx: The first bit of interest to retrieve
5925 * @end_bitidx: The last bit of interest 5920 * @end_bitidx: The last bit of interest
5926 * returns pageblock_bits flags 5921 * returns pageblock_bits flags
5927 */ 5922 */
5928 unsigned long get_pageblock_flags_group(struct page *page, 5923 unsigned long get_pageblock_flags_group(struct page *page,
5929 int start_bitidx, int end_bitidx) 5924 int start_bitidx, int end_bitidx)
5930 { 5925 {
5931 struct zone *zone; 5926 struct zone *zone;
5932 unsigned long *bitmap; 5927 unsigned long *bitmap;
5933 unsigned long pfn, bitidx; 5928 unsigned long pfn, bitidx;
5934 unsigned long flags = 0; 5929 unsigned long flags = 0;
5935 unsigned long value = 1; 5930 unsigned long value = 1;
5936 5931
5937 zone = page_zone(page); 5932 zone = page_zone(page);
5938 pfn = page_to_pfn(page); 5933 pfn = page_to_pfn(page);
5939 bitmap = get_pageblock_bitmap(zone, pfn); 5934 bitmap = get_pageblock_bitmap(zone, pfn);
5940 bitidx = pfn_to_bitidx(zone, pfn); 5935 bitidx = pfn_to_bitidx(zone, pfn);
5941 5936
5942 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5937 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5943 if (test_bit(bitidx + start_bitidx, bitmap)) 5938 if (test_bit(bitidx + start_bitidx, bitmap))
5944 flags |= value; 5939 flags |= value;
5945 5940
5946 return flags; 5941 return flags;
5947 } 5942 }
5948 5943
5949 /** 5944 /**
5950 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 5945 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
5951 * @page: The page within the block of interest 5946 * @page: The page within the block of interest
5952 * @start_bitidx: The first bit of interest 5947 * @start_bitidx: The first bit of interest
5953 * @end_bitidx: The last bit of interest 5948 * @end_bitidx: The last bit of interest
5954 * @flags: The flags to set 5949 * @flags: The flags to set
5955 */ 5950 */
5956 void set_pageblock_flags_group(struct page *page, unsigned long flags, 5951 void set_pageblock_flags_group(struct page *page, unsigned long flags,
5957 int start_bitidx, int end_bitidx) 5952 int start_bitidx, int end_bitidx)
5958 { 5953 {
5959 struct zone *zone; 5954 struct zone *zone;
5960 unsigned long *bitmap; 5955 unsigned long *bitmap;
5961 unsigned long pfn, bitidx; 5956 unsigned long pfn, bitidx;
5962 unsigned long value = 1; 5957 unsigned long value = 1;
5963 5958
5964 zone = page_zone(page); 5959 zone = page_zone(page);
5965 pfn = page_to_pfn(page); 5960 pfn = page_to_pfn(page);
5966 bitmap = get_pageblock_bitmap(zone, pfn); 5961 bitmap = get_pageblock_bitmap(zone, pfn);
5967 bitidx = pfn_to_bitidx(zone, pfn); 5962 bitidx = pfn_to_bitidx(zone, pfn);
5968 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 5963 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
5969 5964
5970 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5965 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5971 if (flags & value) 5966 if (flags & value)
5972 __set_bit(bitidx + start_bitidx, bitmap); 5967 __set_bit(bitidx + start_bitidx, bitmap);
5973 else 5968 else
5974 __clear_bit(bitidx + start_bitidx, bitmap); 5969 __clear_bit(bitidx + start_bitidx, bitmap);
5975 } 5970 }
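/*
 * These two accessors are what the migratetype helpers elsewhere in the
 * tree are built on. A sketch, using the PB_migrate/PB_migrate_end bit
 * indices assumed to be defined in pageblock-flags.h:
 */
static inline int example_get_migratetype(struct page *page)
{
	return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end);
}

static inline void example_set_migratetype(struct page *page, int migratetype)
{
	set_pageblock_flags_group(page, (unsigned long)migratetype,
				  PB_migrate, PB_migrate_end);
}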
5976 5971
5977 /* 5972 /*
5978 * This function checks whether pageblock includes unmovable pages or not. 5973 * This function checks whether pageblock includes unmovable pages or not.
5979 * If @count is not zero, it is okay to include up to @count unmovable pages 5974 * If @count is not zero, it is okay to include up to @count unmovable pages
5980 * 5975 *
5981 * PageLRU check without isolation or lru_lock could race so that 5976 * PageLRU check without isolation or lru_lock could race so that
5982 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5977 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5983 * expect this function to be exact. 5978 * expect this function to be exact.
5984 */ 5979 */
5985 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 5980 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5986 bool skip_hwpoisoned_pages) 5981 bool skip_hwpoisoned_pages)
5987 { 5982 {
5988 unsigned long pfn, iter, found; 5983 unsigned long pfn, iter, found;
5989 int mt; 5984 int mt;
5990 5985
5991 /* 5986 /*
5992 * To avoid noisy data, lru_add_drain_all() should be called first. 5987 * To avoid noisy data, lru_add_drain_all() should be called first.
5993 * If the zone is ZONE_MOVABLE, it never contains unmovable pages. 5988 * If the zone is ZONE_MOVABLE, it never contains unmovable pages.
5994 */ 5989 */
5995 if (zone_idx(zone) == ZONE_MOVABLE) 5990 if (zone_idx(zone) == ZONE_MOVABLE)
5996 return false; 5991 return false;
5997 mt = get_pageblock_migratetype(page); 5992 mt = get_pageblock_migratetype(page);
5998 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5993 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5999 return false; 5994 return false;
6000 5995
6001 pfn = page_to_pfn(page); 5996 pfn = page_to_pfn(page);
6002 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5997 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6003 unsigned long check = pfn + iter; 5998 unsigned long check = pfn + iter;
6004 5999
6005 if (!pfn_valid_within(check)) 6000 if (!pfn_valid_within(check))
6006 continue; 6001 continue;
6007 6002
6008 page = pfn_to_page(check); 6003 page = pfn_to_page(check);
6009 6004
6010 /* 6005 /*
6011 * Hugepages are not in LRU lists, but they're movable. 6006 * Hugepages are not in LRU lists, but they're movable.
6012 * We need not scan over tail pages because we don't 6007 * We need not scan over tail pages because we don't
6013 * handle each tail page individually in migration. 6008 * handle each tail page individually in migration.
6014 */ 6009 */
6015 if (PageHuge(page)) { 6010 if (PageHuge(page)) {
6016 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6011 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6017 continue; 6012 continue;
6018 } 6013 }
6019 6014
6020 /* 6015 /*
6021 * We can't use page_count() without pinning the page 6016 * We can't use page_count() without pinning the page
6022 * because another CPU can free the compound page. 6017 * because another CPU can free the compound page.
6023 * This check already skips compound tails of THP 6018 * This check already skips compound tails of THP
6024 * because their page->_count is zero at all times. 6019 * because their page->_count is zero at all times.
6025 */ 6020 */
6026 if (!atomic_read(&page->_count)) { 6021 if (!atomic_read(&page->_count)) {
6027 if (PageBuddy(page)) 6022 if (PageBuddy(page))
6028 iter += (1 << page_order(page)) - 1; 6023 iter += (1 << page_order(page)) - 1;
6029 continue; 6024 continue;
6030 } 6025 }
6031 6026
6032 /* 6027 /*
6033 * The HWPoisoned page may be not in buddy system, and 6028 * The HWPoisoned page may be not in buddy system, and
6034 * page_count() is not 0. 6029 * page_count() is not 0.
6035 */ 6030 */
6036 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6031 if (skip_hwpoisoned_pages && PageHWPoison(page))
6037 continue; 6032 continue;
6038 6033
6039 if (!PageLRU(page)) 6034 if (!PageLRU(page))
6040 found++; 6035 found++;
6041 /* 6036 /*
6042 * If there are RECLAIMABLE pages, we need to check them. 6037 * If there are RECLAIMABLE pages, we need to check them.
6043 * But for now, memory offline itself doesn't call shrink_slab() 6038 * But for now, memory offline itself doesn't call shrink_slab()
6044 * and this still needs to be fixed. 6039 * and this still needs to be fixed.
6045 */ 6040 */
6046 /* 6041 /*
6047 * If the page is not RAM, page_count() should be 0. 6042 * If the page is not RAM, page_count() should be 0.
6048 * We don't need further checks. This is a _used_, non-movable page. 6043 * We don't need further checks. This is a _used_, non-movable page.
6049 * 6044 *
6050 * The problematic thing here is PG_reserved pages. PG_reserved 6045 * The problematic thing here is PG_reserved pages. PG_reserved
6051 * is set to both of a memory hole page and a _used_ kernel 6046 * is set to both of a memory hole page and a _used_ kernel
6052 * page at boot. 6047 * page at boot.
6053 */ 6048 */
6054 if (found > count) 6049 if (found > count)
6055 return true; 6050 return true;
6056 } 6051 }
6057 return false; 6052 return false;
6058 } 6053 }
6059 6054
6060 bool is_pageblock_removable_nolock(struct page *page) 6055 bool is_pageblock_removable_nolock(struct page *page)
6061 { 6056 {
6062 struct zone *zone; 6057 struct zone *zone;
6063 unsigned long pfn; 6058 unsigned long pfn;
6064 6059
6065 /* 6060 /*
6066 * We have to be careful here because we are iterating over memory 6061 * We have to be careful here because we are iterating over memory
6067 * sections which are not zone aware so we might end up outside of 6062 * sections which are not zone aware so we might end up outside of
6068 * the zone but still within the section. 6063 * the zone but still within the section.
6069 * We have to take care about the node as well. If the node is offline 6064 * We have to take care about the node as well. If the node is offline
6070 * its NODE_DATA will be NULL - see page_zone. 6065 * its NODE_DATA will be NULL - see page_zone.
6071 */ 6066 */
6072 if (!node_online(page_to_nid(page))) 6067 if (!node_online(page_to_nid(page)))
6073 return false; 6068 return false;
6074 6069
6075 zone = page_zone(page); 6070 zone = page_zone(page);
6076 pfn = page_to_pfn(page); 6071 pfn = page_to_pfn(page);
6077 if (!zone_spans_pfn(zone, pfn)) 6072 if (!zone_spans_pfn(zone, pfn))
6078 return false; 6073 return false;
6079 6074
6080 return !has_unmovable_pages(zone, page, 0, true); 6075 return !has_unmovable_pages(zone, page, 0, true);
6081 } 6076 }
6082 6077
6083 #ifdef CONFIG_CMA 6078 #ifdef CONFIG_CMA
6084 6079
6085 static unsigned long pfn_max_align_down(unsigned long pfn) 6080 static unsigned long pfn_max_align_down(unsigned long pfn)
6086 { 6081 {
6087 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6082 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6088 pageblock_nr_pages) - 1); 6083 pageblock_nr_pages) - 1);
6089 } 6084 }
6090 6085
6091 static unsigned long pfn_max_align_up(unsigned long pfn) 6086 static unsigned long pfn_max_align_up(unsigned long pfn)
6092 { 6087 {
6093 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6088 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6094 pageblock_nr_pages)); 6089 pageblock_nr_pages));
6095 } 6090 }
6096 6091
6097 /* [start, end) must belong to a single zone. */ 6092 /* [start, end) must belong to a single zone. */
6098 static int __alloc_contig_migrate_range(struct compact_control *cc, 6093 static int __alloc_contig_migrate_range(struct compact_control *cc,
6099 unsigned long start, unsigned long end) 6094 unsigned long start, unsigned long end)
6100 { 6095 {
6101 /* This function is based on compact_zone() from compaction.c. */ 6096 /* This function is based on compact_zone() from compaction.c. */
6102 unsigned long nr_reclaimed; 6097 unsigned long nr_reclaimed;
6103 unsigned long pfn = start; 6098 unsigned long pfn = start;
6104 unsigned int tries = 0; 6099 unsigned int tries = 0;
6105 int ret = 0; 6100 int ret = 0;
6106 6101
6107 migrate_prep(); 6102 migrate_prep();
6108 6103
6109 while (pfn < end || !list_empty(&cc->migratepages)) { 6104 while (pfn < end || !list_empty(&cc->migratepages)) {
6110 if (fatal_signal_pending(current)) { 6105 if (fatal_signal_pending(current)) {
6111 ret = -EINTR; 6106 ret = -EINTR;
6112 break; 6107 break;
6113 } 6108 }
6114 6109
6115 if (list_empty(&cc->migratepages)) { 6110 if (list_empty(&cc->migratepages)) {
6116 cc->nr_migratepages = 0; 6111 cc->nr_migratepages = 0;
6117 pfn = isolate_migratepages_range(cc->zone, cc, 6112 pfn = isolate_migratepages_range(cc->zone, cc,
6118 pfn, end, true); 6113 pfn, end, true);
6119 if (!pfn) { 6114 if (!pfn) {
6120 ret = -EINTR; 6115 ret = -EINTR;
6121 break; 6116 break;
6122 } 6117 }
6123 tries = 0; 6118 tries = 0;
6124 } else if (++tries == 5) { 6119 } else if (++tries == 5) {
6125 ret = ret < 0 ? ret : -EBUSY; 6120 ret = ret < 0 ? ret : -EBUSY;
6126 break; 6121 break;
6127 } 6122 }
6128 6123
6129 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6124 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6130 &cc->migratepages); 6125 &cc->migratepages);
6131 cc->nr_migratepages -= nr_reclaimed; 6126 cc->nr_migratepages -= nr_reclaimed;
6132 6127
6133 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6128 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6134 0, MIGRATE_SYNC, MR_CMA); 6129 0, MIGRATE_SYNC, MR_CMA);
6135 } 6130 }
6136 if (ret < 0) { 6131 if (ret < 0) {
6137 putback_movable_pages(&cc->migratepages); 6132 putback_movable_pages(&cc->migratepages);
6138 return ret; 6133 return ret;
6139 } 6134 }
6140 return 0; 6135 return 0;
6141 } 6136 }
6142 6137
6143 /** 6138 /**
6144 * alloc_contig_range() -- tries to allocate given range of pages 6139 * alloc_contig_range() -- tries to allocate given range of pages
6145 * @start: start PFN to allocate 6140 * @start: start PFN to allocate
6146 * @end: one-past-the-last PFN to allocate 6141 * @end: one-past-the-last PFN to allocate
6147 * @migratetype: migratetype of the underlying pageblocks (either 6142 * @migratetype: migratetype of the underlying pageblocks (either
6148 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6143 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6149 * in range must have the same migratetype and it must 6144 * in range must have the same migratetype and it must
6150 * be either of the two. 6145 * be either of the two.
6151 * 6146 *
6152 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6147 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6153 * aligned, however it's the caller's responsibility to guarantee that 6148 * aligned, however it's the caller's responsibility to guarantee that
6154 * we are the only thread that changes migrate type of pageblocks the 6149 * we are the only thread that changes migrate type of pageblocks the
6155 * pages fall in. 6150 * pages fall in.
6156 * 6151 *
6157 * The PFN range must belong to a single zone. 6152 * The PFN range must belong to a single zone.
6158 * 6153 *
6159 * Returns zero on success or negative error code. On success all 6154 * Returns zero on success or negative error code. On success all
6160 * pages which PFN is in [start, end) are allocated for the caller and 6155 * pages which PFN is in [start, end) are allocated for the caller and
6161 * need to be freed with free_contig_range(). 6156 * need to be freed with free_contig_range().
6162 */ 6157 */
6163 int alloc_contig_range(unsigned long start, unsigned long end, 6158 int alloc_contig_range(unsigned long start, unsigned long end,
6164 unsigned migratetype) 6159 unsigned migratetype)
6165 { 6160 {
6166 unsigned long outer_start, outer_end; 6161 unsigned long outer_start, outer_end;
6167 int ret = 0, order; 6162 int ret = 0, order;
6168 6163
6169 struct compact_control cc = { 6164 struct compact_control cc = {
6170 .nr_migratepages = 0, 6165 .nr_migratepages = 0,
6171 .order = -1, 6166 .order = -1,
6172 .zone = page_zone(pfn_to_page(start)), 6167 .zone = page_zone(pfn_to_page(start)),
6173 .sync = true, 6168 .sync = true,
6174 .ignore_skip_hint = true, 6169 .ignore_skip_hint = true,
6175 }; 6170 };
6176 INIT_LIST_HEAD(&cc.migratepages); 6171 INIT_LIST_HEAD(&cc.migratepages);
6177 6172
6178 /* 6173 /*
6179 * What we do here is we mark all pageblocks in range as 6174 * What we do here is we mark all pageblocks in range as
6180 * MIGRATE_ISOLATE. Because pageblock and max order pages may 6175 * MIGRATE_ISOLATE. Because pageblock and max order pages may
6181 * have different sizes, and due to the way the page allocator 6176 * have different sizes, and due to the way the page allocator
6182 * works, we align the range to the bigger of the two sizes so 6177 * works, we align the range to the bigger of the two sizes so
6183 * that the page allocator won't try to merge buddies from 6178 * that the page allocator won't try to merge buddies from
6184 * different pageblocks and change MIGRATE_ISOLATE to some 6179 * different pageblocks and change MIGRATE_ISOLATE to some
6185 * other migration type. 6180 * other migration type.
6186 * 6181 *
6187 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6182 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6188 * migrate the pages from an unaligned range (ie. pages that 6183 * migrate the pages from an unaligned range (ie. pages that
6189 * we are interested in). This will put all the pages in 6184 * we are interested in). This will put all the pages in
6190 * range back to page allocator as MIGRATE_ISOLATE. 6185 * range back to page allocator as MIGRATE_ISOLATE.
6191 * 6186 *
6192 * When this is done, we take the pages in range from page 6187 * When this is done, we take the pages in range from page
6193 * allocator removing them from the buddy system. This way 6188 * allocator removing them from the buddy system. This way
6194 * page allocator will never consider using them. 6189 * page allocator will never consider using them.
6195 * 6190 *
6196 * This lets us mark the pageblocks back as 6191 * This lets us mark the pageblocks back as
6197 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6192 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6198 * aligned range but not in the unaligned, original range are 6193 * aligned range but not in the unaligned, original range are
6199 * put back to page allocator so that buddy can use them. 6194 * put back to page allocator so that buddy can use them.
6200 */ 6195 */
6201 6196
6202 ret = start_isolate_page_range(pfn_max_align_down(start), 6197 ret = start_isolate_page_range(pfn_max_align_down(start),
6203 pfn_max_align_up(end), migratetype, 6198 pfn_max_align_up(end), migratetype,
6204 false); 6199 false);
6205 if (ret) 6200 if (ret)
6206 return ret; 6201 return ret;
6207 6202
6208 ret = __alloc_contig_migrate_range(&cc, start, end); 6203 ret = __alloc_contig_migrate_range(&cc, start, end);
6209 if (ret) 6204 if (ret)
6210 goto done; 6205 goto done;
6211 6206
6212 /* 6207 /*
6213 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 6208 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
6214 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6209 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6215 * more, all pages in [start, end) are free in page allocator. 6210 * more, all pages in [start, end) are free in page allocator.
6216 * What we are going to do is to allocate all pages from 6211 * What we are going to do is to allocate all pages from
6217 * [start, end) (that is remove them from page allocator). 6212 * [start, end) (that is remove them from page allocator).
6218 * 6213 *
6219 * The only problem is that pages at the beginning and at the 6214 * The only problem is that pages at the beginning and at the
6220 * end of the interesting range may not be aligned with pages that 6215 * end of the interesting range may not be aligned with pages that
6221 * page allocator holds, ie. they can be part of higher order 6216 * page allocator holds, ie. they can be part of higher order
6222 * pages. Because of this, we reserve the bigger range and 6217 * pages. Because of this, we reserve the bigger range and
6223 * once this is done free the pages we are not interested in. 6218 * once this is done free the pages we are not interested in.
6224 * 6219 *
6225 * We don't have to hold zone->lock here because the pages are 6220 * We don't have to hold zone->lock here because the pages are
6226 * isolated thus they won't get removed from buddy. 6221 * isolated thus they won't get removed from buddy.
6227 */ 6222 */
6228 6223
6229 lru_add_drain_all(); 6224 lru_add_drain_all();
6230 drain_all_pages(); 6225 drain_all_pages();
6231 6226
6232 order = 0; 6227 order = 0;
6233 outer_start = start; 6228 outer_start = start;
6234 while (!PageBuddy(pfn_to_page(outer_start))) { 6229 while (!PageBuddy(pfn_to_page(outer_start))) {
6235 if (++order >= MAX_ORDER) { 6230 if (++order >= MAX_ORDER) {
6236 ret = -EBUSY; 6231 ret = -EBUSY;
6237 goto done; 6232 goto done;
6238 } 6233 }
6239 outer_start &= ~0UL << order; 6234 outer_start &= ~0UL << order;
6240 } 6235 }
6241 6236
6242 /* Make sure the range is really isolated. */ 6237 /* Make sure the range is really isolated. */
6243 if (test_pages_isolated(outer_start, end, false)) { 6238 if (test_pages_isolated(outer_start, end, false)) {
6244 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6239 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
6245 outer_start, end); 6240 outer_start, end);
6246 ret = -EBUSY; 6241 ret = -EBUSY;
6247 goto done; 6242 goto done;
6248 } 6243 }
6249 6244
6250 6245
6251 /* Grab isolated pages from freelists. */ 6246 /* Grab isolated pages from freelists. */
6252 outer_end = isolate_freepages_range(&cc, outer_start, end); 6247 outer_end = isolate_freepages_range(&cc, outer_start, end);
6253 if (!outer_end) { 6248 if (!outer_end) {
6254 ret = -EBUSY; 6249 ret = -EBUSY;
6255 goto done; 6250 goto done;
6256 } 6251 }
6257 6252
6258 /* Free head and tail (if any) */ 6253 /* Free head and tail (if any) */
6259 if (start != outer_start) 6254 if (start != outer_start)
6260 free_contig_range(outer_start, start - outer_start); 6255 free_contig_range(outer_start, start - outer_start);
6261 if (end != outer_end) 6256 if (end != outer_end)
6262 free_contig_range(end, outer_end - end); 6257 free_contig_range(end, outer_end - end);
6263 6258
6264 done: 6259 done:
6265 undo_isolate_page_range(pfn_max_align_down(start), 6260 undo_isolate_page_range(pfn_max_align_down(start),
6266 pfn_max_align_up(end), migratetype); 6261 pfn_max_align_up(end), migratetype);
6267 return ret; 6262 return ret;
6268 } 6263 }
6269 6264
6270 void free_contig_range(unsigned long pfn, unsigned nr_pages) 6265 void free_contig_range(unsigned long pfn, unsigned nr_pages)
6271 { 6266 {
6272 unsigned int count = 0; 6267 unsigned int count = 0;
6273 6268
6274 for (; nr_pages--; pfn++) { 6269 for (; nr_pages--; pfn++) {
6275 struct page *page = pfn_to_page(pfn); 6270 struct page *page = pfn_to_page(pfn);
6276 6271
6277 count += page_count(page) != 1; 6272 count += page_count(page) != 1;
6278 __free_page(page); 6273 __free_page(page);
6279 } 6274 }
6280 WARN(count != 0, "%d pages are still in use!\n", count); 6275 WARN(count != 0, "%d pages are still in use!\n", count);
6281 } 6276 }
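/*
 * Minimal usage sketch of the pair above, with a hypothetical caller and
 * placeholder pfn/count values; a CMA-style user would request the range
 * from MIGRATE_CMA pageblocks and later hand it back:
 */
#if 0	/* illustrative only */
static struct page *grab_contig_pages(unsigned long base_pfn,
				      unsigned long count)
{
	/* Fails with a negative errno if the range cannot be emptied. */
	if (alloc_contig_range(base_pfn, base_pfn + count, MIGRATE_CMA))
		return NULL;
	return pfn_to_page(base_pfn);
}

static void release_contig_pages(struct page *page, unsigned long count)
{
	free_contig_range(page_to_pfn(page), count);
}
#endif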
6282 #endif 6277 #endif
6283 6278
6284 #ifdef CONFIG_MEMORY_HOTPLUG 6279 #ifdef CONFIG_MEMORY_HOTPLUG
6285 /* 6280 /*
6286 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6281 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6287 * page high values need to be recalculated. 6282 * page high values need to be recalculated.
6288 */ 6283 */
6289 void __meminit zone_pcp_update(struct zone *zone) 6284 void __meminit zone_pcp_update(struct zone *zone)
6290 { 6285 {
6291 unsigned cpu; 6286 unsigned cpu;
6292 mutex_lock(&pcp_batch_high_lock); 6287 mutex_lock(&pcp_batch_high_lock);
6293 for_each_possible_cpu(cpu) 6288 for_each_possible_cpu(cpu)
6294 pageset_set_high_and_batch(zone, 6289 pageset_set_high_and_batch(zone,
6295 per_cpu_ptr(zone->pageset, cpu)); 6290 per_cpu_ptr(zone->pageset, cpu));
6296 mutex_unlock(&pcp_batch_high_lock); 6291 mutex_unlock(&pcp_batch_high_lock);
6297 } 6292 }
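/*
 * Calling-pattern sketch (hedged): the memory hotplug path is expected to
 * adjust zone->managed_pages when memory is onlined or offlined and then
 * call zone_pcp_update(zone), so that every CPU's pageset gets a
 * high/batch pair sized for the new zone, serialized by
 * pcp_batch_high_lock.
 */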
6298 #endif 6293 #endif
6299 6294
6300 void zone_pcp_reset(struct zone *zone) 6295 void zone_pcp_reset(struct zone *zone)
6301 { 6296 {
6302 unsigned long flags; 6297 unsigned long flags;
6303 int cpu; 6298 int cpu;
6304 struct per_cpu_pageset *pset; 6299 struct per_cpu_pageset *pset;
6305 6300
6306 /* avoid races with drain_pages() */ 6301 /* avoid races with drain_pages() */
6307 local_irq_save(flags); 6302 local_irq_save(flags);
6308 if (zone->pageset != &boot_pageset) { 6303 if (zone->pageset != &boot_pageset) {
6309 for_each_online_cpu(cpu) { 6304 for_each_online_cpu(cpu) {
6310 pset = per_cpu_ptr(zone->pageset, cpu); 6305 pset = per_cpu_ptr(zone->pageset, cpu);
6311 drain_zonestat(zone, pset); 6306 drain_zonestat(zone, pset);
6312 } 6307 }
6313 free_percpu(zone->pageset); 6308 free_percpu(zone->pageset);
6314 zone->pageset = &boot_pageset; 6309 zone->pageset = &boot_pageset;
6315 } 6310 }
6316 local_irq_restore(flags); 6311 local_irq_restore(flags);
6317 } 6312 }
6318 6313
6319 #ifdef CONFIG_MEMORY_HOTREMOVE 6314 #ifdef CONFIG_MEMORY_HOTREMOVE
6320 /* 6315 /*
6321 * All pages in the range must be isolated before calling this. 6316 * All pages in the range must be isolated before calling this.
6322 */ 6317 */
6323 void 6318 void
6324 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6319 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6325 { 6320 {
6326 struct page *page; 6321 struct page *page;
6327 struct zone *zone; 6322 struct zone *zone;
6328 int order, i; 6323 int order, i;
6329 unsigned long pfn; 6324 unsigned long pfn;
6330 unsigned long flags; 6325 unsigned long flags;
6331 /* find the first valid pfn */ 6326 /* find the first valid pfn */
6332 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6327 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6333 if (pfn_valid(pfn)) 6328 if (pfn_valid(pfn))
6334 break; 6329 break;
6335 if (pfn == end_pfn) 6330 if (pfn == end_pfn)
6336 return; 6331 return;
6337 zone = page_zone(pfn_to_page(pfn)); 6332 zone = page_zone(pfn_to_page(pfn));
6338 spin_lock_irqsave(&zone->lock, flags); 6333 spin_lock_irqsave(&zone->lock, flags);
6339 pfn = start_pfn; 6334 pfn = start_pfn;
6340 while (pfn < end_pfn) { 6335 while (pfn < end_pfn) {
6341 if (!pfn_valid(pfn)) { 6336 if (!pfn_valid(pfn)) {
6342 pfn++; 6337 pfn++;
6343 continue; 6338 continue;
6344 } 6339 }
6345 page = pfn_to_page(pfn); 6340 page = pfn_to_page(pfn);
6346 /* 6341 /*
6347 * The HWPoisoned page may not be in the buddy system, and 6342 * The HWPoisoned page may not be in the buddy system, and
6348 * page_count() is not 0. 6343 * page_count() is not 0.
6349 */ 6344 */
6350 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6345 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6351 pfn++; 6346 pfn++;
6352 SetPageReserved(page); 6347 SetPageReserved(page);
6353 continue; 6348 continue;
6354 } 6349 }
6355 6350
6356 BUG_ON(page_count(page)); 6351 BUG_ON(page_count(page));
6357 BUG_ON(!PageBuddy(page)); 6352 BUG_ON(!PageBuddy(page));
6358 order = page_order(page); 6353 order = page_order(page);
6359 #ifdef CONFIG_DEBUG_VM 6354 #ifdef CONFIG_DEBUG_VM
6360 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6355 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6361 pfn, 1 << order, end_pfn); 6356 pfn, 1 << order, end_pfn);
6362 #endif 6357 #endif
6363 list_del(&page->lru); 6358 list_del(&page->lru);
6364 rmv_page_order(page); 6359 rmv_page_order(page);
6365 zone->free_area[order].nr_free--; 6360 zone->free_area[order].nr_free--;
6366 for (i = 0; i < (1 << order); i++) 6361 for (i = 0; i < (1 << order); i++)
6367 SetPageReserved((page+i)); 6362 SetPageReserved((page+i));
6368 pfn += (1 << order); 6363 pfn += (1 << order);
6369 } 6364 }
6370 spin_unlock_irqrestore(&zone->lock, flags); 6365 spin_unlock_irqrestore(&zone->lock, flags);
6371 } 6366 }
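/*
 * Context sketch (hedged): the hot-remove path is expected to isolate the
 * whole range and migrate everything movable away before calling
 * __offline_isolated_pages(), so by this point every remaining page is
 * either a free buddy block, pulled off the free list and marked reserved
 * above, or a HWPoisoned page that was never returned to the buddy
 * allocator and is simply marked reserved and skipped.
 */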
6372 #endif 6367 #endif
6373 6368
6374 #ifdef CONFIG_MEMORY_FAILURE 6369 #ifdef CONFIG_MEMORY_FAILURE
6375 bool is_free_buddy_page(struct page *page) 6370 bool is_free_buddy_page(struct page *page)
6376 { 6371 {
6377 struct zone *zone = page_zone(page); 6372 struct zone *zone = page_zone(page);
6378 unsigned long pfn = page_to_pfn(page); 6373 unsigned long pfn = page_to_pfn(page);
6379 unsigned long flags; 6374 unsigned long flags;
6380 int order; 6375 int order;
6381 6376
6382 spin_lock_irqsave(&zone->lock, flags); 6377 spin_lock_irqsave(&zone->lock, flags);
6383 for (order = 0; order < MAX_ORDER; order++) { 6378 for (order = 0; order < MAX_ORDER; order++) {
6384 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6379 struct page *page_head = page - (pfn & ((1 << order) - 1));
6385 6380
6386 if (PageBuddy(page_head) && page_order(page_head) >= order) 6381 if (PageBuddy(page_head) && page_order(page_head) >= order)
6387 break; 6382 break;
6388 } 6383 }
6389 spin_unlock_irqrestore(&zone->lock, flags); 6384 spin_unlock_irqrestore(&zone->lock, flags);
6390 6385
6391 return order < MAX_ORDER; 6386 return order < MAX_ORDER;
6392 } 6387 }
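/*
 * Worked example with an assumed pfn of 0x12345: the candidate buddy
 * heads probed by the loop are
 *
 *	order 0: 0x12345	order 3: 0x12340
 *	order 1: 0x12344	order 4: 0x12340
 *	order 2: 0x12344	...and so on up to MAX_ORDER - 1
 *
 * If, say, pfn 0x12340 is a PageBuddy head of order >= 3, the page lies
 * inside a free buddy block and the function returns true.
 */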
6393 #endif 6388 #endif
6394 6389
6395 static const struct trace_print_flags pageflag_names[] = { 6390 static const struct trace_print_flags pageflag_names[] = {
6396 {1UL << PG_locked, "locked" }, 6391 {1UL << PG_locked, "locked" },
6397 {1UL << PG_error, "error" }, 6392 {1UL << PG_error, "error" },
6398 {1UL << PG_referenced, "referenced" }, 6393 {1UL << PG_referenced, "referenced" },
6399 {1UL << PG_uptodate, "uptodate" }, 6394 {1UL << PG_uptodate, "uptodate" },
6400 {1UL << PG_dirty, "dirty" }, 6395 {1UL << PG_dirty, "dirty" },
6401 {1UL << PG_lru, "lru" }, 6396 {1UL << PG_lru, "lru" },
6402 {1UL << PG_active, "active" }, 6397 {1UL << PG_active, "active" },
6403 {1UL << PG_slab, "slab" }, 6398 {1UL << PG_slab, "slab" },
6404 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6399 {1UL << PG_owner_priv_1, "owner_priv_1" },
6405 {1UL << PG_arch_1, "arch_1" }, 6400 {1UL << PG_arch_1, "arch_1" },
6406 {1UL << PG_reserved, "reserved" }, 6401 {1UL << PG_reserved, "reserved" },
6407 {1UL << PG_private, "private" }, 6402 {1UL << PG_private, "private" },
6408 {1UL << PG_private_2, "private_2" }, 6403 {1UL << PG_private_2, "private_2" },
6409 {1UL << PG_writeback, "writeback" }, 6404 {1UL << PG_writeback, "writeback" },
6410 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6405 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6411 {1UL << PG_head, "head" }, 6406 {1UL << PG_head, "head" },
6412 {1UL << PG_tail, "tail" }, 6407 {1UL << PG_tail, "tail" },
6413 #else 6408 #else
6414 {1UL << PG_compound, "compound" }, 6409 {1UL << PG_compound, "compound" },
6415 #endif 6410 #endif
6416 {1UL << PG_swapcache, "swapcache" }, 6411 {1UL << PG_swapcache, "swapcache" },
6417 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6412 {1UL << PG_mappedtodisk, "mappedtodisk" },
6418 {1UL << PG_reclaim, "reclaim" }, 6413 {1UL << PG_reclaim, "reclaim" },
6419 {1UL << PG_swapbacked, "swapbacked" }, 6414 {1UL << PG_swapbacked, "swapbacked" },
6420 {1UL << PG_unevictable, "unevictable" }, 6415 {1UL << PG_unevictable, "unevictable" },
6421 #ifdef CONFIG_MMU 6416 #ifdef CONFIG_MMU
6422 {1UL << PG_mlocked, "mlocked" }, 6417 {1UL << PG_mlocked, "mlocked" },
6423 #endif 6418 #endif
6424 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6419 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6425 {1UL << PG_uncached, "uncached" }, 6420 {1UL << PG_uncached, "uncached" },
6426 #endif 6421 #endif
6427 #ifdef CONFIG_MEMORY_FAILURE 6422 #ifdef CONFIG_MEMORY_FAILURE
6428 {1UL << PG_hwpoison, "hwpoison" }, 6423 {1UL << PG_hwpoison, "hwpoison" },
6429 #endif 6424 #endif
6430 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6425 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6431 {1UL << PG_compound_lock, "compound_lock" }, 6426 {1UL << PG_compound_lock, "compound_lock" },
6432 #endif 6427 #endif
6433 }; 6428 };
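/*
 * Note: this table is expected to carry one entry per page flag; the
 * BUILD_BUG_ON() in dump_page_flags() below fails the build when the
 * number of entries here stops matching __NR_PAGEFLAGS.
 */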
6434 6429
6435 static void dump_page_flags(unsigned long flags) 6430 static void dump_page_flags(unsigned long flags)
6436 { 6431 {
6437 const char *delim = ""; 6432 const char *delim = "";
6438 unsigned long mask; 6433 unsigned long mask;
6439 int i; 6434 int i;
6440 6435
6441 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6436 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6442 6437
6443 printk(KERN_ALERT "page flags: %#lx(", flags); 6438 printk(KERN_ALERT "page flags: %#lx(", flags);
6444 6439
6445 /* remove zone id */ 6440 /* remove zone id */
6446 flags &= (1UL << NR_PAGEFLAGS) - 1; 6441 flags &= (1UL << NR_PAGEFLAGS) - 1;
6447 6442
6448 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6443 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6449 6444
6450 mask = pageflag_names[i].mask; 6445 mask = pageflag_names[i].mask;
6451 if ((flags & mask) != mask) 6446 if ((flags & mask) != mask)
6452 continue; 6447 continue;
6453 6448
6454 flags &= ~mask; 6449 flags &= ~mask;
6455 printk("%s%s", delim, pageflag_names[i].name); 6450 printk("%s%s", delim, pageflag_names[i].name);
6456 delim = "|"; 6451 delim = "|";
6457 } 6452 }
6458 6453
6459 /* check for left over flags */ 6454 /* check for left over flags */
6460 if (flags) 6455 if (flags)
6461 printk("%s%#lx", delim, flags); 6456 printk("%s%#lx", delim, flags);
6462 6457
6463 printk(")\n"); 6458 printk(")\n");
6464 } 6459 }
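/*
 * Illustrative output (the hex value is arbitrary): for a page that is
 * locked, uptodate and on the LRU this prints roughly
 *
 *	page flags: 0x...(locked|uptodate|lru)
 *
 * with any bits that have no entry in pageflag_names[] appended as a raw
 * hex value before the closing parenthesis.
 */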
6465 6460
6466 void dump_page(struct page *page) 6461 void dump_page(struct page *page)
6467 { 6462 {
6468 printk(KERN_ALERT 6463 printk(KERN_ALERT
6469 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6464 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6470 page, atomic_read(&page->_count), page_mapcount(page), 6465 page, atomic_read(&page->_count), page_mapcount(page),
6471 page->mapping, page->index); 6466 page->mapping, page->index);