Commit 51aad0a51582e4147380137ba34785663a1b5f93

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent fc915114c8

mm: page_alloc: abort fair zone allocation policy when remotes nodes are encountered

commit f7b5d647946aae1647bf5cd26c16b3a793c1ac49 upstream.

The purpose of numa_zonelist_order=zone is to preserve lower zones for
use with 32-bit devices.  If locality is preferred then the
numa_zonelist_order=node policy should be used.

Unfortunately, the fair zone allocation policy overrides this by
skipping zones on remote nodes until the lower one is found.  While this
makes sense from a page aging and performance perspective, it breaks the
expected zonelist policy.  This patch restores the expected behaviour
for zone-list ordering.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 1 changed file with 1 addition and 1 deletion (inline diff)
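The changed hunk itself sits further down the file than the excerpt reproduced below. As a rough sketch only, not the literal hunk, and assuming the 3.12-era fair-policy check in get_page_from_freelist() built around ALLOC_FAIR, zone_local() and NR_ALLOC_BATCH, the restored behaviour amounts to aborting the fair-policy pass at the first zone on a remote node rather than skipping that zone:

	/*
	 * Illustrative sketch, not the upstream diff: once the zonelist
	 * walk reaches a zone on a remote node, stop applying the fair
	 * zone allocation policy so the configured zonelist order
	 * (e.g. numa_zonelist_order=zone) is honoured.
	 */
	if (alloc_flags & ALLOC_FAIR) {
		if (!zone_local(preferred_zone, zone))
			break;		/* previously: continue */
		if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
			continue;
	}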

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 #define MIN_PERCPU_PAGELIST_FRACTION (8) 72 #define MIN_PERCPU_PAGELIST_FRACTION (8)
73 73
74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
75 DEFINE_PER_CPU(int, numa_node); 75 DEFINE_PER_CPU(int, numa_node);
76 EXPORT_PER_CPU_SYMBOL(numa_node); 76 EXPORT_PER_CPU_SYMBOL(numa_node);
77 #endif 77 #endif
78 78
79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
80 /* 80 /*
81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
84 * defined in <linux/topology.h>. 84 * defined in <linux/topology.h>.
85 */ 85 */
86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 87 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
88 #endif 88 #endif
89 89
90 /* 90 /*
91 * Array of node states. 91 * Array of node states.
92 */ 92 */
93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
94 [N_POSSIBLE] = NODE_MASK_ALL, 94 [N_POSSIBLE] = NODE_MASK_ALL,
95 [N_ONLINE] = { { [0] = 1UL } }, 95 [N_ONLINE] = { { [0] = 1UL } },
96 #ifndef CONFIG_NUMA 96 #ifndef CONFIG_NUMA
97 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 97 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
98 #ifdef CONFIG_HIGHMEM 98 #ifdef CONFIG_HIGHMEM
99 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 99 [N_HIGH_MEMORY] = { { [0] = 1UL } },
100 #endif 100 #endif
101 #ifdef CONFIG_MOVABLE_NODE 101 #ifdef CONFIG_MOVABLE_NODE
102 [N_MEMORY] = { { [0] = 1UL } }, 102 [N_MEMORY] = { { [0] = 1UL } },
103 #endif 103 #endif
104 [N_CPU] = { { [0] = 1UL } }, 104 [N_CPU] = { { [0] = 1UL } },
105 #endif /* NUMA */ 105 #endif /* NUMA */
106 }; 106 };
107 EXPORT_SYMBOL(node_states); 107 EXPORT_SYMBOL(node_states);
108 108
109 /* Protect totalram_pages and zone->managed_pages */ 109 /* Protect totalram_pages and zone->managed_pages */
110 static DEFINE_SPINLOCK(managed_page_count_lock); 110 static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112 unsigned long totalram_pages __read_mostly; 112 unsigned long totalram_pages __read_mostly;
113 unsigned long totalreserve_pages __read_mostly; 113 unsigned long totalreserve_pages __read_mostly;
114 /* 114 /*
115 * When calculating the number of globally allowed dirty pages, there 115 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 116 * is a certain number of per-zone reserves that should not be
117 * considered dirtyable memory. This is the sum of those reserves 117 * considered dirtyable memory. This is the sum of those reserves
118 * over all existing zones that contribute dirtyable memory. 118 * over all existing zones that contribute dirtyable memory.
119 */ 119 */
120 unsigned long dirty_balance_reserve __read_mostly; 120 unsigned long dirty_balance_reserve __read_mostly;
121 121
122 int percpu_pagelist_fraction; 122 int percpu_pagelist_fraction;
123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
124 124
125 #ifdef CONFIG_PM_SLEEP 125 #ifdef CONFIG_PM_SLEEP
126 /* 126 /*
127 * The following functions are used by the suspend/hibernate code to temporarily 127 * The following functions are used by the suspend/hibernate code to temporarily
128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
129 * while devices are suspended. To avoid races with the suspend/hibernate code, 129 * while devices are suspended. To avoid races with the suspend/hibernate code,
130 * they should always be called with pm_mutex held (gfp_allowed_mask also should 130 * they should always be called with pm_mutex held (gfp_allowed_mask also should
131 * only be modified with pm_mutex held, unless the suspend/hibernate code is 131 * only be modified with pm_mutex held, unless the suspend/hibernate code is
132 * guaranteed not to run in parallel with that modification). 132 * guaranteed not to run in parallel with that modification).
133 */ 133 */
134 134
135 static gfp_t saved_gfp_mask; 135 static gfp_t saved_gfp_mask;
136 136
137 void pm_restore_gfp_mask(void) 137 void pm_restore_gfp_mask(void)
138 { 138 {
139 WARN_ON(!mutex_is_locked(&pm_mutex)); 139 WARN_ON(!mutex_is_locked(&pm_mutex));
140 if (saved_gfp_mask) { 140 if (saved_gfp_mask) {
141 gfp_allowed_mask = saved_gfp_mask; 141 gfp_allowed_mask = saved_gfp_mask;
142 saved_gfp_mask = 0; 142 saved_gfp_mask = 0;
143 } 143 }
144 } 144 }
145 145
146 void pm_restrict_gfp_mask(void) 146 void pm_restrict_gfp_mask(void)
147 { 147 {
148 WARN_ON(!mutex_is_locked(&pm_mutex)); 148 WARN_ON(!mutex_is_locked(&pm_mutex));
149 WARN_ON(saved_gfp_mask); 149 WARN_ON(saved_gfp_mask);
150 saved_gfp_mask = gfp_allowed_mask; 150 saved_gfp_mask = gfp_allowed_mask;
151 gfp_allowed_mask &= ~GFP_IOFS; 151 gfp_allowed_mask &= ~GFP_IOFS;
152 } 152 }
153 153
154 bool pm_suspended_storage(void) 154 bool pm_suspended_storage(void)
155 { 155 {
156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
157 return false; 157 return false;
158 return true; 158 return true;
159 } 159 }
160 #endif /* CONFIG_PM_SLEEP */ 160 #endif /* CONFIG_PM_SLEEP */
161 161
162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
163 int pageblock_order __read_mostly; 163 int pageblock_order __read_mostly;
164 #endif 164 #endif
165 165
166 static void __free_pages_ok(struct page *page, unsigned int order); 166 static void __free_pages_ok(struct page *page, unsigned int order);
167 167
168 /* 168 /*
169 * results with 256, 32 in the lowmem_reserve sysctl: 169 * results with 256, 32 in the lowmem_reserve sysctl:
170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
171 * 1G machine -> (16M dma, 784M normal, 224M high) 171 * 1G machine -> (16M dma, 784M normal, 224M high)
172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
175 * 175 *
176 * TBD: should special case ZONE_DMA32 machines here - in those we normally 176 * TBD: should special case ZONE_DMA32 machines here - in those we normally
177 * don't need any ZONE_NORMAL reservation 177 * don't need any ZONE_NORMAL reservation
178 */ 178 */
179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
180 #ifdef CONFIG_ZONE_DMA 180 #ifdef CONFIG_ZONE_DMA
181 256, 181 256,
182 #endif 182 #endif
183 #ifdef CONFIG_ZONE_DMA32 183 #ifdef CONFIG_ZONE_DMA32
184 256, 184 256,
185 #endif 185 #endif
186 #ifdef CONFIG_HIGHMEM 186 #ifdef CONFIG_HIGHMEM
187 32, 187 32,
188 #endif 188 #endif
189 32, 189 32,
190 }; 190 };
191 191
192 EXPORT_SYMBOL(totalram_pages); 192 EXPORT_SYMBOL(totalram_pages);
193 193
194 static char * const zone_names[MAX_NR_ZONES] = { 194 static char * const zone_names[MAX_NR_ZONES] = {
195 #ifdef CONFIG_ZONE_DMA 195 #ifdef CONFIG_ZONE_DMA
196 "DMA", 196 "DMA",
197 #endif 197 #endif
198 #ifdef CONFIG_ZONE_DMA32 198 #ifdef CONFIG_ZONE_DMA32
199 "DMA32", 199 "DMA32",
200 #endif 200 #endif
201 "Normal", 201 "Normal",
202 #ifdef CONFIG_HIGHMEM 202 #ifdef CONFIG_HIGHMEM
203 "HighMem", 203 "HighMem",
204 #endif 204 #endif
205 "Movable", 205 "Movable",
206 }; 206 };
207 207
208 int min_free_kbytes = 1024; 208 int min_free_kbytes = 1024;
209 int user_min_free_kbytes; 209 int user_min_free_kbytes;
210 210
211 static unsigned long __meminitdata nr_kernel_pages; 211 static unsigned long __meminitdata nr_kernel_pages;
212 static unsigned long __meminitdata nr_all_pages; 212 static unsigned long __meminitdata nr_all_pages;
213 static unsigned long __meminitdata dma_reserve; 213 static unsigned long __meminitdata dma_reserve;
214 214
215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
218 static unsigned long __initdata required_kernelcore; 218 static unsigned long __initdata required_kernelcore;
219 static unsigned long __initdata required_movablecore; 219 static unsigned long __initdata required_movablecore;
220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
221 221
222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
223 int movable_zone; 223 int movable_zone;
224 EXPORT_SYMBOL(movable_zone); 224 EXPORT_SYMBOL(movable_zone);
225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
226 226
227 #if MAX_NUMNODES > 1 227 #if MAX_NUMNODES > 1
228 int nr_node_ids __read_mostly = MAX_NUMNODES; 228 int nr_node_ids __read_mostly = MAX_NUMNODES;
229 int nr_online_nodes __read_mostly = 1; 229 int nr_online_nodes __read_mostly = 1;
230 EXPORT_SYMBOL(nr_node_ids); 230 EXPORT_SYMBOL(nr_node_ids);
231 EXPORT_SYMBOL(nr_online_nodes); 231 EXPORT_SYMBOL(nr_online_nodes);
232 #endif 232 #endif
233 233
234 int page_group_by_mobility_disabled __read_mostly; 234 int page_group_by_mobility_disabled __read_mostly;
235 235
236 void set_pageblock_migratetype(struct page *page, int migratetype) 236 void set_pageblock_migratetype(struct page *page, int migratetype)
237 { 237 {
238 238
239 if (unlikely(page_group_by_mobility_disabled)) 239 if (unlikely(page_group_by_mobility_disabled))
240 migratetype = MIGRATE_UNMOVABLE; 240 migratetype = MIGRATE_UNMOVABLE;
241 241
242 set_pageblock_flags_group(page, (unsigned long)migratetype, 242 set_pageblock_flags_group(page, (unsigned long)migratetype,
243 PB_migrate, PB_migrate_end); 243 PB_migrate, PB_migrate_end);
244 } 244 }
245 245
246 bool oom_killer_disabled __read_mostly; 246 bool oom_killer_disabled __read_mostly;
247 247
248 #ifdef CONFIG_DEBUG_VM 248 #ifdef CONFIG_DEBUG_VM
249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
250 { 250 {
251 int ret = 0; 251 int ret = 0;
252 unsigned seq; 252 unsigned seq;
253 unsigned long pfn = page_to_pfn(page); 253 unsigned long pfn = page_to_pfn(page);
254 unsigned long sp, start_pfn; 254 unsigned long sp, start_pfn;
255 255
256 do { 256 do {
257 seq = zone_span_seqbegin(zone); 257 seq = zone_span_seqbegin(zone);
258 start_pfn = zone->zone_start_pfn; 258 start_pfn = zone->zone_start_pfn;
259 sp = zone->spanned_pages; 259 sp = zone->spanned_pages;
260 if (!zone_spans_pfn(zone, pfn)) 260 if (!zone_spans_pfn(zone, pfn))
261 ret = 1; 261 ret = 1;
262 } while (zone_span_seqretry(zone, seq)); 262 } while (zone_span_seqretry(zone, seq));
263 263
264 if (ret) 264 if (ret)
265 pr_err("page %lu outside zone [ %lu - %lu ]\n", 265 pr_err("page %lu outside zone [ %lu - %lu ]\n",
266 pfn, start_pfn, start_pfn + sp); 266 pfn, start_pfn, start_pfn + sp);
267 267
268 return ret; 268 return ret;
269 } 269 }
270 270
271 static int page_is_consistent(struct zone *zone, struct page *page) 271 static int page_is_consistent(struct zone *zone, struct page *page)
272 { 272 {
273 if (!pfn_valid_within(page_to_pfn(page))) 273 if (!pfn_valid_within(page_to_pfn(page)))
274 return 0; 274 return 0;
275 if (zone != page_zone(page)) 275 if (zone != page_zone(page))
276 return 0; 276 return 0;
277 277
278 return 1; 278 return 1;
279 } 279 }
280 /* 280 /*
281 * Temporary debugging check for pages not lying within a given zone. 281 * Temporary debugging check for pages not lying within a given zone.
282 */ 282 */
283 static int bad_range(struct zone *zone, struct page *page) 283 static int bad_range(struct zone *zone, struct page *page)
284 { 284 {
285 if (page_outside_zone_boundaries(zone, page)) 285 if (page_outside_zone_boundaries(zone, page))
286 return 1; 286 return 1;
287 if (!page_is_consistent(zone, page)) 287 if (!page_is_consistent(zone, page))
288 return 1; 288 return 1;
289 289
290 return 0; 290 return 0;
291 } 291 }
292 #else 292 #else
293 static inline int bad_range(struct zone *zone, struct page *page) 293 static inline int bad_range(struct zone *zone, struct page *page)
294 { 294 {
295 return 0; 295 return 0;
296 } 296 }
297 #endif 297 #endif
298 298
299 static void bad_page(struct page *page) 299 static void bad_page(struct page *page)
300 { 300 {
301 static unsigned long resume; 301 static unsigned long resume;
302 static unsigned long nr_shown; 302 static unsigned long nr_shown;
303 static unsigned long nr_unshown; 303 static unsigned long nr_unshown;
304 304
305 /* Don't complain about poisoned pages */ 305 /* Don't complain about poisoned pages */
306 if (PageHWPoison(page)) { 306 if (PageHWPoison(page)) {
307 page_mapcount_reset(page); /* remove PageBuddy */ 307 page_mapcount_reset(page); /* remove PageBuddy */
308 return; 308 return;
309 } 309 }
310 310
311 /* 311 /*
312 * Allow a burst of 60 reports, then keep quiet for that minute; 312 * Allow a burst of 60 reports, then keep quiet for that minute;
313 * or allow a steady drip of one report per second. 313 * or allow a steady drip of one report per second.
314 */ 314 */
315 if (nr_shown == 60) { 315 if (nr_shown == 60) {
316 if (time_before(jiffies, resume)) { 316 if (time_before(jiffies, resume)) {
317 nr_unshown++; 317 nr_unshown++;
318 goto out; 318 goto out;
319 } 319 }
320 if (nr_unshown) { 320 if (nr_unshown) {
321 printk(KERN_ALERT 321 printk(KERN_ALERT
322 "BUG: Bad page state: %lu messages suppressed\n", 322 "BUG: Bad page state: %lu messages suppressed\n",
323 nr_unshown); 323 nr_unshown);
324 nr_unshown = 0; 324 nr_unshown = 0;
325 } 325 }
326 nr_shown = 0; 326 nr_shown = 0;
327 } 327 }
328 if (nr_shown++ == 0) 328 if (nr_shown++ == 0)
329 resume = jiffies + 60 * HZ; 329 resume = jiffies + 60 * HZ;
330 330
331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
332 current->comm, page_to_pfn(page)); 332 current->comm, page_to_pfn(page));
333 dump_page(page); 333 dump_page(page);
334 334
335 print_modules(); 335 print_modules();
336 dump_stack(); 336 dump_stack();
337 out: 337 out:
338 /* Leave bad fields for debug, except PageBuddy could make trouble */ 338 /* Leave bad fields for debug, except PageBuddy could make trouble */
339 page_mapcount_reset(page); /* remove PageBuddy */ 339 page_mapcount_reset(page); /* remove PageBuddy */
340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
341 } 341 }
342 342
343 /* 343 /*
344 * Higher-order pages are called "compound pages". They are structured thusly: 344 * Higher-order pages are called "compound pages". They are structured thusly:
345 * 345 *
346 * The first PAGE_SIZE page is called the "head page". 346 * The first PAGE_SIZE page is called the "head page".
347 * 347 *
348 * The remaining PAGE_SIZE pages are called "tail pages". 348 * The remaining PAGE_SIZE pages are called "tail pages".
349 * 349 *
350 * All pages have PG_compound set. All tail pages have their ->first_page 350 * All pages have PG_compound set. All tail pages have their ->first_page
351 * pointing at the head page. 351 * pointing at the head page.
352 * 352 *
353 * The first tail page's ->lru.next holds the address of the compound page's 353 * The first tail page's ->lru.next holds the address of the compound page's
354 * put_page() function. Its ->lru.prev holds the order of allocation. 354 * put_page() function. Its ->lru.prev holds the order of allocation.
355 * This usage means that zero-order pages may not be compound. 355 * This usage means that zero-order pages may not be compound.
356 */ 356 */
357 357
358 static void free_compound_page(struct page *page) 358 static void free_compound_page(struct page *page)
359 { 359 {
360 __free_pages_ok(page, compound_order(page)); 360 __free_pages_ok(page, compound_order(page));
361 } 361 }
362 362
363 void prep_compound_page(struct page *page, unsigned long order) 363 void prep_compound_page(struct page *page, unsigned long order)
364 { 364 {
365 int i; 365 int i;
366 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
367 367
368 set_compound_page_dtor(page, free_compound_page); 368 set_compound_page_dtor(page, free_compound_page);
369 set_compound_order(page, order); 369 set_compound_order(page, order);
370 __SetPageHead(page); 370 __SetPageHead(page);
371 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
372 struct page *p = page + i; 372 struct page *p = page + i;
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 /* Make sure p->first_page is always valid for PageTail() */ 375 /* Make sure p->first_page is always valid for PageTail() */
376 smp_wmb(); 376 smp_wmb();
377 __SetPageTail(p); 377 __SetPageTail(p);
378 } 378 }
379 } 379 }
380 380
381 /* update __split_huge_page_refcount if you change this function */ 381 /* update __split_huge_page_refcount if you change this function */
382 static int destroy_compound_page(struct page *page, unsigned long order) 382 static int destroy_compound_page(struct page *page, unsigned long order)
383 { 383 {
384 int i; 384 int i;
385 int nr_pages = 1 << order; 385 int nr_pages = 1 << order;
386 int bad = 0; 386 int bad = 0;
387 387
388 if (unlikely(compound_order(page) != order)) { 388 if (unlikely(compound_order(page) != order)) {
389 bad_page(page); 389 bad_page(page);
390 bad++; 390 bad++;
391 } 391 }
392 392
393 __ClearPageHead(page); 393 __ClearPageHead(page);
394 394
395 for (i = 1; i < nr_pages; i++) { 395 for (i = 1; i < nr_pages; i++) {
396 struct page *p = page + i; 396 struct page *p = page + i;
397 397
398 if (unlikely(!PageTail(p) || (p->first_page != page))) { 398 if (unlikely(!PageTail(p) || (p->first_page != page))) {
399 bad_page(page); 399 bad_page(page);
400 bad++; 400 bad++;
401 } 401 }
402 __ClearPageTail(p); 402 __ClearPageTail(p);
403 } 403 }
404 404
405 return bad; 405 return bad;
406 } 406 }
407 407
408 static inline void prep_zero_page(struct page *page, unsigned int order, 408 static inline void prep_zero_page(struct page *page, unsigned int order,
409 gfp_t gfp_flags) 409 gfp_t gfp_flags)
410 { 410 {
411 int i; 411 int i;
412 412
413 /* 413 /*
414 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 414 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
415 * and __GFP_HIGHMEM from hard or soft interrupt context. 415 * and __GFP_HIGHMEM from hard or soft interrupt context.
416 */ 416 */
417 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 417 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
418 for (i = 0; i < (1 << order); i++) 418 for (i = 0; i < (1 << order); i++)
419 clear_highpage(page + i); 419 clear_highpage(page + i);
420 } 420 }
421 421
422 #ifdef CONFIG_DEBUG_PAGEALLOC 422 #ifdef CONFIG_DEBUG_PAGEALLOC
423 unsigned int _debug_guardpage_minorder; 423 unsigned int _debug_guardpage_minorder;
424 424
425 static int __init debug_guardpage_minorder_setup(char *buf) 425 static int __init debug_guardpage_minorder_setup(char *buf)
426 { 426 {
427 unsigned long res; 427 unsigned long res;
428 428
429 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 429 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
430 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 430 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
431 return 0; 431 return 0;
432 } 432 }
433 _debug_guardpage_minorder = res; 433 _debug_guardpage_minorder = res;
434 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 434 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
435 return 0; 435 return 0;
436 } 436 }
437 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 437 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
438 438
439 static inline void set_page_guard_flag(struct page *page) 439 static inline void set_page_guard_flag(struct page *page)
440 { 440 {
441 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 441 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
442 } 442 }
443 443
444 static inline void clear_page_guard_flag(struct page *page) 444 static inline void clear_page_guard_flag(struct page *page)
445 { 445 {
446 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 446 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
447 } 447 }
448 #else 448 #else
449 static inline void set_page_guard_flag(struct page *page) { } 449 static inline void set_page_guard_flag(struct page *page) { }
450 static inline void clear_page_guard_flag(struct page *page) { } 450 static inline void clear_page_guard_flag(struct page *page) { }
451 #endif 451 #endif
452 452
453 static inline void set_page_order(struct page *page, unsigned int order) 453 static inline void set_page_order(struct page *page, unsigned int order)
454 { 454 {
455 set_page_private(page, order); 455 set_page_private(page, order);
456 __SetPageBuddy(page); 456 __SetPageBuddy(page);
457 } 457 }
458 458
459 static inline void rmv_page_order(struct page *page) 459 static inline void rmv_page_order(struct page *page)
460 { 460 {
461 __ClearPageBuddy(page); 461 __ClearPageBuddy(page);
462 set_page_private(page, 0); 462 set_page_private(page, 0);
463 } 463 }
464 464
465 /* 465 /*
466 * Locate the struct page for both the matching buddy in our 466 * Locate the struct page for both the matching buddy in our
467 * pair (buddy1) and the combined O(n+1) page they form (page). 467 * pair (buddy1) and the combined O(n+1) page they form (page).
468 * 468 *
469 * 1) Any buddy B1 will have an order O twin B2 which satisfies 469 * 1) Any buddy B1 will have an order O twin B2 which satisfies
470 * the following equation: 470 * the following equation:
471 * B2 = B1 ^ (1 << O) 471 * B2 = B1 ^ (1 << O)
472 * For example, if the starting buddy (buddy2) is #8 its order 472 * For example, if the starting buddy (buddy2) is #8 its order
473 * 1 buddy is #10: 473 * 1 buddy is #10:
474 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 474 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
475 * 475 *
476 * 2) Any buddy B will have an order O+1 parent P which 476 * 2) Any buddy B will have an order O+1 parent P which
477 * satisfies the following equation: 477 * satisfies the following equation:
478 * P = B & ~(1 << O) 478 * P = B & ~(1 << O)
479 * 479 *
480 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 480 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
481 */ 481 */
482 static inline unsigned long 482 static inline unsigned long
483 __find_buddy_index(unsigned long page_idx, unsigned int order) 483 __find_buddy_index(unsigned long page_idx, unsigned int order)
484 { 484 {
485 return page_idx ^ (1 << order); 485 return page_idx ^ (1 << order);
486 } 486 }
487 487
488 /* 488 /*
489 * This function checks whether a page is free && is the buddy 489 * This function checks whether a page is free && is the buddy
490 * we can do coalesce a page and its buddy if 490 * we can do coalesce a page and its buddy if
491 * (a) the buddy is not in a hole && 491 * (a) the buddy is not in a hole &&
492 * (b) the buddy is in the buddy system && 492 * (b) the buddy is in the buddy system &&
493 * (c) a page and its buddy have the same order && 493 * (c) a page and its buddy have the same order &&
494 * (d) a page and its buddy are in the same zone. 494 * (d) a page and its buddy are in the same zone.
495 * 495 *
496 * For recording whether a page is in the buddy system, we set ->_mapcount 496 * For recording whether a page is in the buddy system, we set ->_mapcount
497 * PAGE_BUDDY_MAPCOUNT_VALUE. 497 * PAGE_BUDDY_MAPCOUNT_VALUE.
498 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 498 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
499 * serialized by zone->lock. 499 * serialized by zone->lock.
500 * 500 *
501 * For recording page's order, we use page_private(page). 501 * For recording page's order, we use page_private(page).
502 */ 502 */
503 static inline int page_is_buddy(struct page *page, struct page *buddy, 503 static inline int page_is_buddy(struct page *page, struct page *buddy,
504 unsigned int order) 504 unsigned int order)
505 { 505 {
506 if (!pfn_valid_within(page_to_pfn(buddy))) 506 if (!pfn_valid_within(page_to_pfn(buddy)))
507 return 0; 507 return 0;
508 508
509 if (page_is_guard(buddy) && page_order(buddy) == order) { 509 if (page_is_guard(buddy) && page_order(buddy) == order) {
510 VM_BUG_ON(page_count(buddy) != 0); 510 VM_BUG_ON(page_count(buddy) != 0);
511 511
512 if (page_zone_id(page) != page_zone_id(buddy)) 512 if (page_zone_id(page) != page_zone_id(buddy))
513 return 0; 513 return 0;
514 514
515 return 1; 515 return 1;
516 } 516 }
517 517
518 if (PageBuddy(buddy) && page_order(buddy) == order) { 518 if (PageBuddy(buddy) && page_order(buddy) == order) {
519 VM_BUG_ON(page_count(buddy) != 0); 519 VM_BUG_ON(page_count(buddy) != 0);
520 520
521 /* 521 /*
522 * zone check is done late to avoid uselessly 522 * zone check is done late to avoid uselessly
523 * calculating zone/node ids for pages that could 523 * calculating zone/node ids for pages that could
524 * never merge. 524 * never merge.
525 */ 525 */
526 if (page_zone_id(page) != page_zone_id(buddy)) 526 if (page_zone_id(page) != page_zone_id(buddy))
527 return 0; 527 return 0;
528 528
529 return 1; 529 return 1;
530 } 530 }
531 return 0; 531 return 0;
532 } 532 }
533 533
534 /* 534 /*
535 * Freeing function for a buddy system allocator. 535 * Freeing function for a buddy system allocator.
536 * 536 *
537 * The concept of a buddy system is to maintain direct-mapped table 537 * The concept of a buddy system is to maintain direct-mapped table
538 * (containing bit values) for memory blocks of various "orders". 538 * (containing bit values) for memory blocks of various "orders".
539 * The bottom level table contains the map for the smallest allocatable 539 * The bottom level table contains the map for the smallest allocatable
540 * units of memory (here, pages), and each level above it describes 540 * units of memory (here, pages), and each level above it describes
541 * pairs of units from the levels below, hence, "buddies". 541 * pairs of units from the levels below, hence, "buddies".
542 * At a high level, all that happens here is marking the table entry 542 * At a high level, all that happens here is marking the table entry
543 * at the bottom level available, and propagating the changes upward 543 * at the bottom level available, and propagating the changes upward
544 * as necessary, plus some accounting needed to play nicely with other 544 * as necessary, plus some accounting needed to play nicely with other
545 * parts of the VM system. 545 * parts of the VM system.
546 * At each level, we keep a list of pages, which are heads of continuous 546 * At each level, we keep a list of pages, which are heads of continuous
547 * free pages of length of (1 << order) and marked with _mapcount 547 * free pages of length of (1 << order) and marked with _mapcount
548 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 548 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
549 * field. 549 * field.
550 * So when we are allocating or freeing one, we can derive the state of the 550 * So when we are allocating or freeing one, we can derive the state of the
551 * other. That is, if we allocate a small block, and both were 551 * other. That is, if we allocate a small block, and both were
552 * free, the remainder of the region must be split into blocks. 552 * free, the remainder of the region must be split into blocks.
553 * If a block is freed, and its buddy is also free, then this 553 * If a block is freed, and its buddy is also free, then this
554 * triggers coalescing into a block of larger size. 554 * triggers coalescing into a block of larger size.
555 * 555 *
556 * -- nyc 556 * -- nyc
557 */ 557 */
558 558
559 static inline void __free_one_page(struct page *page, 559 static inline void __free_one_page(struct page *page,
560 unsigned long pfn, 560 unsigned long pfn,
561 struct zone *zone, unsigned int order, 561 struct zone *zone, unsigned int order,
562 int migratetype) 562 int migratetype)
563 { 563 {
564 unsigned long page_idx; 564 unsigned long page_idx;
565 unsigned long combined_idx; 565 unsigned long combined_idx;
566 unsigned long uninitialized_var(buddy_idx); 566 unsigned long uninitialized_var(buddy_idx);
567 struct page *buddy; 567 struct page *buddy;
568 568
569 VM_BUG_ON(!zone_is_initialized(zone)); 569 VM_BUG_ON(!zone_is_initialized(zone));
570 570
571 if (unlikely(PageCompound(page))) 571 if (unlikely(PageCompound(page)))
572 if (unlikely(destroy_compound_page(page, order))) 572 if (unlikely(destroy_compound_page(page, order)))
573 return; 573 return;
574 574
575 VM_BUG_ON(migratetype == -1); 575 VM_BUG_ON(migratetype == -1);
576 576
577 page_idx = pfn & ((1 << MAX_ORDER) - 1); 577 page_idx = pfn & ((1 << MAX_ORDER) - 1);
578 578
579 VM_BUG_ON(page_idx & ((1 << order) - 1)); 579 VM_BUG_ON(page_idx & ((1 << order) - 1));
580 VM_BUG_ON(bad_range(zone, page)); 580 VM_BUG_ON(bad_range(zone, page));
581 581
582 while (order < MAX_ORDER-1) { 582 while (order < MAX_ORDER-1) {
583 buddy_idx = __find_buddy_index(page_idx, order); 583 buddy_idx = __find_buddy_index(page_idx, order);
584 buddy = page + (buddy_idx - page_idx); 584 buddy = page + (buddy_idx - page_idx);
585 if (!page_is_buddy(page, buddy, order)) 585 if (!page_is_buddy(page, buddy, order))
586 break; 586 break;
587 /* 587 /*
588 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 588 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
589 * merge with it and move up one order. 589 * merge with it and move up one order.
590 */ 590 */
591 if (page_is_guard(buddy)) { 591 if (page_is_guard(buddy)) {
592 clear_page_guard_flag(buddy); 592 clear_page_guard_flag(buddy);
593 set_page_private(page, 0); 593 set_page_private(page, 0);
594 __mod_zone_freepage_state(zone, 1 << order, 594 __mod_zone_freepage_state(zone, 1 << order,
595 migratetype); 595 migratetype);
596 } else { 596 } else {
597 list_del(&buddy->lru); 597 list_del(&buddy->lru);
598 zone->free_area[order].nr_free--; 598 zone->free_area[order].nr_free--;
599 rmv_page_order(buddy); 599 rmv_page_order(buddy);
600 } 600 }
601 combined_idx = buddy_idx & page_idx; 601 combined_idx = buddy_idx & page_idx;
602 page = page + (combined_idx - page_idx); 602 page = page + (combined_idx - page_idx);
603 page_idx = combined_idx; 603 page_idx = combined_idx;
604 order++; 604 order++;
605 } 605 }
606 set_page_order(page, order); 606 set_page_order(page, order);
607 607
608 /* 608 /*
609 * If this is not the largest possible page, check if the buddy 609 * If this is not the largest possible page, check if the buddy
610 * of the next-highest order is free. If it is, it's possible 610 * of the next-highest order is free. If it is, it's possible
611 * that pages are being freed that will coalesce soon. In case, 611 * that pages are being freed that will coalesce soon. In case,
612 * that is happening, add the free page to the tail of the list 612 * that is happening, add the free page to the tail of the list
613 * so it's less likely to be used soon and more likely to be merged 613 * so it's less likely to be used soon and more likely to be merged
614 * as a higher order page 614 * as a higher order page
615 */ 615 */
616 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 616 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
617 struct page *higher_page, *higher_buddy; 617 struct page *higher_page, *higher_buddy;
618 combined_idx = buddy_idx & page_idx; 618 combined_idx = buddy_idx & page_idx;
619 higher_page = page + (combined_idx - page_idx); 619 higher_page = page + (combined_idx - page_idx);
620 buddy_idx = __find_buddy_index(combined_idx, order + 1); 620 buddy_idx = __find_buddy_index(combined_idx, order + 1);
621 higher_buddy = higher_page + (buddy_idx - combined_idx); 621 higher_buddy = higher_page + (buddy_idx - combined_idx);
622 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 622 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
623 list_add_tail(&page->lru, 623 list_add_tail(&page->lru,
624 &zone->free_area[order].free_list[migratetype]); 624 &zone->free_area[order].free_list[migratetype]);
625 goto out; 625 goto out;
626 } 626 }
627 } 627 }
628 628
629 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 629 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
630 out: 630 out:
631 zone->free_area[order].nr_free++; 631 zone->free_area[order].nr_free++;
632 } 632 }
633 633
634 static inline int free_pages_check(struct page *page) 634 static inline int free_pages_check(struct page *page)
635 { 635 {
636 if (unlikely(page_mapcount(page) | 636 if (unlikely(page_mapcount(page) |
637 (page->mapping != NULL) | 637 (page->mapping != NULL) |
638 (atomic_read(&page->_count) != 0) | 638 (atomic_read(&page->_count) != 0) |
639 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 639 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
640 (mem_cgroup_bad_page_check(page)))) { 640 (mem_cgroup_bad_page_check(page)))) {
641 bad_page(page); 641 bad_page(page);
642 return 1; 642 return 1;
643 } 643 }
644 page_nid_reset_last(page); 644 page_nid_reset_last(page);
645 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 645 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
646 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 646 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
647 return 0; 647 return 0;
648 } 648 }
649 649
650 /* 650 /*
651 * Frees a number of pages from the PCP lists 651 * Frees a number of pages from the PCP lists
652 * Assumes all pages on list are in same zone, and of same order. 652 * Assumes all pages on list are in same zone, and of same order.
653 * count is the number of pages to free. 653 * count is the number of pages to free.
654 * 654 *
655 * If the zone was previously in an "all pages pinned" state then look to 655 * If the zone was previously in an "all pages pinned" state then look to
656 * see if this freeing clears that state. 656 * see if this freeing clears that state.
657 * 657 *
658 * And clear the zone's pages_scanned counter, to hold off the "all pages are 658 * And clear the zone's pages_scanned counter, to hold off the "all pages are
659 * pinned" detection logic. 659 * pinned" detection logic.
660 */ 660 */
661 static void free_pcppages_bulk(struct zone *zone, int count, 661 static void free_pcppages_bulk(struct zone *zone, int count,
662 struct per_cpu_pages *pcp) 662 struct per_cpu_pages *pcp)
663 { 663 {
664 int migratetype = 0; 664 int migratetype = 0;
665 int batch_free = 0; 665 int batch_free = 0;
666 int to_free = count; 666 int to_free = count;
667 unsigned long nr_scanned; 667 unsigned long nr_scanned;
668 668
669 spin_lock(&zone->lock); 669 spin_lock(&zone->lock);
670 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); 670 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
671 if (nr_scanned) 671 if (nr_scanned)
672 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); 672 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
673 673
674 while (to_free) { 674 while (to_free) {
675 struct page *page; 675 struct page *page;
676 struct list_head *list; 676 struct list_head *list;
677 677
678 /* 678 /*
679 * Remove pages from lists in a round-robin fashion. A 679 * Remove pages from lists in a round-robin fashion. A
680 * batch_free count is maintained that is incremented when an 680 * batch_free count is maintained that is incremented when an
681 * empty list is encountered. This is so more pages are freed 681 * empty list is encountered. This is so more pages are freed
682 * off fuller lists instead of spinning excessively around empty 682 * off fuller lists instead of spinning excessively around empty
683 * lists 683 * lists
684 */ 684 */
685 do { 685 do {
686 batch_free++; 686 batch_free++;
687 if (++migratetype == MIGRATE_PCPTYPES) 687 if (++migratetype == MIGRATE_PCPTYPES)
688 migratetype = 0; 688 migratetype = 0;
689 list = &pcp->lists[migratetype]; 689 list = &pcp->lists[migratetype];
690 } while (list_empty(list)); 690 } while (list_empty(list));
691 691
692 /* This is the only non-empty list. Free them all. */ 692 /* This is the only non-empty list. Free them all. */
693 if (batch_free == MIGRATE_PCPTYPES) 693 if (batch_free == MIGRATE_PCPTYPES)
694 batch_free = to_free; 694 batch_free = to_free;
695 695
696 do { 696 do {
697 int mt; /* migratetype of the to-be-freed page */ 697 int mt; /* migratetype of the to-be-freed page */
698 698
699 page = list_entry(list->prev, struct page, lru); 699 page = list_entry(list->prev, struct page, lru);
700 /* must delete as __free_one_page list manipulates */ 700 /* must delete as __free_one_page list manipulates */
701 list_del(&page->lru); 701 list_del(&page->lru);
702 mt = get_freepage_migratetype(page); 702 mt = get_freepage_migratetype(page);
703 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 703 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
704 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 704 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
705 trace_mm_page_pcpu_drain(page, 0, mt); 705 trace_mm_page_pcpu_drain(page, 0, mt);
706 if (likely(!is_migrate_isolate_page(page))) { 706 if (likely(!is_migrate_isolate_page(page))) {
707 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 707 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
708 if (is_migrate_cma(mt)) 708 if (is_migrate_cma(mt))
709 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 709 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
710 } 710 }
711 } while (--to_free && --batch_free && !list_empty(list)); 711 } while (--to_free && --batch_free && !list_empty(list));
712 } 712 }
713 spin_unlock(&zone->lock); 713 spin_unlock(&zone->lock);
714 } 714 }
715 715
716 static void free_one_page(struct zone *zone, 716 static void free_one_page(struct zone *zone,
717 struct page *page, unsigned long pfn, 717 struct page *page, unsigned long pfn,
718 unsigned int order, 718 unsigned int order,
719 int migratetype) 719 int migratetype)
720 { 720 {
721 unsigned long nr_scanned; 721 unsigned long nr_scanned;
722 spin_lock(&zone->lock); 722 spin_lock(&zone->lock);
723 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); 723 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
724 if (nr_scanned) 724 if (nr_scanned)
725 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); 725 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
726 726
727 __free_one_page(page, pfn, zone, order, migratetype); 727 __free_one_page(page, pfn, zone, order, migratetype);
728 if (unlikely(!is_migrate_isolate(migratetype))) 728 if (unlikely(!is_migrate_isolate(migratetype)))
729 __mod_zone_freepage_state(zone, 1 << order, migratetype); 729 __mod_zone_freepage_state(zone, 1 << order, migratetype);
730 spin_unlock(&zone->lock); 730 spin_unlock(&zone->lock);
731 } 731 }
732 732
733 static bool free_pages_prepare(struct page *page, unsigned int order) 733 static bool free_pages_prepare(struct page *page, unsigned int order)
734 { 734 {
735 int i; 735 int i;
736 int bad = 0; 736 int bad = 0;
737 737
738 trace_mm_page_free(page, order); 738 trace_mm_page_free(page, order);
739 kmemcheck_free_shadow(page, order); 739 kmemcheck_free_shadow(page, order);
740 740
741 if (PageAnon(page)) 741 if (PageAnon(page))
742 page->mapping = NULL; 742 page->mapping = NULL;
743 for (i = 0; i < (1 << order); i++) 743 for (i = 0; i < (1 << order); i++)
744 bad += free_pages_check(page + i); 744 bad += free_pages_check(page + i);
745 if (bad) 745 if (bad)
746 return false; 746 return false;
747 747
748 if (!PageHighMem(page)) { 748 if (!PageHighMem(page)) {
749 debug_check_no_locks_freed(page_address(page), 749 debug_check_no_locks_freed(page_address(page),
750 PAGE_SIZE << order); 750 PAGE_SIZE << order);
751 debug_check_no_obj_freed(page_address(page), 751 debug_check_no_obj_freed(page_address(page),
752 PAGE_SIZE << order); 752 PAGE_SIZE << order);
753 } 753 }
754 arch_free_page(page, order); 754 arch_free_page(page, order);
755 kernel_map_pages(page, 1 << order, 0); 755 kernel_map_pages(page, 1 << order, 0);
756 756
757 return true; 757 return true;
758 } 758 }
759 759
760 static void __free_pages_ok(struct page *page, unsigned int order) 760 static void __free_pages_ok(struct page *page, unsigned int order)
761 { 761 {
762 unsigned long flags; 762 unsigned long flags;
763 int migratetype; 763 int migratetype;
764 unsigned long pfn = page_to_pfn(page); 764 unsigned long pfn = page_to_pfn(page);
765 765
766 if (!free_pages_prepare(page, order)) 766 if (!free_pages_prepare(page, order))
767 return; 767 return;
768 768
769 migratetype = get_pfnblock_migratetype(page, pfn); 769 migratetype = get_pfnblock_migratetype(page, pfn);
770 local_irq_save(flags); 770 local_irq_save(flags);
771 __count_vm_events(PGFREE, 1 << order); 771 __count_vm_events(PGFREE, 1 << order);
772 set_freepage_migratetype(page, migratetype); 772 set_freepage_migratetype(page, migratetype);
773 free_one_page(page_zone(page), page, pfn, order, migratetype); 773 free_one_page(page_zone(page), page, pfn, order, migratetype);
774 local_irq_restore(flags); 774 local_irq_restore(flags);
775 } 775 }
776 776
777 void __init __free_pages_bootmem(struct page *page, unsigned int order) 777 void __init __free_pages_bootmem(struct page *page, unsigned int order)
778 { 778 {
779 unsigned int nr_pages = 1 << order; 779 unsigned int nr_pages = 1 << order;
780 struct page *p = page; 780 struct page *p = page;
781 unsigned int loop; 781 unsigned int loop;
782 782
783 prefetchw(p); 783 prefetchw(p);
784 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 784 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
785 prefetchw(p + 1); 785 prefetchw(p + 1);
786 __ClearPageReserved(p); 786 __ClearPageReserved(p);
787 set_page_count(p, 0); 787 set_page_count(p, 0);
788 } 788 }
789 __ClearPageReserved(p); 789 __ClearPageReserved(p);
790 set_page_count(p, 0); 790 set_page_count(p, 0);
791 791
792 page_zone(page)->managed_pages += nr_pages; 792 page_zone(page)->managed_pages += nr_pages;
793 set_page_refcounted(page); 793 set_page_refcounted(page);
794 __free_pages(page, order); 794 __free_pages(page, order);
795 } 795 }
796 796
797 #ifdef CONFIG_CMA 797 #ifdef CONFIG_CMA
798 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 798 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
799 void __init init_cma_reserved_pageblock(struct page *page) 799 void __init init_cma_reserved_pageblock(struct page *page)
800 { 800 {
801 unsigned i = pageblock_nr_pages; 801 unsigned i = pageblock_nr_pages;
802 struct page *p = page; 802 struct page *p = page;
803 803
804 do { 804 do {
805 __ClearPageReserved(p); 805 __ClearPageReserved(p);
806 set_page_count(p, 0); 806 set_page_count(p, 0);
807 } while (++p, --i); 807 } while (++p, --i);
808 808
809 set_pageblock_migratetype(page, MIGRATE_CMA); 809 set_pageblock_migratetype(page, MIGRATE_CMA);
810 810
811 if (pageblock_order >= MAX_ORDER) { 811 if (pageblock_order >= MAX_ORDER) {
812 i = pageblock_nr_pages; 812 i = pageblock_nr_pages;
813 p = page; 813 p = page;
814 do { 814 do {
815 set_page_refcounted(p); 815 set_page_refcounted(p);
816 __free_pages(p, MAX_ORDER - 1); 816 __free_pages(p, MAX_ORDER - 1);
817 p += MAX_ORDER_NR_PAGES; 817 p += MAX_ORDER_NR_PAGES;
818 } while (i -= MAX_ORDER_NR_PAGES); 818 } while (i -= MAX_ORDER_NR_PAGES);
819 } else { 819 } else {
820 set_page_refcounted(page); 820 set_page_refcounted(page);
821 __free_pages(page, pageblock_order); 821 __free_pages(page, pageblock_order);
822 } 822 }
823 823
824 adjust_managed_page_count(page, pageblock_nr_pages); 824 adjust_managed_page_count(page, pageblock_nr_pages);
825 } 825 }
826 #endif 826 #endif
827 827
828 /* 828 /*
829 * The order of subdivision here is critical for the IO subsystem. 829 * The order of subdivision here is critical for the IO subsystem.
830 * Please do not alter this order without good reasons and regression 830 * Please do not alter this order without good reasons and regression
831 * testing. Specifically, as large blocks of memory are subdivided, 831 * testing. Specifically, as large blocks of memory are subdivided,
832 * the order in which smaller blocks are delivered depends on the order 832 * the order in which smaller blocks are delivered depends on the order
833 * they're subdivided in this function. This is the primary factor 833 * they're subdivided in this function. This is the primary factor
834 * influencing the order in which pages are delivered to the IO 834 * influencing the order in which pages are delivered to the IO
835 * subsystem according to empirical testing, and this is also justified 835 * subsystem according to empirical testing, and this is also justified
836 * by considering the behavior of a buddy system containing a single 836 * by considering the behavior of a buddy system containing a single
837 * large block of memory acted on by a series of small allocations. 837 * large block of memory acted on by a series of small allocations.
838 * This behavior is a critical factor in sglist merging's success. 838 * This behavior is a critical factor in sglist merging's success.
839 * 839 *
840 * -- nyc 840 * -- nyc
841 */ 841 */
842 static inline void expand(struct zone *zone, struct page *page, 842 static inline void expand(struct zone *zone, struct page *page,
843 int low, int high, struct free_area *area, 843 int low, int high, struct free_area *area,
844 int migratetype) 844 int migratetype)
845 { 845 {
846 unsigned long size = 1 << high; 846 unsigned long size = 1 << high;
847 847
848 while (high > low) { 848 while (high > low) {
849 area--; 849 area--;
850 high--; 850 high--;
851 size >>= 1; 851 size >>= 1;
852 VM_BUG_ON(bad_range(zone, &page[size])); 852 VM_BUG_ON(bad_range(zone, &page[size]));
853 853
854 #ifdef CONFIG_DEBUG_PAGEALLOC 854 #ifdef CONFIG_DEBUG_PAGEALLOC
855 if (high < debug_guardpage_minorder()) { 855 if (high < debug_guardpage_minorder()) {
856 /* 856 /*
857 * Mark as guard pages (or page), that will allow to 857 * Mark as guard pages (or page), that will allow to
858 * merge back to allocator when buddy will be freed. 858 * merge back to allocator when buddy will be freed.
859 * Corresponding page table entries will not be touched, 859 * Corresponding page table entries will not be touched,
860 * pages will stay not present in virtual address space 860 * pages will stay not present in virtual address space
861 */ 861 */
862 INIT_LIST_HEAD(&page[size].lru); 862 INIT_LIST_HEAD(&page[size].lru);
863 set_page_guard_flag(&page[size]); 863 set_page_guard_flag(&page[size]);
864 set_page_private(&page[size], high); 864 set_page_private(&page[size], high);
865 /* Guard pages are not available for any usage */ 865 /* Guard pages are not available for any usage */
866 __mod_zone_freepage_state(zone, -(1 << high), 866 __mod_zone_freepage_state(zone, -(1 << high),
867 migratetype); 867 migratetype);
868 continue; 868 continue;
869 } 869 }
870 #endif 870 #endif
871 list_add(&page[size].lru, &area->free_list[migratetype]); 871 list_add(&page[size].lru, &area->free_list[migratetype]);
872 area->nr_free++; 872 area->nr_free++;
873 set_page_order(&page[size], high); 873 set_page_order(&page[size], high);
874 } 874 }
875 } 875 }
876 876
877 /* 877 /*
878 * This page is about to be returned from the page allocator 878 * This page is about to be returned from the page allocator
879 */ 879 */
880 static inline int check_new_page(struct page *page) 880 static inline int check_new_page(struct page *page)
881 { 881 {
882 if (unlikely(page_mapcount(page) | 882 if (unlikely(page_mapcount(page) |
883 (page->mapping != NULL) | 883 (page->mapping != NULL) |
884 (atomic_read(&page->_count) != 0) | 884 (atomic_read(&page->_count) != 0) |
885 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 885 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
886 (mem_cgroup_bad_page_check(page)))) { 886 (mem_cgroup_bad_page_check(page)))) {
887 bad_page(page); 887 bad_page(page);
888 return 1; 888 return 1;
889 } 889 }
890 return 0; 890 return 0;
891 } 891 }
892 892
893 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) 893 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
894 { 894 {
895 int i; 895 int i;
896 896
897 for (i = 0; i < (1 << order); i++) { 897 for (i = 0; i < (1 << order); i++) {
898 struct page *p = page + i; 898 struct page *p = page + i;
899 if (unlikely(check_new_page(p))) 899 if (unlikely(check_new_page(p)))
900 return 1; 900 return 1;
901 } 901 }
902 902
903 set_page_private(page, 0); 903 set_page_private(page, 0);
904 set_page_refcounted(page); 904 set_page_refcounted(page);
905 905
906 arch_alloc_page(page, order); 906 arch_alloc_page(page, order);
907 kernel_map_pages(page, 1 << order, 1); 907 kernel_map_pages(page, 1 << order, 1);
908 908
909 if (gfp_flags & __GFP_ZERO) 909 if (gfp_flags & __GFP_ZERO)
910 prep_zero_page(page, order, gfp_flags); 910 prep_zero_page(page, order, gfp_flags);
911 911
912 if (order && (gfp_flags & __GFP_COMP)) 912 if (order && (gfp_flags & __GFP_COMP))
913 prep_compound_page(page, order); 913 prep_compound_page(page, order);
914 914
915 return 0; 915 return 0;
916 } 916 }
917 917
918 /* 918 /*
919 * Go through the free lists for the given migratetype and remove 919 * Go through the free lists for the given migratetype and remove
920 * the smallest available page from the freelists 920 * the smallest available page from the freelists
921 */ 921 */
922 static inline 922 static inline
923 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 923 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
924 int migratetype) 924 int migratetype)
925 { 925 {
926 unsigned int current_order; 926 unsigned int current_order;
927 struct free_area *area; 927 struct free_area *area;
928 struct page *page; 928 struct page *page;
929 929
930 /* Find a page of the appropriate size in the preferred list */ 930 /* Find a page of the appropriate size in the preferred list */
931 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 931 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
932 area = &(zone->free_area[current_order]); 932 area = &(zone->free_area[current_order]);
933 if (list_empty(&area->free_list[migratetype])) 933 if (list_empty(&area->free_list[migratetype]))
934 continue; 934 continue;
935 935
936 page = list_entry(area->free_list[migratetype].next, 936 page = list_entry(area->free_list[migratetype].next,
937 struct page, lru); 937 struct page, lru);
938 list_del(&page->lru); 938 list_del(&page->lru);
939 rmv_page_order(page); 939 rmv_page_order(page);
940 area->nr_free--; 940 area->nr_free--;
941 expand(zone, page, order, current_order, area, migratetype); 941 expand(zone, page, order, current_order, area, migratetype);
942 set_freepage_migratetype(page, migratetype); 942 set_freepage_migratetype(page, migratetype);
943 return page; 943 return page;
944 } 944 }
945 945
946 return NULL; 946 return NULL;
947 } 947 }
948 948
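/*
 * Worked example (editorial): an order-2 request served from an order-4
 * free block. expand(), called by __rmqueue_smallest() above, peels off
 * the upper half of the block at each step until the remainder matches
 * the requested order.
 */
static void example_buddy_split(void)
{
	unsigned int high = 4, low = 2;
	unsigned long size = 1UL << high;	/* 16 pages to start with */

	while (high > low) {
		high--;
		size >>= 1;
		/*
		 * Pages [size .. 2*size-1] of the block go back to
		 * free_area[high]: first an order-3 buddy (8 pages),
		 * then an order-2 buddy (4 pages).
		 */
	}
	/* the remaining 1 << low = 4 pages are handed to the caller */
}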
949 949
950 /* 950 /*
951 * This array describes the order in which free lists are fallen back on 951 * This array describes the order in which free lists are fallen back on
952 * when the free lists for the desired migrate type are depleted 952 * when the free lists for the desired migrate type are depleted
953 */ 953 */
954 static int fallbacks[MIGRATE_TYPES][4] = { 954 static int fallbacks[MIGRATE_TYPES][4] = {
955 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 955 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
956 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 956 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
957 #ifdef CONFIG_CMA 957 #ifdef CONFIG_CMA
958 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 958 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
959 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 959 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
960 #else 960 #else
961 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 961 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
962 #endif 962 #endif
963 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 963 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
964 #ifdef CONFIG_MEMORY_ISOLATION 964 #ifdef CONFIG_MEMORY_ISOLATION
965 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 965 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
966 #endif 966 #endif
967 }; 967 };
968 968
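/*
 * Editorial illustration: the order in which __rmqueue_fallback() below
 * consults the table above for a MIGRATE_UNMOVABLE request. The walk
 * stops at MIGRATE_RESERVE, which is handled separately by __rmqueue().
 */
static void example_fallback_walk(void)
{
	int i, migratetype;

	for (i = 0;; i++) {
		migratetype = fallbacks[MIGRATE_UNMOVABLE][i];
		if (migratetype == MIGRATE_RESERVE)
			break;	/* tried MIGRATE_RECLAIMABLE, then MIGRATE_MOVABLE */
		/* ... search free_area[order].free_list[migratetype] ... */
	}
}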
969 /* 969 /*
970 * Move the free pages in a range to the free lists of the requested type. 970 * Move the free pages in a range to the free lists of the requested type.
971 * Note that start_page and end_page are not aligned on a pageblock 971 * Note that start_page and end_page are not aligned on a pageblock
972 * boundary. If alignment is required, use move_freepages_block() 972 * boundary. If alignment is required, use move_freepages_block()
973 */ 973 */
974 int move_freepages(struct zone *zone, 974 int move_freepages(struct zone *zone,
975 struct page *start_page, struct page *end_page, 975 struct page *start_page, struct page *end_page,
976 int migratetype) 976 int migratetype)
977 { 977 {
978 struct page *page; 978 struct page *page;
979 unsigned long order; 979 unsigned long order;
980 int pages_moved = 0; 980 int pages_moved = 0;
981 981
982 #ifndef CONFIG_HOLES_IN_ZONE 982 #ifndef CONFIG_HOLES_IN_ZONE
983 /* 983 /*
984 * page_zone is not safe to call in this context when 984 * page_zone is not safe to call in this context when
985 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 985 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
986 * anyway as we check zone boundaries in move_freepages_block(). 986 * anyway as we check zone boundaries in move_freepages_block().
987 * Remove at a later date when no bug reports exist related to 987 * Remove at a later date when no bug reports exist related to
988 * grouping pages by mobility 988 * grouping pages by mobility
989 */ 989 */
990 BUG_ON(page_zone(start_page) != page_zone(end_page)); 990 BUG_ON(page_zone(start_page) != page_zone(end_page));
991 #endif 991 #endif
992 992
993 for (page = start_page; page <= end_page;) { 993 for (page = start_page; page <= end_page;) {
994 /* Make sure we are not inadvertently changing nodes */ 994 /* Make sure we are not inadvertently changing nodes */
995 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 995 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
996 996
997 if (!pfn_valid_within(page_to_pfn(page))) { 997 if (!pfn_valid_within(page_to_pfn(page))) {
998 page++; 998 page++;
999 continue; 999 continue;
1000 } 1000 }
1001 1001
1002 if (!PageBuddy(page)) { 1002 if (!PageBuddy(page)) {
1003 page++; 1003 page++;
1004 continue; 1004 continue;
1005 } 1005 }
1006 1006
1007 order = page_order(page); 1007 order = page_order(page);
1008 list_move(&page->lru, 1008 list_move(&page->lru,
1009 &zone->free_area[order].free_list[migratetype]); 1009 &zone->free_area[order].free_list[migratetype]);
1010 set_freepage_migratetype(page, migratetype); 1010 set_freepage_migratetype(page, migratetype);
1011 page += 1 << order; 1011 page += 1 << order;
1012 pages_moved += 1 << order; 1012 pages_moved += 1 << order;
1013 } 1013 }
1014 1014
1015 return pages_moved; 1015 return pages_moved;
1016 } 1016 }
1017 1017
1018 int move_freepages_block(struct zone *zone, struct page *page, 1018 int move_freepages_block(struct zone *zone, struct page *page,
1019 int migratetype) 1019 int migratetype)
1020 { 1020 {
1021 unsigned long start_pfn, end_pfn; 1021 unsigned long start_pfn, end_pfn;
1022 struct page *start_page, *end_page; 1022 struct page *start_page, *end_page;
1023 1023
1024 start_pfn = page_to_pfn(page); 1024 start_pfn = page_to_pfn(page);
1025 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1025 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1026 start_page = pfn_to_page(start_pfn); 1026 start_page = pfn_to_page(start_pfn);
1027 end_page = start_page + pageblock_nr_pages - 1; 1027 end_page = start_page + pageblock_nr_pages - 1;
1028 end_pfn = start_pfn + pageblock_nr_pages - 1; 1028 end_pfn = start_pfn + pageblock_nr_pages - 1;
1029 1029
1030 /* Do not cross zone boundaries */ 1030 /* Do not cross zone boundaries */
1031 if (!zone_spans_pfn(zone, start_pfn)) 1031 if (!zone_spans_pfn(zone, start_pfn))
1032 start_page = page; 1032 start_page = page;
1033 if (!zone_spans_pfn(zone, end_pfn)) 1033 if (!zone_spans_pfn(zone, end_pfn))
1034 return 0; 1034 return 0;
1035 1035
1036 return move_freepages(zone, start_page, end_page, migratetype); 1036 return move_freepages(zone, start_page, end_page, migratetype);
1037 } 1037 }
1038 1038
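/*
 * Worked example (editorial, assuming pageblock_nr_pages == 512, i.e.
 * 2MB pageblocks with 4K pages): how move_freepages_block() above
 * derives a pageblock-aligned range from an arbitrary page.
 */
static void example_pageblock_align(void)
{
	unsigned long nr = 512;				/* pageblock_nr_pages */
	unsigned long pfn = 1000;
	unsigned long start_pfn = pfn & ~(nr - 1);	/* 512 */
	unsigned long end_pfn = start_pfn + nr - 1;	/* 1023 */

	(void)start_pfn;
	(void)end_pfn;
}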
1039 static void change_pageblock_range(struct page *pageblock_page, 1039 static void change_pageblock_range(struct page *pageblock_page,
1040 int start_order, int migratetype) 1040 int start_order, int migratetype)
1041 { 1041 {
1042 int nr_pageblocks = 1 << (start_order - pageblock_order); 1042 int nr_pageblocks = 1 << (start_order - pageblock_order);
1043 1043
1044 while (nr_pageblocks--) { 1044 while (nr_pageblocks--) {
1045 set_pageblock_migratetype(pageblock_page, migratetype); 1045 set_pageblock_migratetype(pageblock_page, migratetype);
1046 pageblock_page += pageblock_nr_pages; 1046 pageblock_page += pageblock_nr_pages;
1047 } 1047 }
1048 } 1048 }
1049 1049
1050 /* 1050 /*
1051 * If breaking a large block of pages, move all free pages to the preferred 1051 * If breaking a large block of pages, move all free pages to the preferred
1052 * allocation list. If falling back for a reclaimable kernel allocation, be 1052 * allocation list. If falling back for a reclaimable kernel allocation, be
1053 * more aggressive about taking ownership of free pages. 1053 * more aggressive about taking ownership of free pages.
1054 * 1054 *
1055 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1055 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1056 * nor move CMA pages to different free lists. We don't want unmovable pages 1056 * nor move CMA pages to different free lists. We don't want unmovable pages
1057 * to be allocated from MIGRATE_CMA areas. 1057 * to be allocated from MIGRATE_CMA areas.
1058 * 1058 *
1059 * Returns the new migratetype of the pageblock (or the same old migratetype 1059 * Returns the new migratetype of the pageblock (or the same old migratetype
1060 * if it was unchanged). 1060 * if it was unchanged).
1061 */ 1061 */
1062 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1062 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1063 int start_type, int fallback_type) 1063 int start_type, int fallback_type)
1064 { 1064 {
1065 int current_order = page_order(page); 1065 int current_order = page_order(page);
1066 1066
1067 /* 1067 /*
1068 * When borrowing from MIGRATE_CMA, we need to release the excess 1068 * When borrowing from MIGRATE_CMA, we need to release the excess
1069 * buddy pages to CMA itself. We also ensure the freepage_migratetype 1069 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1070 * is set to CMA so it is returned to the correct freelist in case 1070 * is set to CMA so it is returned to the correct freelist in case
1071 * the page ends up not actually being allocated from the pcp lists. 1071 * the page ends up not actually being allocated from the pcp lists.
1072 */ 1072 */
1073 if (is_migrate_cma(fallback_type)) 1073 if (is_migrate_cma(fallback_type))
1074 return fallback_type; 1074 return fallback_type;
1075 1075
1076 /* Take ownership for orders >= pageblock_order */ 1076 /* Take ownership for orders >= pageblock_order */
1077 if (current_order >= pageblock_order) { 1077 if (current_order >= pageblock_order) {
1078 change_pageblock_range(page, current_order, start_type); 1078 change_pageblock_range(page, current_order, start_type);
1079 return start_type; 1079 return start_type;
1080 } 1080 }
1081 1081
1082 if (current_order >= pageblock_order / 2 || 1082 if (current_order >= pageblock_order / 2 ||
1083 start_type == MIGRATE_RECLAIMABLE || 1083 start_type == MIGRATE_RECLAIMABLE ||
1084 page_group_by_mobility_disabled) { 1084 page_group_by_mobility_disabled) {
1085 int pages; 1085 int pages;
1086 1086
1087 pages = move_freepages_block(zone, page, start_type); 1087 pages = move_freepages_block(zone, page, start_type);
1088 1088
1089 /* Claim the whole block if over half of it is free */ 1089 /* Claim the whole block if over half of it is free */
1090 if (pages >= (1 << (pageblock_order-1)) || 1090 if (pages >= (1 << (pageblock_order-1)) ||
1091 page_group_by_mobility_disabled) { 1091 page_group_by_mobility_disabled) {
1092 1092
1093 set_pageblock_migratetype(page, start_type); 1093 set_pageblock_migratetype(page, start_type);
1094 return start_type; 1094 return start_type;
1095 } 1095 }
1096 1096
1097 } 1097 }
1098 1098
1099 return fallback_type; 1099 return fallback_type;
1100 } 1100 }
1101 1101
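/*
 * Worked example (editorial, assuming pageblock_order == 9): the "over
 * half of it is free" test in try_to_steal_freepages() above. With a
 * 512-page pageblock, move_freepages_block() must have moved at least
 * 1 << (9 - 1) == 256 pages for the whole pageblock to change type.
 */
static void example_steal_threshold(void)
{
	int threshold = 1 << (9 - 1);	/* 256 pages */

	/* e.g. 300 moved pages >= 256, so the whole pageblock is claimed;
	 * 100 moved pages would leave the pageblock's type unchanged. */
	(void)threshold;
}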
1102 /* Remove an element from the buddy allocator from the fallback list */ 1102 /* Remove an element from the buddy allocator from the fallback list */
1103 static inline struct page * 1103 static inline struct page *
1104 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) 1104 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1105 { 1105 {
1106 struct free_area *area; 1106 struct free_area *area;
1107 unsigned int current_order; 1107 unsigned int current_order;
1108 struct page *page; 1108 struct page *page;
1109 int migratetype, new_type, i; 1109 int migratetype, new_type, i;
1110 1110
1111 /* Find the largest possible block of pages in the other list */ 1111 /* Find the largest possible block of pages in the other list */
1112 for (current_order = MAX_ORDER-1; 1112 for (current_order = MAX_ORDER-1;
1113 current_order >= order && current_order <= MAX_ORDER-1; 1113 current_order >= order && current_order <= MAX_ORDER-1;
1114 --current_order) { 1114 --current_order) {
1115 for (i = 0;; i++) { 1115 for (i = 0;; i++) {
1116 migratetype = fallbacks[start_migratetype][i]; 1116 migratetype = fallbacks[start_migratetype][i];
1117 1117
1118 /* MIGRATE_RESERVE handled later if necessary */ 1118 /* MIGRATE_RESERVE handled later if necessary */
1119 if (migratetype == MIGRATE_RESERVE) 1119 if (migratetype == MIGRATE_RESERVE)
1120 break; 1120 break;
1121 1121
1122 area = &(zone->free_area[current_order]); 1122 area = &(zone->free_area[current_order]);
1123 if (list_empty(&area->free_list[migratetype])) 1123 if (list_empty(&area->free_list[migratetype]))
1124 continue; 1124 continue;
1125 1125
1126 page = list_entry(area->free_list[migratetype].next, 1126 page = list_entry(area->free_list[migratetype].next,
1127 struct page, lru); 1127 struct page, lru);
1128 area->nr_free--; 1128 area->nr_free--;
1129 1129
1130 new_type = try_to_steal_freepages(zone, page, 1130 new_type = try_to_steal_freepages(zone, page,
1131 start_migratetype, 1131 start_migratetype,
1132 migratetype); 1132 migratetype);
1133 1133
1134 /* Remove the page from the freelists */ 1134 /* Remove the page from the freelists */
1135 list_del(&page->lru); 1135 list_del(&page->lru);
1136 rmv_page_order(page); 1136 rmv_page_order(page);
1137 1137
1138 expand(zone, page, order, current_order, area, 1138 expand(zone, page, order, current_order, area,
1139 new_type); 1139 new_type);
1140 /* The freepage_migratetype may differ from pageblock's 1140 /* The freepage_migratetype may differ from pageblock's
1141 * migratetype depending on the decisions in 1141 * migratetype depending on the decisions in
1142 * try_to_steal_freepages. This is OK as long as it does 1142 * try_to_steal_freepages. This is OK as long as it does
1143 * not differ for MIGRATE_CMA type. 1143 * not differ for MIGRATE_CMA type.
1144 */ 1144 */
1145 set_freepage_migratetype(page, new_type); 1145 set_freepage_migratetype(page, new_type);
1146 1146
1147 trace_mm_page_alloc_extfrag(page, order, current_order, 1147 trace_mm_page_alloc_extfrag(page, order, current_order,
1148 start_migratetype, migratetype, new_type); 1148 start_migratetype, migratetype, new_type);
1149 1149
1150 return page; 1150 return page;
1151 } 1151 }
1152 } 1152 }
1153 1153
1154 return NULL; 1154 return NULL;
1155 } 1155 }
1156 1156
1157 /* 1157 /*
1158 * Do the hard work of removing an element from the buddy allocator. 1158 * Do the hard work of removing an element from the buddy allocator.
1159 * Call me with the zone->lock already held. 1159 * Call me with the zone->lock already held.
1160 */ 1160 */
1161 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1161 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1162 int migratetype) 1162 int migratetype)
1163 { 1163 {
1164 struct page *page; 1164 struct page *page;
1165 1165
1166 retry_reserve: 1166 retry_reserve:
1167 page = __rmqueue_smallest(zone, order, migratetype); 1167 page = __rmqueue_smallest(zone, order, migratetype);
1168 1168
1169 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1169 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1170 page = __rmqueue_fallback(zone, order, migratetype); 1170 page = __rmqueue_fallback(zone, order, migratetype);
1171 1171
1172 /* 1172 /*
1173 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1173 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1174 * is used because __rmqueue_smallest is an inline function 1174 * is used because __rmqueue_smallest is an inline function
1175 * and we want just one call site 1175 * and we want just one call site
1176 */ 1176 */
1177 if (!page) { 1177 if (!page) {
1178 migratetype = MIGRATE_RESERVE; 1178 migratetype = MIGRATE_RESERVE;
1179 goto retry_reserve; 1179 goto retry_reserve;
1180 } 1180 }
1181 } 1181 }
1182 1182
1183 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1183 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1184 return page; 1184 return page;
1185 } 1185 }
1186 1186
1187 /* 1187 /*
1188 * Obtain a specified number of elements from the buddy allocator, all under 1188 * Obtain a specified number of elements from the buddy allocator, all under
1189 * a single hold of the lock, for efficiency. Add them to the supplied list. 1189 * a single hold of the lock, for efficiency. Add them to the supplied list.
1190 * Returns the number of new pages which were placed at *list. 1190 * Returns the number of new pages which were placed at *list.
1191 */ 1191 */
1192 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1192 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1193 unsigned long count, struct list_head *list, 1193 unsigned long count, struct list_head *list,
1194 int migratetype, bool cold) 1194 int migratetype, bool cold)
1195 { 1195 {
1196 int i; 1196 int i;
1197 1197
1198 spin_lock(&zone->lock); 1198 spin_lock(&zone->lock);
1199 for (i = 0; i < count; ++i) { 1199 for (i = 0; i < count; ++i) {
1200 struct page *page = __rmqueue(zone, order, migratetype); 1200 struct page *page = __rmqueue(zone, order, migratetype);
1201 if (unlikely(page == NULL)) 1201 if (unlikely(page == NULL))
1202 break; 1202 break;
1203 1203
1204 /* 1204 /*
1205 * Split buddy pages returned by expand() are received here 1205 * Split buddy pages returned by expand() are received here
1206 * in physical page order. The page is added to the caller's 1206 * in physical page order. The page is added to the caller's
1207 * list and the list head then moves forward. From the caller's 1207 * list and the list head then moves forward. From the caller's
1208 * perspective, the linked list is ordered by page number in 1208 * perspective, the linked list is ordered by page number in
1209 * some conditions. This is useful for IO devices that can 1209 * some conditions. This is useful for IO devices that can
1210 * merge IO requests if the physical pages are ordered 1210 * merge IO requests if the physical pages are ordered
1211 * properly. 1211 * properly.
1212 */ 1212 */
1213 if (likely(!cold)) 1213 if (likely(!cold))
1214 list_add(&page->lru, list); 1214 list_add(&page->lru, list);
1215 else 1215 else
1216 list_add_tail(&page->lru, list); 1216 list_add_tail(&page->lru, list);
1217 list = &page->lru; 1217 list = &page->lru;
1218 if (is_migrate_cma(get_freepage_migratetype(page))) 1218 if (is_migrate_cma(get_freepage_migratetype(page)))
1219 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1219 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1220 -(1 << order)); 1220 -(1 << order));
1221 } 1221 }
1222 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1222 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1223 spin_unlock(&zone->lock); 1223 spin_unlock(&zone->lock);
1224 return i; 1224 return i;
1225 } 1225 }
1226 1226
1227 #ifdef CONFIG_NUMA 1227 #ifdef CONFIG_NUMA
1228 /* 1228 /*
1229 * Called from the vmstat counter updater to drain pagesets of this 1229 * Called from the vmstat counter updater to drain pagesets of this
1230 * currently executing processor on remote nodes after they have 1230 * currently executing processor on remote nodes after they have
1231 * expired. 1231 * expired.
1232 * 1232 *
1233 * Note that this function must be called with the thread pinned to 1233 * Note that this function must be called with the thread pinned to
1234 * a single processor. 1234 * a single processor.
1235 */ 1235 */
1236 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1236 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1237 { 1237 {
1238 unsigned long flags; 1238 unsigned long flags;
1239 int to_drain; 1239 int to_drain;
1240 unsigned long batch; 1240 unsigned long batch;
1241 1241
1242 local_irq_save(flags); 1242 local_irq_save(flags);
1243 batch = ACCESS_ONCE(pcp->batch); 1243 batch = ACCESS_ONCE(pcp->batch);
1244 if (pcp->count >= batch) 1244 if (pcp->count >= batch)
1245 to_drain = batch; 1245 to_drain = batch;
1246 else 1246 else
1247 to_drain = pcp->count; 1247 to_drain = pcp->count;
1248 if (to_drain > 0) { 1248 if (to_drain > 0) {
1249 free_pcppages_bulk(zone, to_drain, pcp); 1249 free_pcppages_bulk(zone, to_drain, pcp);
1250 pcp->count -= to_drain; 1250 pcp->count -= to_drain;
1251 } 1251 }
1252 local_irq_restore(flags); 1252 local_irq_restore(flags);
1253 } 1253 }
1254 #endif 1254 #endif
1255 1255
1256 /* 1256 /*
1257 * Drain pages of the indicated processor. 1257 * Drain pages of the indicated processor.
1258 * 1258 *
1259 * The processor must either be the current processor and the 1259 * The processor must either be the current processor and the
1260 * thread pinned to the current processor or a processor that 1260 * thread pinned to the current processor or a processor that
1261 * is not online. 1261 * is not online.
1262 */ 1262 */
1263 static void drain_pages(unsigned int cpu) 1263 static void drain_pages(unsigned int cpu)
1264 { 1264 {
1265 unsigned long flags; 1265 unsigned long flags;
1266 struct zone *zone; 1266 struct zone *zone;
1267 1267
1268 for_each_populated_zone(zone) { 1268 for_each_populated_zone(zone) {
1269 struct per_cpu_pageset *pset; 1269 struct per_cpu_pageset *pset;
1270 struct per_cpu_pages *pcp; 1270 struct per_cpu_pages *pcp;
1271 1271
1272 local_irq_save(flags); 1272 local_irq_save(flags);
1273 pset = per_cpu_ptr(zone->pageset, cpu); 1273 pset = per_cpu_ptr(zone->pageset, cpu);
1274 1274
1275 pcp = &pset->pcp; 1275 pcp = &pset->pcp;
1276 if (pcp->count) { 1276 if (pcp->count) {
1277 free_pcppages_bulk(zone, pcp->count, pcp); 1277 free_pcppages_bulk(zone, pcp->count, pcp);
1278 pcp->count = 0; 1278 pcp->count = 0;
1279 } 1279 }
1280 local_irq_restore(flags); 1280 local_irq_restore(flags);
1281 } 1281 }
1282 } 1282 }
1283 1283
1284 /* 1284 /*
1285 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1285 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1286 */ 1286 */
1287 void drain_local_pages(void *arg) 1287 void drain_local_pages(void *arg)
1288 { 1288 {
1289 drain_pages(smp_processor_id()); 1289 drain_pages(smp_processor_id());
1290 } 1290 }
1291 1291
1292 /* 1292 /*
1293 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1293 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1294 * 1294 *
1295 * Note that this code is protected against sending an IPI to an offline 1295 * Note that this code is protected against sending an IPI to an offline
1296 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1296 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1297 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1297 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1298 * nothing keeps CPUs from showing up after we populated the cpumask and 1298 * nothing keeps CPUs from showing up after we populated the cpumask and
1299 * before the call to on_each_cpu_mask(). 1299 * before the call to on_each_cpu_mask().
1300 */ 1300 */
1301 void drain_all_pages(void) 1301 void drain_all_pages(void)
1302 { 1302 {
1303 int cpu; 1303 int cpu;
1304 struct per_cpu_pageset *pcp; 1304 struct per_cpu_pageset *pcp;
1305 struct zone *zone; 1305 struct zone *zone;
1306 1306
1307 /* 1307 /*
1308 * Allocate in the BSS so we won't require allocation in 1308 * Allocate in the BSS so we won't require allocation in
1309 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1309 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1310 */ 1310 */
1311 static cpumask_t cpus_with_pcps; 1311 static cpumask_t cpus_with_pcps;
1312 1312
1313 /* 1313 /*
1314 * We don't care about racing with a CPU hotplug event 1314 * We don't care about racing with a CPU hotplug event
1315 * as the offline notification will cause the notified 1315 * as the offline notification will cause the notified
1316 * CPU to drain that CPU's pcps, and on_each_cpu_mask 1316 * CPU to drain that CPU's pcps, and on_each_cpu_mask
1317 * disables preemption as part of its processing 1317 * disables preemption as part of its processing
1318 */ 1318 */
1319 for_each_online_cpu(cpu) { 1319 for_each_online_cpu(cpu) {
1320 bool has_pcps = false; 1320 bool has_pcps = false;
1321 for_each_populated_zone(zone) { 1321 for_each_populated_zone(zone) {
1322 pcp = per_cpu_ptr(zone->pageset, cpu); 1322 pcp = per_cpu_ptr(zone->pageset, cpu);
1323 if (pcp->pcp.count) { 1323 if (pcp->pcp.count) {
1324 has_pcps = true; 1324 has_pcps = true;
1325 break; 1325 break;
1326 } 1326 }
1327 } 1327 }
1328 if (has_pcps) 1328 if (has_pcps)
1329 cpumask_set_cpu(cpu, &cpus_with_pcps); 1329 cpumask_set_cpu(cpu, &cpus_with_pcps);
1330 else 1330 else
1331 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1331 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1332 } 1332 }
1333 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1333 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1334 } 1334 }
1335 1335
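/*
 * Editorial sketch of a typical caller pattern (not from this commit):
 * paths that are about to re-check free memory flush the per-cpu lists
 * first so that recently freed pages become visible to the buddy
 * allocator before watermarks are re-evaluated.
 */
static void example_flush_before_retry(void)
{
	drain_all_pages();	/* IPIs only the CPUs that hold pcp pages */
	/* ... retry the allocation or re-check zone watermarks ... */
}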
1336 #ifdef CONFIG_HIBERNATION 1336 #ifdef CONFIG_HIBERNATION
1337 1337
1338 void mark_free_pages(struct zone *zone) 1338 void mark_free_pages(struct zone *zone)
1339 { 1339 {
1340 unsigned long pfn, max_zone_pfn; 1340 unsigned long pfn, max_zone_pfn;
1341 unsigned long flags; 1341 unsigned long flags;
1342 unsigned int order, t; 1342 unsigned int order, t;
1343 struct list_head *curr; 1343 struct list_head *curr;
1344 1344
1345 if (zone_is_empty(zone)) 1345 if (zone_is_empty(zone))
1346 return; 1346 return;
1347 1347
1348 spin_lock_irqsave(&zone->lock, flags); 1348 spin_lock_irqsave(&zone->lock, flags);
1349 1349
1350 max_zone_pfn = zone_end_pfn(zone); 1350 max_zone_pfn = zone_end_pfn(zone);
1351 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1351 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1352 if (pfn_valid(pfn)) { 1352 if (pfn_valid(pfn)) {
1353 struct page *page = pfn_to_page(pfn); 1353 struct page *page = pfn_to_page(pfn);
1354 1354
1355 if (!swsusp_page_is_forbidden(page)) 1355 if (!swsusp_page_is_forbidden(page))
1356 swsusp_unset_page_free(page); 1356 swsusp_unset_page_free(page);
1357 } 1357 }
1358 1358
1359 for_each_migratetype_order(order, t) { 1359 for_each_migratetype_order(order, t) {
1360 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1360 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1361 unsigned long i; 1361 unsigned long i;
1362 1362
1363 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1363 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1364 for (i = 0; i < (1UL << order); i++) 1364 for (i = 0; i < (1UL << order); i++)
1365 swsusp_set_page_free(pfn_to_page(pfn + i)); 1365 swsusp_set_page_free(pfn_to_page(pfn + i));
1366 } 1366 }
1367 } 1367 }
1368 spin_unlock_irqrestore(&zone->lock, flags); 1368 spin_unlock_irqrestore(&zone->lock, flags);
1369 } 1369 }
1370 #endif /* CONFIG_PM */ 1370 #endif /* CONFIG_PM */
1371 1371
1372 /* 1372 /*
1373 * Free a 0-order page 1373 * Free a 0-order page
1374 * cold == true ? free a cold page : free a hot page 1374 * cold == true ? free a cold page : free a hot page
1375 */ 1375 */
1376 void free_hot_cold_page(struct page *page, bool cold) 1376 void free_hot_cold_page(struct page *page, bool cold)
1377 { 1377 {
1378 struct zone *zone = page_zone(page); 1378 struct zone *zone = page_zone(page);
1379 struct per_cpu_pages *pcp; 1379 struct per_cpu_pages *pcp;
1380 unsigned long flags; 1380 unsigned long flags;
1381 unsigned long pfn = page_to_pfn(page); 1381 unsigned long pfn = page_to_pfn(page);
1382 int migratetype; 1382 int migratetype;
1383 1383
1384 if (!free_pages_prepare(page, 0)) 1384 if (!free_pages_prepare(page, 0))
1385 return; 1385 return;
1386 1386
1387 migratetype = get_pfnblock_migratetype(page, pfn); 1387 migratetype = get_pfnblock_migratetype(page, pfn);
1388 set_freepage_migratetype(page, migratetype); 1388 set_freepage_migratetype(page, migratetype);
1389 local_irq_save(flags); 1389 local_irq_save(flags);
1390 __count_vm_event(PGFREE); 1390 __count_vm_event(PGFREE);
1391 1391
1392 /* 1392 /*
1393 * We only track unmovable, reclaimable and movable on pcp lists. 1393 * We only track unmovable, reclaimable and movable on pcp lists.
1394 * Free ISOLATE pages back to the allocator because they are being 1394 * Free ISOLATE pages back to the allocator because they are being
1395 * offlined but treat RESERVE as movable pages so we can get those 1395 * offlined but treat RESERVE as movable pages so we can get those
1396 * areas back if necessary. Otherwise, we may have to free 1396 * areas back if necessary. Otherwise, we may have to free
1397 * excessively into the page allocator 1397 * excessively into the page allocator
1398 */ 1398 */
1399 if (migratetype >= MIGRATE_PCPTYPES) { 1399 if (migratetype >= MIGRATE_PCPTYPES) {
1400 if (unlikely(is_migrate_isolate(migratetype))) { 1400 if (unlikely(is_migrate_isolate(migratetype))) {
1401 free_one_page(zone, page, pfn, 0, migratetype); 1401 free_one_page(zone, page, pfn, 0, migratetype);
1402 goto out; 1402 goto out;
1403 } 1403 }
1404 migratetype = MIGRATE_MOVABLE; 1404 migratetype = MIGRATE_MOVABLE;
1405 } 1405 }
1406 1406
1407 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1407 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1408 if (!cold) 1408 if (!cold)
1409 list_add(&page->lru, &pcp->lists[migratetype]); 1409 list_add(&page->lru, &pcp->lists[migratetype]);
1410 else 1410 else
1411 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1411 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1412 pcp->count++; 1412 pcp->count++;
1413 if (pcp->count >= pcp->high) { 1413 if (pcp->count >= pcp->high) {
1414 unsigned long batch = ACCESS_ONCE(pcp->batch); 1414 unsigned long batch = ACCESS_ONCE(pcp->batch);
1415 free_pcppages_bulk(zone, batch, pcp); 1415 free_pcppages_bulk(zone, batch, pcp);
1416 pcp->count -= batch; 1416 pcp->count -= batch;
1417 } 1417 }
1418 1418
1419 out: 1419 out:
1420 local_irq_restore(flags); 1420 local_irq_restore(flags);
1421 } 1421 }
1422 1422
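/*
 * Worked example (editorial; batch == 31 and high == 6 * batch == 186
 * are illustrative per-cpu defaults of this era): once pcp->count
 * reaches pcp->high, free_hot_cold_page() above returns one batch to
 * the buddy allocator rather than draining the whole list.
 */
static void example_pcp_trim(void)
{
	unsigned long count = 186, high = 186, batch = 31;

	if (count >= high)
		count -= batch;		/* 155 pages stay on the pcp list */
	(void)count;
}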
1423 /* 1423 /*
1424 * Free a list of 0-order pages 1424 * Free a list of 0-order pages
1425 */ 1425 */
1426 void free_hot_cold_page_list(struct list_head *list, bool cold) 1426 void free_hot_cold_page_list(struct list_head *list, bool cold)
1427 { 1427 {
1428 struct page *page, *next; 1428 struct page *page, *next;
1429 1429
1430 list_for_each_entry_safe(page, next, list, lru) { 1430 list_for_each_entry_safe(page, next, list, lru) {
1431 trace_mm_page_free_batched(page, cold); 1431 trace_mm_page_free_batched(page, cold);
1432 free_hot_cold_page(page, cold); 1432 free_hot_cold_page(page, cold);
1433 } 1433 }
1434 } 1434 }
1435 1435
1436 /* 1436 /*
1437 * split_page takes a non-compound higher-order page, and splits it into 1437 * split_page takes a non-compound higher-order page, and splits it into
1438 * n (1<<order) sub-pages: page[0..n-1] 1438 * n (1<<order) sub-pages: page[0..n-1]
1439 * Each sub-page must be freed individually. 1439 * Each sub-page must be freed individually.
1440 * 1440 *
1441 * Note: this is probably too low level an operation for use in drivers. 1441 * Note: this is probably too low level an operation for use in drivers.
1442 * Please consult with lkml before using this in your driver. 1442 * Please consult with lkml before using this in your driver.
1443 */ 1443 */
1444 void split_page(struct page *page, unsigned int order) 1444 void split_page(struct page *page, unsigned int order)
1445 { 1445 {
1446 int i; 1446 int i;
1447 1447
1448 VM_BUG_ON(PageCompound(page)); 1448 VM_BUG_ON(PageCompound(page));
1449 VM_BUG_ON(!page_count(page)); 1449 VM_BUG_ON(!page_count(page));
1450 1450
1451 #ifdef CONFIG_KMEMCHECK 1451 #ifdef CONFIG_KMEMCHECK
1452 /* 1452 /*
1453 * Split shadow pages too, because free(page[0]) would 1453 * Split shadow pages too, because free(page[0]) would
1454 * otherwise free the whole shadow. 1454 * otherwise free the whole shadow.
1455 */ 1455 */
1456 if (kmemcheck_page_is_tracked(page)) 1456 if (kmemcheck_page_is_tracked(page))
1457 split_page(virt_to_page(page[0].shadow), order); 1457 split_page(virt_to_page(page[0].shadow), order);
1458 #endif 1458 #endif
1459 1459
1460 for (i = 1; i < (1 << order); i++) 1460 for (i = 1; i < (1 << order); i++)
1461 set_page_refcounted(page + i); 1461 set_page_refcounted(page + i);
1462 } 1462 }
1463 EXPORT_SYMBOL_GPL(split_page); 1463 EXPORT_SYMBOL_GPL(split_page);
1464 1464
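/*
 * Editorial usage sketch: split_page() lets a caller hand out the
 * sub-pages of a non-compound higher-order allocation individually.
 */
static void example_split_and_free(void)
{
	unsigned int order = 2;
	struct page *page = alloc_pages(GFP_KERNEL, order);	/* no __GFP_COMP */
	int i;

	if (!page)
		return;

	split_page(page, order);		/* page[0..3] are now independent */
	for (i = 0; i < (1 << order); i++)
		__free_page(page + i);		/* each sub-page is freed on its own */
}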
1465 static int __isolate_free_page(struct page *page, unsigned int order) 1465 static int __isolate_free_page(struct page *page, unsigned int order)
1466 { 1466 {
1467 unsigned long watermark; 1467 unsigned long watermark;
1468 struct zone *zone; 1468 struct zone *zone;
1469 int mt; 1469 int mt;
1470 1470
1471 BUG_ON(!PageBuddy(page)); 1471 BUG_ON(!PageBuddy(page));
1472 1472
1473 zone = page_zone(page); 1473 zone = page_zone(page);
1474 mt = get_pageblock_migratetype(page); 1474 mt = get_pageblock_migratetype(page);
1475 1475
1476 if (!is_migrate_isolate(mt)) { 1476 if (!is_migrate_isolate(mt)) {
1477 /* Obey watermarks as if the page was being allocated */ 1477 /* Obey watermarks as if the page was being allocated */
1478 watermark = low_wmark_pages(zone) + (1 << order); 1478 watermark = low_wmark_pages(zone) + (1 << order);
1479 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1479 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1480 return 0; 1480 return 0;
1481 1481
1482 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1482 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1483 } 1483 }
1484 1484
1485 /* Remove page from free list */ 1485 /* Remove page from free list */
1486 list_del(&page->lru); 1486 list_del(&page->lru);
1487 zone->free_area[order].nr_free--; 1487 zone->free_area[order].nr_free--;
1488 rmv_page_order(page); 1488 rmv_page_order(page);
1489 1489
1490 /* Set the pageblock if the isolated page is at least a pageblock */ 1490 /* Set the pageblock if the isolated page is at least a pageblock */
1491 if (order >= pageblock_order - 1) { 1491 if (order >= pageblock_order - 1) {
1492 struct page *endpage = page + (1 << order) - 1; 1492 struct page *endpage = page + (1 << order) - 1;
1493 for (; page < endpage; page += pageblock_nr_pages) { 1493 for (; page < endpage; page += pageblock_nr_pages) {
1494 int mt = get_pageblock_migratetype(page); 1494 int mt = get_pageblock_migratetype(page);
1495 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1495 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1496 set_pageblock_migratetype(page, 1496 set_pageblock_migratetype(page,
1497 MIGRATE_MOVABLE); 1497 MIGRATE_MOVABLE);
1498 } 1498 }
1499 } 1499 }
1500 1500
1501 return 1UL << order; 1501 return 1UL << order;
1502 } 1502 }
1503 1503
1504 /* 1504 /*
1505 * Similar to split_page except the page is already free. As this is only 1505 * Similar to split_page except the page is already free. As this is only
1506 * being used for migration, the migratetype of the block also changes. 1506 * being used for migration, the migratetype of the block also changes.
1507 * As this is called with interrupts disabled, the caller is responsible 1507 * As this is called with interrupts disabled, the caller is responsible
1508 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1508 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1509 * are enabled. 1509 * are enabled.
1510 * 1510 *
1511 * Note: this is probably too low level an operation for use in drivers. 1511 * Note: this is probably too low level an operation for use in drivers.
1512 * Please consult with lkml before using this in your driver. 1512 * Please consult with lkml before using this in your driver.
1513 */ 1513 */
1514 int split_free_page(struct page *page) 1514 int split_free_page(struct page *page)
1515 { 1515 {
1516 unsigned int order; 1516 unsigned int order;
1517 int nr_pages; 1517 int nr_pages;
1518 1518
1519 order = page_order(page); 1519 order = page_order(page);
1520 1520
1521 nr_pages = __isolate_free_page(page, order); 1521 nr_pages = __isolate_free_page(page, order);
1522 if (!nr_pages) 1522 if (!nr_pages)
1523 return 0; 1523 return 0;
1524 1524
1525 /* Split into individual pages */ 1525 /* Split into individual pages */
1526 set_page_refcounted(page); 1526 set_page_refcounted(page);
1527 split_page(page, order); 1527 split_page(page, order);
1528 return nr_pages; 1528 return nr_pages;
1529 } 1529 }
1530 1530
1531 /* 1531 /*
1532 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1532 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1533 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1533 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1534 * or two. 1534 * or two.
1535 */ 1535 */
1536 static inline 1536 static inline
1537 struct page *buffered_rmqueue(struct zone *preferred_zone, 1537 struct page *buffered_rmqueue(struct zone *preferred_zone,
1538 struct zone *zone, unsigned int order, 1538 struct zone *zone, unsigned int order,
1539 gfp_t gfp_flags, int migratetype) 1539 gfp_t gfp_flags, int migratetype)
1540 { 1540 {
1541 unsigned long flags; 1541 unsigned long flags;
1542 struct page *page; 1542 struct page *page;
1543 bool cold = ((gfp_flags & __GFP_COLD) != 0); 1543 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1544 1544
1545 again: 1545 again:
1546 if (likely(order == 0)) { 1546 if (likely(order == 0)) {
1547 struct per_cpu_pages *pcp; 1547 struct per_cpu_pages *pcp;
1548 struct list_head *list; 1548 struct list_head *list;
1549 1549
1550 local_irq_save(flags); 1550 local_irq_save(flags);
1551 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1551 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1552 list = &pcp->lists[migratetype]; 1552 list = &pcp->lists[migratetype];
1553 if (list_empty(list)) { 1553 if (list_empty(list)) {
1554 pcp->count += rmqueue_bulk(zone, 0, 1554 pcp->count += rmqueue_bulk(zone, 0,
1555 pcp->batch, list, 1555 pcp->batch, list,
1556 migratetype, cold); 1556 migratetype, cold);
1557 if (unlikely(list_empty(list))) 1557 if (unlikely(list_empty(list)))
1558 goto failed; 1558 goto failed;
1559 } 1559 }
1560 1560
1561 if (cold) 1561 if (cold)
1562 page = list_entry(list->prev, struct page, lru); 1562 page = list_entry(list->prev, struct page, lru);
1563 else 1563 else
1564 page = list_entry(list->next, struct page, lru); 1564 page = list_entry(list->next, struct page, lru);
1565 1565
1566 list_del(&page->lru); 1566 list_del(&page->lru);
1567 pcp->count--; 1567 pcp->count--;
1568 } else { 1568 } else {
1569 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1569 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1570 /* 1570 /*
1571 * __GFP_NOFAIL is not to be used in new code. 1571 * __GFP_NOFAIL is not to be used in new code.
1572 * 1572 *
1573 * All __GFP_NOFAIL callers should be fixed so that they 1573 * All __GFP_NOFAIL callers should be fixed so that they
1574 * properly detect and handle allocation failures. 1574 * properly detect and handle allocation failures.
1575 * 1575 *
1576 * We most definitely don't want callers attempting to 1576 * We most definitely don't want callers attempting to
1577 * allocate greater than order-1 page units with 1577 * allocate greater than order-1 page units with
1578 * __GFP_NOFAIL. 1578 * __GFP_NOFAIL.
1579 */ 1579 */
1580 WARN_ON_ONCE(order > 1); 1580 WARN_ON_ONCE(order > 1);
1581 } 1581 }
1582 spin_lock_irqsave(&zone->lock, flags); 1582 spin_lock_irqsave(&zone->lock, flags);
1583 page = __rmqueue(zone, order, migratetype); 1583 page = __rmqueue(zone, order, migratetype);
1584 spin_unlock(&zone->lock); 1584 spin_unlock(&zone->lock);
1585 if (!page) 1585 if (!page)
1586 goto failed; 1586 goto failed;
1587 __mod_zone_freepage_state(zone, -(1 << order), 1587 __mod_zone_freepage_state(zone, -(1 << order),
1588 get_freepage_migratetype(page)); 1588 get_freepage_migratetype(page));
1589 } 1589 }
1590 1590
1591 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1591 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1592 1592
1593 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1593 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1594 zone_statistics(preferred_zone, zone, gfp_flags); 1594 zone_statistics(preferred_zone, zone, gfp_flags);
1595 local_irq_restore(flags); 1595 local_irq_restore(flags);
1596 1596
1597 VM_BUG_ON(bad_range(zone, page)); 1597 VM_BUG_ON(bad_range(zone, page));
1598 if (prep_new_page(page, order, gfp_flags)) 1598 if (prep_new_page(page, order, gfp_flags))
1599 goto again; 1599 goto again;
1600 return page; 1600 return page;
1601 1601
1602 failed: 1602 failed:
1603 local_irq_restore(flags); 1603 local_irq_restore(flags);
1604 return NULL; 1604 return NULL;
1605 } 1605 }
1606 1606
1607 #ifdef CONFIG_FAIL_PAGE_ALLOC 1607 #ifdef CONFIG_FAIL_PAGE_ALLOC
1608 1608
1609 static struct { 1609 static struct {
1610 struct fault_attr attr; 1610 struct fault_attr attr;
1611 1611
1612 u32 ignore_gfp_highmem; 1612 u32 ignore_gfp_highmem;
1613 u32 ignore_gfp_wait; 1613 u32 ignore_gfp_wait;
1614 u32 min_order; 1614 u32 min_order;
1615 } fail_page_alloc = { 1615 } fail_page_alloc = {
1616 .attr = FAULT_ATTR_INITIALIZER, 1616 .attr = FAULT_ATTR_INITIALIZER,
1617 .ignore_gfp_wait = 1, 1617 .ignore_gfp_wait = 1,
1618 .ignore_gfp_highmem = 1, 1618 .ignore_gfp_highmem = 1,
1619 .min_order = 1, 1619 .min_order = 1,
1620 }; 1620 };
1621 1621
1622 static int __init setup_fail_page_alloc(char *str) 1622 static int __init setup_fail_page_alloc(char *str)
1623 { 1623 {
1624 return setup_fault_attr(&fail_page_alloc.attr, str); 1624 return setup_fault_attr(&fail_page_alloc.attr, str);
1625 } 1625 }
1626 __setup("fail_page_alloc=", setup_fail_page_alloc); 1626 __setup("fail_page_alloc=", setup_fail_page_alloc);
1627 1627
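/*
 * Editorial usage note: the boot parameter registered above takes the
 * common fault_attr string, i.e.
 *
 *	fail_page_alloc=<interval>,<probability>,<space>,<times>
 *
 * so, assuming the standard setup_fault_attr() semantics, something
 * like "fail_page_alloc=1,10,0,-1" would fail roughly 10% of eligible
 * allocations with no limit on the number of failures.
 */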
1628 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1628 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1629 { 1629 {
1630 if (order < fail_page_alloc.min_order) 1630 if (order < fail_page_alloc.min_order)
1631 return false; 1631 return false;
1632 if (gfp_mask & __GFP_NOFAIL) 1632 if (gfp_mask & __GFP_NOFAIL)
1633 return false; 1633 return false;
1634 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1634 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1635 return false; 1635 return false;
1636 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1636 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1637 return false; 1637 return false;
1638 1638
1639 return should_fail(&fail_page_alloc.attr, 1 << order); 1639 return should_fail(&fail_page_alloc.attr, 1 << order);
1640 } 1640 }
1641 1641
1642 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1642 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1643 1643
1644 static int __init fail_page_alloc_debugfs(void) 1644 static int __init fail_page_alloc_debugfs(void)
1645 { 1645 {
1646 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1646 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1647 struct dentry *dir; 1647 struct dentry *dir;
1648 1648
1649 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1649 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1650 &fail_page_alloc.attr); 1650 &fail_page_alloc.attr);
1651 if (IS_ERR(dir)) 1651 if (IS_ERR(dir))
1652 return PTR_ERR(dir); 1652 return PTR_ERR(dir);
1653 1653
1654 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1654 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1655 &fail_page_alloc.ignore_gfp_wait)) 1655 &fail_page_alloc.ignore_gfp_wait))
1656 goto fail; 1656 goto fail;
1657 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1657 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1658 &fail_page_alloc.ignore_gfp_highmem)) 1658 &fail_page_alloc.ignore_gfp_highmem))
1659 goto fail; 1659 goto fail;
1660 if (!debugfs_create_u32("min-order", mode, dir, 1660 if (!debugfs_create_u32("min-order", mode, dir,
1661 &fail_page_alloc.min_order)) 1661 &fail_page_alloc.min_order))
1662 goto fail; 1662 goto fail;
1663 1663
1664 return 0; 1664 return 0;
1665 fail: 1665 fail:
1666 debugfs_remove_recursive(dir); 1666 debugfs_remove_recursive(dir);
1667 1667
1668 return -ENOMEM; 1668 return -ENOMEM;
1669 } 1669 }
1670 1670
1671 late_initcall(fail_page_alloc_debugfs); 1671 late_initcall(fail_page_alloc_debugfs);
1672 1672
1673 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1673 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1674 1674
1675 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1675 #else /* CONFIG_FAIL_PAGE_ALLOC */
1676 1676
1677 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1677 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1678 { 1678 {
1679 return false; 1679 return false;
1680 } 1680 }
1681 1681
1682 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1682 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1683 1683
1684 /* 1684 /*
1685 * Return true if free pages are above 'mark'. This takes into account the order 1685 * Return true if free pages are above 'mark'. This takes into account the order
1686 * of the allocation. 1686 * of the allocation.
1687 */ 1687 */
1688 static bool __zone_watermark_ok(struct zone *z, unsigned int order, 1688 static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1689 unsigned long mark, int classzone_idx, int alloc_flags, 1689 unsigned long mark, int classzone_idx, int alloc_flags,
1690 long free_pages) 1690 long free_pages)
1691 { 1691 {
1692 /* free_pages may go negative - that's OK */ 1692 /* free_pages may go negative - that's OK */
1693 long min = mark; 1693 long min = mark;
1694 int o; 1694 int o;
1695 long free_cma = 0; 1695 long free_cma = 0;
1696 1696
1697 free_pages -= (1 << order) - 1; 1697 free_pages -= (1 << order) - 1;
1698 if (alloc_flags & ALLOC_HIGH) 1698 if (alloc_flags & ALLOC_HIGH)
1699 min -= min / 2; 1699 min -= min / 2;
1700 if (alloc_flags & ALLOC_HARDER) 1700 if (alloc_flags & ALLOC_HARDER)
1701 min -= min / 4; 1701 min -= min / 4;
1702 #ifdef CONFIG_CMA 1702 #ifdef CONFIG_CMA
1703 /* If allocation can't use CMA areas don't use free CMA pages */ 1703 /* If allocation can't use CMA areas don't use free CMA pages */
1704 if (!(alloc_flags & ALLOC_CMA)) 1704 if (!(alloc_flags & ALLOC_CMA))
1705 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1705 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1706 #endif 1706 #endif
1707 1707
1708 if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) 1708 if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
1709 return false; 1709 return false;
1710 for (o = 0; o < order; o++) { 1710 for (o = 0; o < order; o++) {
1711 /* At the next order, this order's pages become unavailable */ 1711 /* At the next order, this order's pages become unavailable */
1712 free_pages -= z->free_area[o].nr_free << o; 1712 free_pages -= z->free_area[o].nr_free << o;
1713 1713
1714 /* Require fewer higher order pages to be free */ 1714 /* Require fewer higher order pages to be free */
1715 min >>= 1; 1715 min >>= 1;
1716 1716
1717 if (free_pages <= min) 1717 if (free_pages <= min)
1718 return false; 1718 return false;
1719 } 1719 }
1720 return true; 1720 return true;
1721 } 1721 }
1722 1722
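/*
 * Worked example (editorial): an order-2 request checked against a mark
 * of 128 pages, assuming lowmem_reserve and free CMA pages are zero and
 * no ALLOC_HIGH/ALLOC_HARDER boost. The zone has 200 free pages: 100 at
 * order 0, 30 order-1 blocks (60 pages) and the rest at order >= 2.
 */
static void example_watermark_check(void)
{
	long free_pages = 200, min = 128;

	free_pages -= (1 << 2) - 1;	/* 197: discount the request itself */
	/* 197 > 128, so the base check passes; now walk the lower orders */

	free_pages -= 100;		/* o = 0: 97 pages usable for order >= 1 */
	min >>= 1;			/* 64 */

	free_pages -= 60;		/* o = 1: 37 pages usable for order >= 2 */
	min >>= 1;			/* 32 */

	/* 37 > 32, so __zone_watermark_ok() would return true */
	(void)free_pages;
	(void)min;
}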
1723 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 1723 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
1724 int classzone_idx, int alloc_flags) 1724 int classzone_idx, int alloc_flags)
1725 { 1725 {
1726 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1726 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1727 zone_page_state(z, NR_FREE_PAGES)); 1727 zone_page_state(z, NR_FREE_PAGES));
1728 } 1728 }
1729 1729
1730 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 1730 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
1731 unsigned long mark, int classzone_idx, int alloc_flags) 1731 unsigned long mark, int classzone_idx, int alloc_flags)
1732 { 1732 {
1733 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1733 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1734 1734
1735 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1735 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1736 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1736 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1737 1737
1738 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1738 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1739 free_pages); 1739 free_pages);
1740 } 1740 }
1741 1741
1742 #ifdef CONFIG_NUMA 1742 #ifdef CONFIG_NUMA
1743 /* 1743 /*
1744 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1744 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1745 * skip over zones that are not allowed by the cpuset, or that have 1745 * skip over zones that are not allowed by the cpuset, or that have
1746 * been recently (in the last second) found to be nearly full. See further 1746 * been recently (in the last second) found to be nearly full. See further
1747 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1747 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1748 * that have to skip over a lot of full or unallowed zones. 1748 * that have to skip over a lot of full or unallowed zones.
1749 * 1749 *
1750 * If the zonelist cache is present in the passed in zonelist, then 1750 * If the zonelist cache is present in the passed in zonelist, then
1751 * returns a pointer to the allowed node mask (either the current 1751 * returns a pointer to the allowed node mask (either the current
1752 * task's mems_allowed, or node_states[N_MEMORY].) 1752 * task's mems_allowed, or node_states[N_MEMORY].)
1753 * 1753 *
1754 * If the zonelist cache is not available for this zonelist, does 1754 * If the zonelist cache is not available for this zonelist, does
1755 * nothing and returns NULL. 1755 * nothing and returns NULL.
1756 * 1756 *
1757 * If the fullzones BITMAP in the zonelist cache is stale (more than 1757 * If the fullzones BITMAP in the zonelist cache is stale (more than
1758 * a second since last zap'd) then we zap it out (clear its bits.) 1758 * a second since last zap'd) then we zap it out (clear its bits.)
1759 * 1759 *
1760 * We hold off even calling zlc_setup, until after we've checked the 1760 * We hold off even calling zlc_setup, until after we've checked the
1761 * first zone in the zonelist, on the theory that most allocations will 1761 * first zone in the zonelist, on the theory that most allocations will
1762 * be satisfied from that first zone, so best to examine that zone as 1762 * be satisfied from that first zone, so best to examine that zone as
1763 * quickly as we can. 1763 * quickly as we can.
1764 */ 1764 */
1765 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1765 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1766 { 1766 {
1767 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1767 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1768 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1768 nodemask_t *allowednodes; /* zonelist_cache approximation */
1769 1769
1770 zlc = zonelist->zlcache_ptr; 1770 zlc = zonelist->zlcache_ptr;
1771 if (!zlc) 1771 if (!zlc)
1772 return NULL; 1772 return NULL;
1773 1773
1774 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1774 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1775 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1775 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1776 zlc->last_full_zap = jiffies; 1776 zlc->last_full_zap = jiffies;
1777 } 1777 }
1778 1778
1779 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1779 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1780 &cpuset_current_mems_allowed : 1780 &cpuset_current_mems_allowed :
1781 &node_states[N_MEMORY]; 1781 &node_states[N_MEMORY];
1782 return allowednodes; 1782 return allowednodes;
1783 } 1783 }
1784 1784
1785 /* 1785 /*
1786 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1786 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1787 * if it is worth looking at further for free memory: 1787 * if it is worth looking at further for free memory:
1788 * 1) Check that the zone isn't thought to be full (doesn't have its 1788 * 1) Check that the zone isn't thought to be full (doesn't have its
1789 * bit set in the zonelist_cache fullzones BITMAP). 1789 * bit set in the zonelist_cache fullzones BITMAP).
1790 * 2) Check that the zone's node (obtained from the zonelist_cache 1790 * 2) Check that the zone's node (obtained from the zonelist_cache
1791 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1791 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1792 * Return true (non-zero) if zone is worth looking at further, or 1792 * Return true (non-zero) if zone is worth looking at further, or
1793 * else return false (zero) if it is not. 1793 * else return false (zero) if it is not.
1794 * 1794 *
1795 * This check -ignores- the distinction between various watermarks, 1795 * This check -ignores- the distinction between various watermarks,
1796 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1796 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1797 * found to be full for any variation of these watermarks, it will 1797 * found to be full for any variation of these watermarks, it will
1798 * be considered full for up to one second by all requests, unless 1798 * be considered full for up to one second by all requests, unless
1799 * we are so low on memory on all allowed nodes that we are forced 1799 * we are so low on memory on all allowed nodes that we are forced
1800 * into the second scan of the zonelist. 1800 * into the second scan of the zonelist.
1801 * 1801 *
1802 * In the second scan we ignore this zonelist cache and exactly 1802 * In the second scan we ignore this zonelist cache and exactly
1803 * apply the watermarks to all zones, even if it is slower to do so. 1803 * apply the watermarks to all zones, even if it is slower to do so.
1804 * We are low on memory in the second scan, and should leave no stone 1804 * We are low on memory in the second scan, and should leave no stone
1805 * unturned looking for a free page. 1805 * unturned looking for a free page.
1806 */ 1806 */
1807 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1807 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1808 nodemask_t *allowednodes) 1808 nodemask_t *allowednodes)
1809 { 1809 {
1810 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1810 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1811 int i; /* index of *z in zonelist zones */ 1811 int i; /* index of *z in zonelist zones */
1812 int n; /* node that zone *z is on */ 1812 int n; /* node that zone *z is on */
1813 1813
1814 zlc = zonelist->zlcache_ptr; 1814 zlc = zonelist->zlcache_ptr;
1815 if (!zlc) 1815 if (!zlc)
1816 return 1; 1816 return 1;
1817 1817
1818 i = z - zonelist->_zonerefs; 1818 i = z - zonelist->_zonerefs;
1819 n = zlc->z_to_n[i]; 1819 n = zlc->z_to_n[i];
1820 1820
1821 /* This zone is worth trying if it is allowed but not full */ 1821 /* This zone is worth trying if it is allowed but not full */
1822 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1822 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1823 } 1823 }
1824 1824
1825 /* 1825 /*
1826 * Given 'z' scanning a zonelist, set the corresponding bit in 1826 * Given 'z' scanning a zonelist, set the corresponding bit in
1827 * zlc->fullzones, so that subsequent attempts to allocate a page 1827 * zlc->fullzones, so that subsequent attempts to allocate a page
1828 * from that zone don't waste time re-examining it. 1828 * from that zone don't waste time re-examining it.
1829 */ 1829 */
1830 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1830 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1831 { 1831 {
1832 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1832 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1833 int i; /* index of *z in zonelist zones */ 1833 int i; /* index of *z in zonelist zones */
1834 1834
1835 zlc = zonelist->zlcache_ptr; 1835 zlc = zonelist->zlcache_ptr;
1836 if (!zlc) 1836 if (!zlc)
1837 return; 1837 return;
1838 1838
1839 i = z - zonelist->_zonerefs; 1839 i = z - zonelist->_zonerefs;
1840 1840
1841 set_bit(i, zlc->fullzones); 1841 set_bit(i, zlc->fullzones);
1842 } 1842 }
1843 1843
1844 /* 1844 /*
1845 * clear all zones full, called after direct reclaim makes progress so that 1845 * clear all zones full, called after direct reclaim makes progress so that
1846 * a zone that was recently full is not skipped over for up to a second 1846 * a zone that was recently full is not skipped over for up to a second
1847 */ 1847 */
1848 static void zlc_clear_zones_full(struct zonelist *zonelist) 1848 static void zlc_clear_zones_full(struct zonelist *zonelist)
1849 { 1849 {
1850 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1850 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1851 1851
1852 zlc = zonelist->zlcache_ptr; 1852 zlc = zonelist->zlcache_ptr;
1853 if (!zlc) 1853 if (!zlc)
1854 return; 1854 return;
1855 1855
1856 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1856 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1857 } 1857 }
1858 1858
1859 static bool zone_local(struct zone *local_zone, struct zone *zone) 1859 static bool zone_local(struct zone *local_zone, struct zone *zone)
1860 { 1860 {
1861 return local_zone->node == zone->node; 1861 return local_zone->node == zone->node;
1862 } 1862 }
1863 1863
1864 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1864 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1865 { 1865 {
1866 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1866 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1867 } 1867 }
1868 1868
1869 static void __paginginit init_zone_allows_reclaim(int nid) 1869 static void __paginginit init_zone_allows_reclaim(int nid)
1870 { 1870 {
1871 int i; 1871 int i;
1872 1872
1873 for_each_node_state(i, N_MEMORY) 1873 for_each_node_state(i, N_MEMORY)
1874 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1874 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1875 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1875 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1876 else 1876 else
1877 zone_reclaim_mode = 1; 1877 zone_reclaim_mode = 1;
1878 } 1878 }
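
A minimal userspace sketch of the loop above, assuming an invented three-node distance table and a RECLAIM_DISTANCE of 30: each memory node within the threshold is added to nid's reclaim_nodes mask, and meeting any farther node switches zone_reclaim_mode on. This is an illustration, not kernel code.

#include <stdio.h>

#define NR_NODES	 3
#define RECLAIM_DISTANCE 30

/* invented NUMA distance table for demonstration only */
static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

int main(void)
{
	int zone_reclaim_mode = 0;

	for (int nid = 0; nid < NR_NODES; nid++) {
		unsigned int reclaim_nodes = 0;

		for (int i = 0; i < NR_NODES; i++) {
			if (node_distance[nid][i] <= RECLAIM_DISTANCE)
				reclaim_nodes |= 1u << i;	/* near enough */
			else
				zone_reclaim_mode = 1;		/* a far node exists */
		}
		printf("node %d reclaim_nodes mask: 0x%x\n", nid, reclaim_nodes);
	}
	printf("zone_reclaim_mode: %d\n", zone_reclaim_mode);
	return 0;
}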
1879 1879
1880 #else /* CONFIG_NUMA */ 1880 #else /* CONFIG_NUMA */
1881 1881
1882 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1882 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1883 { 1883 {
1884 return NULL; 1884 return NULL;
1885 } 1885 }
1886 1886
1887 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1887 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1888 nodemask_t *allowednodes) 1888 nodemask_t *allowednodes)
1889 { 1889 {
1890 return 1; 1890 return 1;
1891 } 1891 }
1892 1892
1893 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1893 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1894 { 1894 {
1895 } 1895 }
1896 1896
1897 static void zlc_clear_zones_full(struct zonelist *zonelist) 1897 static void zlc_clear_zones_full(struct zonelist *zonelist)
1898 { 1898 {
1899 } 1899 }
1900 1900
1901 static bool zone_local(struct zone *local_zone, struct zone *zone) 1901 static bool zone_local(struct zone *local_zone, struct zone *zone)
1902 { 1902 {
1903 return true; 1903 return true;
1904 } 1904 }
1905 1905
1906 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1906 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1907 { 1907 {
1908 return true; 1908 return true;
1909 } 1909 }
1910 1910
1911 static inline void init_zone_allows_reclaim(int nid) 1911 static inline void init_zone_allows_reclaim(int nid)
1912 { 1912 {
1913 } 1913 }
1914 #endif /* CONFIG_NUMA */ 1914 #endif /* CONFIG_NUMA */
1915 1915
1916 /* 1916 /*
1917 * get_page_from_freelist goes through the zonelist trying to allocate 1917 * get_page_from_freelist goes through the zonelist trying to allocate
1918 * a page. 1918 * a page.
1919 */ 1919 */
1920 static struct page * 1920 static struct page *
1921 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1921 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1922 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1922 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1923 struct zone *preferred_zone, int classzone_idx, int migratetype) 1923 struct zone *preferred_zone, int classzone_idx, int migratetype)
1924 { 1924 {
1925 struct zoneref *z; 1925 struct zoneref *z;
1926 struct page *page = NULL; 1926 struct page *page = NULL;
1927 struct zone *zone; 1927 struct zone *zone;
1928 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1928 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1929 int zlc_active = 0; /* set if using zonelist_cache */ 1929 int zlc_active = 0; /* set if using zonelist_cache */
1930 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1930 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1931 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && 1931 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1932 (gfp_mask & __GFP_WRITE); 1932 (gfp_mask & __GFP_WRITE);
1933 1933
1934 zonelist_scan: 1934 zonelist_scan:
1935 /* 1935 /*
1936 * Scan zonelist, looking for a zone with enough free. 1936 * Scan zonelist, looking for a zone with enough free.
1937 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1937 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1938 */ 1938 */
1939 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1939 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1940 high_zoneidx, nodemask) { 1940 high_zoneidx, nodemask) {
1941 unsigned long mark; 1941 unsigned long mark;
1942 1942
1943 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1943 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1944 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1944 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1945 continue; 1945 continue;
1946 if (cpusets_enabled() && 1946 if (cpusets_enabled() &&
1947 (alloc_flags & ALLOC_CPUSET) && 1947 (alloc_flags & ALLOC_CPUSET) &&
1948 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1948 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1949 continue; 1949 continue;
1950 /* 1950 /*
1951 * Distribute pages in proportion to the individual 1951 * Distribute pages in proportion to the individual
1952 * zone size to ensure fair page aging. The zone a 1952 * zone size to ensure fair page aging. The zone a
1953 * page was allocated in should have no effect on the 1953 * page was allocated in should have no effect on the
1954 * time the page has in memory before being reclaimed. 1954 * time the page has in memory before being reclaimed.
1955 */ 1955 */
1956 if (alloc_flags & ALLOC_FAIR) { 1956 if (alloc_flags & ALLOC_FAIR) {
1957 if (!zone_local(preferred_zone, zone)) 1957 if (!zone_local(preferred_zone, zone))
1958 continue; 1958 break;
1959 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1959 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1960 continue; 1960 continue;
1961 } 1961 }
1962 /* 1962 /*
1963 * When allocating a page cache page for writing, we 1963 * When allocating a page cache page for writing, we
1964 * want to get it from a zone that is within its dirty 1964 * want to get it from a zone that is within its dirty
1965 * limit, such that no single zone holds more than its 1965 * limit, such that no single zone holds more than its
1966 * proportional share of globally allowed dirty pages. 1966 * proportional share of globally allowed dirty pages.
1967 * The dirty limits take into account the zone's 1967 * The dirty limits take into account the zone's
1968 * lowmem reserves and high watermark so that kswapd 1968 * lowmem reserves and high watermark so that kswapd
1969 * should be able to balance it without having to 1969 * should be able to balance it without having to
1970 * write pages from its LRU list. 1970 * write pages from its LRU list.
1971 * 1971 *
1972 * This may look like it could increase pressure on 1972 * This may look like it could increase pressure on
1973 * lower zones by failing allocations in higher zones 1973 * lower zones by failing allocations in higher zones
1974 * before they are full. But the pages that do spill 1974 * before they are full. But the pages that do spill
1975 * over are limited as the lower zones are protected 1975 * over are limited as the lower zones are protected
1976 * by this very same mechanism. It should not become 1976 * by this very same mechanism. It should not become
1977 * a practical burden to them. 1977 * a practical burden to them.
1978 * 1978 *
1979 * XXX: For now, allow allocations to potentially 1979 * XXX: For now, allow allocations to potentially
1980 * exceed the per-zone dirty limit in the slowpath 1980 * exceed the per-zone dirty limit in the slowpath
1981 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1981 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1982 * which is important when on a NUMA setup the allowed 1982 * which is important when on a NUMA setup the allowed
1983 * zones are together not big enough to reach the 1983 * zones are together not big enough to reach the
1984 * global limit. The proper fix for these situations 1984 * global limit. The proper fix for these situations
1985 * will require awareness of zones in the 1985 * will require awareness of zones in the
1986 * dirty-throttling and the flusher threads. 1986 * dirty-throttling and the flusher threads.
1987 */ 1987 */
1988 if (consider_zone_dirty && !zone_dirty_ok(zone)) 1988 if (consider_zone_dirty && !zone_dirty_ok(zone))
1989 continue; 1989 continue;
1990 1990
1991 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1991 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1992 if (!zone_watermark_ok(zone, order, mark, 1992 if (!zone_watermark_ok(zone, order, mark,
1993 classzone_idx, alloc_flags)) { 1993 classzone_idx, alloc_flags)) {
1994 int ret; 1994 int ret;
1995 1995
1996 /* Checked here to keep the fast path fast */ 1996 /* Checked here to keep the fast path fast */
1997 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1997 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1998 if (alloc_flags & ALLOC_NO_WATERMARKS) 1998 if (alloc_flags & ALLOC_NO_WATERMARKS)
1999 goto try_this_zone; 1999 goto try_this_zone;
2000 2000
2001 if (IS_ENABLED(CONFIG_NUMA) && 2001 if (IS_ENABLED(CONFIG_NUMA) &&
2002 !did_zlc_setup && nr_online_nodes > 1) { 2002 !did_zlc_setup && nr_online_nodes > 1) {
2003 /* 2003 /*
2004 * we do zlc_setup if there are multiple nodes 2004 * we do zlc_setup if there are multiple nodes
2005 * and before considering the first zone allowed 2005 * and before considering the first zone allowed
2006 * by the cpuset. 2006 * by the cpuset.
2007 */ 2007 */
2008 allowednodes = zlc_setup(zonelist, alloc_flags); 2008 allowednodes = zlc_setup(zonelist, alloc_flags);
2009 zlc_active = 1; 2009 zlc_active = 1;
2010 did_zlc_setup = 1; 2010 did_zlc_setup = 1;
2011 } 2011 }
2012 2012
2013 if (zone_reclaim_mode == 0 || 2013 if (zone_reclaim_mode == 0 ||
2014 !zone_allows_reclaim(preferred_zone, zone)) 2014 !zone_allows_reclaim(preferred_zone, zone))
2015 goto this_zone_full; 2015 goto this_zone_full;
2016 2016
2017 /* 2017 /*
2018 * As we may have just activated ZLC, check if the first 2018 * As we may have just activated ZLC, check if the first
2019 * eligible zone has failed zone_reclaim recently. 2019 * eligible zone has failed zone_reclaim recently.
2020 */ 2020 */
2021 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 2021 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
2022 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 2022 !zlc_zone_worth_trying(zonelist, z, allowednodes))
2023 continue; 2023 continue;
2024 2024
2025 ret = zone_reclaim(zone, gfp_mask, order); 2025 ret = zone_reclaim(zone, gfp_mask, order);
2026 switch (ret) { 2026 switch (ret) {
2027 case ZONE_RECLAIM_NOSCAN: 2027 case ZONE_RECLAIM_NOSCAN:
2028 /* did not scan */ 2028 /* did not scan */
2029 continue; 2029 continue;
2030 case ZONE_RECLAIM_FULL: 2030 case ZONE_RECLAIM_FULL:
2031 /* scanned but unreclaimable */ 2031 /* scanned but unreclaimable */
2032 continue; 2032 continue;
2033 default: 2033 default:
2034 /* did we reclaim enough */ 2034 /* did we reclaim enough */
2035 if (zone_watermark_ok(zone, order, mark, 2035 if (zone_watermark_ok(zone, order, mark,
2036 classzone_idx, alloc_flags)) 2036 classzone_idx, alloc_flags))
2037 goto try_this_zone; 2037 goto try_this_zone;
2038 2038
2039 /* 2039 /*
2040 * Failed to reclaim enough to meet watermark. 2040 * Failed to reclaim enough to meet watermark.
2041 * Only mark the zone full if checking the min 2041 * Only mark the zone full if checking the min
2042 * watermark or if we failed to reclaim just 2042 * watermark or if we failed to reclaim just
2043 * 1<<order pages or else the page allocator 2043 * 1<<order pages or else the page allocator
2044 * fastpath will prematurely mark zones full 2044 * fastpath will prematurely mark zones full
2045 * when the watermark is between the low and 2045 * when the watermark is between the low and
2046 * min watermarks. 2046 * min watermarks.
2047 */ 2047 */
2048 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2048 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2049 ret == ZONE_RECLAIM_SOME) 2049 ret == ZONE_RECLAIM_SOME)
2050 goto this_zone_full; 2050 goto this_zone_full;
2051 2051
2052 continue; 2052 continue;
2053 } 2053 }
2054 } 2054 }
2055 2055
2056 try_this_zone: 2056 try_this_zone:
2057 page = buffered_rmqueue(preferred_zone, zone, order, 2057 page = buffered_rmqueue(preferred_zone, zone, order,
2058 gfp_mask, migratetype); 2058 gfp_mask, migratetype);
2059 if (page) 2059 if (page)
2060 break; 2060 break;
2061 this_zone_full: 2061 this_zone_full:
2062 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2062 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2063 zlc_mark_zone_full(zonelist, z); 2063 zlc_mark_zone_full(zonelist, z);
2064 } 2064 }
2065 2065
2066 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2066 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2067 /* Disable zlc cache for second zonelist scan */ 2067 /* Disable zlc cache for second zonelist scan */
2068 zlc_active = 0; 2068 zlc_active = 0;
2069 goto zonelist_scan; 2069 goto zonelist_scan;
2070 } 2070 }
2071 2071
2072 if (page) 2072 if (page)
2073 /* 2073 /*
2074 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2074 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2075 * necessary to allocate the page. The expectation is 2075 * necessary to allocate the page. The expectation is
2076 * that the caller is taking steps that will free more 2076 * that the caller is taking steps that will free more
2077 * memory. The caller should avoid the page being used 2077 * memory. The caller should avoid the page being used
2078 * for !PFMEMALLOC purposes. 2078 * for !PFMEMALLOC purposes.
2079 */ 2079 */
2080 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2080 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2081 2081
2082 return page; 2082 return page;
2083 } 2083 }
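
The one functional change in this hunk is the ALLOC_FAIR test near the top of the loop above: a zone on a remote node now ends the fair pass with break rather than being skipped with continue, so the pass never walks past remote entries to reach later zones in the list. A minimal userspace sketch of that control-flow difference follows; the three-entry zonelist and batch counts are invented, and none of this is kernel code.

#include <stdio.h>
#include <stdbool.h>

struct zone { const char *name; int node; int alloc_batch; };

static bool zone_local(const struct zone *preferred, const struct zone *z)
{
	return preferred->node == z->node;
}

/* Fair pass: returns the chosen zone, or NULL when the pass gives up. */
static const struct zone *fair_pass(const struct zone *zl, int nr,
				    const struct zone *preferred,
				    bool break_on_remote)
{
	for (int i = 0; i < nr; i++) {
		if (!zone_local(preferred, &zl[i])) {
			if (break_on_remote)
				break;		/* new behaviour: abort pass */
			continue;		/* old behaviour: skip zone  */
		}
		if (zl[i].alloc_batch <= 0)
			continue;		/* fair share used up */
		return &zl[i];
	}
	return NULL;	/* caller retries the zonelist without ALLOC_FAIR */
}

int main(void)
{
	/* invented zonelist: local Normal exhausted, remote Normal, local DMA32 */
	const struct zone zl[] = {
		{ "node0/Normal", 0, 0  },
		{ "node1/Normal", 1, 64 },
		{ "node0/DMA32",  0, 64 },
	};
	const struct zone *old = fair_pass(zl, 3, &zl[0], false);
	const struct zone *new = fair_pass(zl, 3, &zl[0], true);

	printf("continue: fair pass picks %s\n", old ? old->name : "(nothing)");
	printf("break:    fair pass picks %s\n", new ? new->name : "(nothing)");
	return 0;
}

With continue, the pass skips the remote Normal zone and lands on the later local DMA32 entry; with break it bails out, leaving the choice to the subsequent non-fair scan over the full zonelist.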
2084 2084
2085 /* 2085 /*
2086 * Large machines with many possible nodes should not always dump per-node 2086 * Large machines with many possible nodes should not always dump per-node
2087 * meminfo in irq context. 2087 * meminfo in irq context.
2088 */ 2088 */
2089 static inline bool should_suppress_show_mem(void) 2089 static inline bool should_suppress_show_mem(void)
2090 { 2090 {
2091 bool ret = false; 2091 bool ret = false;
2092 2092
2093 #if NODES_SHIFT > 8 2093 #if NODES_SHIFT > 8
2094 ret = in_interrupt(); 2094 ret = in_interrupt();
2095 #endif 2095 #endif
2096 return ret; 2096 return ret;
2097 } 2097 }
2098 2098
2099 static DEFINE_RATELIMIT_STATE(nopage_rs, 2099 static DEFINE_RATELIMIT_STATE(nopage_rs,
2100 DEFAULT_RATELIMIT_INTERVAL, 2100 DEFAULT_RATELIMIT_INTERVAL,
2101 DEFAULT_RATELIMIT_BURST); 2101 DEFAULT_RATELIMIT_BURST);
2102 2102
2103 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2103 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2104 { 2104 {
2105 unsigned int filter = SHOW_MEM_FILTER_NODES; 2105 unsigned int filter = SHOW_MEM_FILTER_NODES;
2106 2106
2107 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2107 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2108 debug_guardpage_minorder() > 0) 2108 debug_guardpage_minorder() > 0)
2109 return; 2109 return;
2110 2110
2111 /* 2111 /*
2112 * Walking all memory to count page types is very expensive and should 2112 * Walking all memory to count page types is very expensive and should
2113 * be inhibited in non-blockable contexts. 2113 * be inhibited in non-blockable contexts.
2114 */ 2114 */
2115 if (!(gfp_mask & __GFP_WAIT)) 2115 if (!(gfp_mask & __GFP_WAIT))
2116 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2116 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2117 2117
2118 /* 2118 /*
2119 * This documents exceptions given to allocations in certain 2119 * This documents exceptions given to allocations in certain
2120 * contexts that are allowed to allocate outside current's set 2120 * contexts that are allowed to allocate outside current's set
2121 * of allowed nodes. 2121 * of allowed nodes.
2122 */ 2122 */
2123 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2123 if (!(gfp_mask & __GFP_NOMEMALLOC))
2124 if (test_thread_flag(TIF_MEMDIE) || 2124 if (test_thread_flag(TIF_MEMDIE) ||
2125 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2125 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2126 filter &= ~SHOW_MEM_FILTER_NODES; 2126 filter &= ~SHOW_MEM_FILTER_NODES;
2127 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2127 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2128 filter &= ~SHOW_MEM_FILTER_NODES; 2128 filter &= ~SHOW_MEM_FILTER_NODES;
2129 2129
2130 if (fmt) { 2130 if (fmt) {
2131 struct va_format vaf; 2131 struct va_format vaf;
2132 va_list args; 2132 va_list args;
2133 2133
2134 va_start(args, fmt); 2134 va_start(args, fmt);
2135 2135
2136 vaf.fmt = fmt; 2136 vaf.fmt = fmt;
2137 vaf.va = &args; 2137 vaf.va = &args;
2138 2138
2139 pr_warn("%pV", &vaf); 2139 pr_warn("%pV", &vaf);
2140 2140
2141 va_end(args); 2141 va_end(args);
2142 } 2142 }
2143 2143
2144 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2144 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2145 current->comm, order, gfp_mask); 2145 current->comm, order, gfp_mask);
2146 2146
2147 dump_stack(); 2147 dump_stack();
2148 if (!should_suppress_show_mem()) 2148 if (!should_suppress_show_mem())
2149 show_mem(filter); 2149 show_mem(filter);
2150 } 2150 }
2151 2151
2152 static inline int 2152 static inline int
2153 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2153 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2154 unsigned long did_some_progress, 2154 unsigned long did_some_progress,
2155 unsigned long pages_reclaimed) 2155 unsigned long pages_reclaimed)
2156 { 2156 {
2157 /* Do not loop if specifically requested */ 2157 /* Do not loop if specifically requested */
2158 if (gfp_mask & __GFP_NORETRY) 2158 if (gfp_mask & __GFP_NORETRY)
2159 return 0; 2159 return 0;
2160 2160
2161 /* Always retry if specifically requested */ 2161 /* Always retry if specifically requested */
2162 if (gfp_mask & __GFP_NOFAIL) 2162 if (gfp_mask & __GFP_NOFAIL)
2163 return 1; 2163 return 1;
2164 2164
2165 /* 2165 /*
2166 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2166 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2167 * making forward progress without invoking OOM. Suspend also disables 2167 * making forward progress without invoking OOM. Suspend also disables
2168 * storage devices so kswapd will not help. Bail if we are suspending. 2168 * storage devices so kswapd will not help. Bail if we are suspending.
2169 */ 2169 */
2170 if (!did_some_progress && pm_suspended_storage()) 2170 if (!did_some_progress && pm_suspended_storage())
2171 return 0; 2171 return 0;
2172 2172
2173 /* 2173 /*
2174 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2174 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2175 * means __GFP_NOFAIL, but that may not be true in other 2175 * means __GFP_NOFAIL, but that may not be true in other
2176 * implementations. 2176 * implementations.
2177 */ 2177 */
2178 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2178 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2179 return 1; 2179 return 1;
2180 2180
2181 /* 2181 /*
2182 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2182 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2183 * specified, then we retry until we no longer reclaim any pages 2183 * specified, then we retry until we no longer reclaim any pages
2184 * (above), or we've reclaimed an order of pages at least as 2184 * (above), or we've reclaimed an order of pages at least as
2185 * large as the allocation's order. In both cases, if the 2185 * large as the allocation's order. In both cases, if the
2186 * allocation still fails, we stop retrying. 2186 * allocation still fails, we stop retrying.
2187 */ 2187 */
2188 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2188 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2189 return 1; 2189 return 1;
2190 2190
2191 return 0; 2191 return 0;
2192 } 2192 }
2193 2193
2194 static inline struct page * 2194 static inline struct page *
2195 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2195 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2196 struct zonelist *zonelist, enum zone_type high_zoneidx, 2196 struct zonelist *zonelist, enum zone_type high_zoneidx,
2197 nodemask_t *nodemask, struct zone *preferred_zone, 2197 nodemask_t *nodemask, struct zone *preferred_zone,
2198 int classzone_idx, int migratetype) 2198 int classzone_idx, int migratetype)
2199 { 2199 {
2200 struct page *page; 2200 struct page *page;
2201 2201
2202 /* Acquire the OOM killer lock for the zones in zonelist */ 2202 /* Acquire the OOM killer lock for the zones in zonelist */
2203 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2203 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2204 schedule_timeout_uninterruptible(1); 2204 schedule_timeout_uninterruptible(1);
2205 return NULL; 2205 return NULL;
2206 } 2206 }
2207 2207
2208 /* 2208 /*
2209 * Go through the zonelist yet one more time, keep very high watermark 2209 * Go through the zonelist yet one more time, keep very high watermark
2210 * here, this is only to catch a parallel oom killing, we must fail if 2210 * here, this is only to catch a parallel oom killing, we must fail if
2211 * we're still under heavy pressure. 2211 * we're still under heavy pressure.
2212 */ 2212 */
2213 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2213 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2214 order, zonelist, high_zoneidx, 2214 order, zonelist, high_zoneidx,
2215 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2215 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2216 preferred_zone, classzone_idx, migratetype); 2216 preferred_zone, classzone_idx, migratetype);
2217 if (page) 2217 if (page)
2218 goto out; 2218 goto out;
2219 2219
2220 if (!(gfp_mask & __GFP_NOFAIL)) { 2220 if (!(gfp_mask & __GFP_NOFAIL)) {
2221 /* The OOM killer will not help higher order allocs */ 2221 /* The OOM killer will not help higher order allocs */
2222 if (order > PAGE_ALLOC_COSTLY_ORDER) 2222 if (order > PAGE_ALLOC_COSTLY_ORDER)
2223 goto out; 2223 goto out;
2224 /* The OOM killer does not needlessly kill tasks for lowmem */ 2224 /* The OOM killer does not needlessly kill tasks for lowmem */
2225 if (high_zoneidx < ZONE_NORMAL) 2225 if (high_zoneidx < ZONE_NORMAL)
2226 goto out; 2226 goto out;
2227 /* 2227 /*
2228 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2228 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2229 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2229 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2230 * The caller should handle page allocation failure by itself if 2230 * The caller should handle page allocation failure by itself if
2231 * it specifies __GFP_THISNODE. 2231 * it specifies __GFP_THISNODE.
2232 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2232 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2233 */ 2233 */
2234 if (gfp_mask & __GFP_THISNODE) 2234 if (gfp_mask & __GFP_THISNODE)
2235 goto out; 2235 goto out;
2236 } 2236 }
2237 /* Exhausted what can be done so it's blamo time */ 2237 /* Exhausted what can be done so it's blamo time */
2238 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2238 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2239 2239
2240 out: 2240 out:
2241 clear_zonelist_oom(zonelist, gfp_mask); 2241 clear_zonelist_oom(zonelist, gfp_mask);
2242 return page; 2242 return page;
2243 } 2243 }
2244 2244
2245 #ifdef CONFIG_COMPACTION 2245 #ifdef CONFIG_COMPACTION
2246 /* Try memory compaction for high-order allocations before reclaim */ 2246 /* Try memory compaction for high-order allocations before reclaim */
2247 static struct page * 2247 static struct page *
2248 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2248 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2249 struct zonelist *zonelist, enum zone_type high_zoneidx, 2249 struct zonelist *zonelist, enum zone_type high_zoneidx,
2250 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2250 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2251 int classzone_idx, int migratetype, enum migrate_mode mode, 2251 int classzone_idx, int migratetype, enum migrate_mode mode,
2252 bool *contended_compaction, bool *deferred_compaction, 2252 bool *contended_compaction, bool *deferred_compaction,
2253 unsigned long *did_some_progress) 2253 unsigned long *did_some_progress)
2254 { 2254 {
2255 if (!order) 2255 if (!order)
2256 return NULL; 2256 return NULL;
2257 2257
2258 if (compaction_deferred(preferred_zone, order)) { 2258 if (compaction_deferred(preferred_zone, order)) {
2259 *deferred_compaction = true; 2259 *deferred_compaction = true;
2260 return NULL; 2260 return NULL;
2261 } 2261 }
2262 2262
2263 current->flags |= PF_MEMALLOC; 2263 current->flags |= PF_MEMALLOC;
2264 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2264 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2265 nodemask, mode, 2265 nodemask, mode,
2266 contended_compaction); 2266 contended_compaction);
2267 current->flags &= ~PF_MEMALLOC; 2267 current->flags &= ~PF_MEMALLOC;
2268 2268
2269 if (*did_some_progress != COMPACT_SKIPPED) { 2269 if (*did_some_progress != COMPACT_SKIPPED) {
2270 struct page *page; 2270 struct page *page;
2271 2271
2272 /* Page migration frees to the PCP lists but we want merging */ 2272 /* Page migration frees to the PCP lists but we want merging */
2273 drain_pages(get_cpu()); 2273 drain_pages(get_cpu());
2274 put_cpu(); 2274 put_cpu();
2275 2275
2276 page = get_page_from_freelist(gfp_mask, nodemask, 2276 page = get_page_from_freelist(gfp_mask, nodemask,
2277 order, zonelist, high_zoneidx, 2277 order, zonelist, high_zoneidx,
2278 alloc_flags & ~ALLOC_NO_WATERMARKS, 2278 alloc_flags & ~ALLOC_NO_WATERMARKS,
2279 preferred_zone, classzone_idx, migratetype); 2279 preferred_zone, classzone_idx, migratetype);
2280 if (page) { 2280 if (page) {
2281 preferred_zone->compact_blockskip_flush = false; 2281 preferred_zone->compact_blockskip_flush = false;
2282 compaction_defer_reset(preferred_zone, order, true); 2282 compaction_defer_reset(preferred_zone, order, true);
2283 count_vm_event(COMPACTSUCCESS); 2283 count_vm_event(COMPACTSUCCESS);
2284 return page; 2284 return page;
2285 } 2285 }
2286 2286
2287 /* 2287 /*
2288 * It's bad if a compaction run occurs and fails. 2288 * It's bad if a compaction run occurs and fails.
2289 * The most likely reason is that pages exist, 2289 * The most likely reason is that pages exist,
2290 * but not enough to satisfy watermarks. 2290 * but not enough to satisfy watermarks.
2291 */ 2291 */
2292 count_vm_event(COMPACTFAIL); 2292 count_vm_event(COMPACTFAIL);
2293 2293
2294 /* 2294 /*
2295 * As async compaction considers a subset of pageblocks, only 2295 * As async compaction considers a subset of pageblocks, only
2296 * defer if the failure was a sync compaction failure. 2296 * defer if the failure was a sync compaction failure.
2297 */ 2297 */
2298 if (mode != MIGRATE_ASYNC) 2298 if (mode != MIGRATE_ASYNC)
2299 defer_compaction(preferred_zone, order); 2299 defer_compaction(preferred_zone, order);
2300 2300
2301 cond_resched(); 2301 cond_resched();
2302 } 2302 }
2303 2303
2304 return NULL; 2304 return NULL;
2305 } 2305 }
2306 #else 2306 #else
2307 static inline struct page * 2307 static inline struct page *
2308 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2308 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2309 struct zonelist *zonelist, enum zone_type high_zoneidx, 2309 struct zonelist *zonelist, enum zone_type high_zoneidx,
2310 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2310 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2311 int classzone_idx, int migratetype, 2311 int classzone_idx, int migratetype,
2312 enum migrate_mode mode, bool *contended_compaction, 2312 enum migrate_mode mode, bool *contended_compaction,
2313 bool *deferred_compaction, unsigned long *did_some_progress) 2313 bool *deferred_compaction, unsigned long *did_some_progress)
2314 { 2314 {
2315 return NULL; 2315 return NULL;
2316 } 2316 }
2317 #endif /* CONFIG_COMPACTION */ 2317 #endif /* CONFIG_COMPACTION */
2318 2318
2319 /* Perform direct synchronous page reclaim */ 2319 /* Perform direct synchronous page reclaim */
2320 static int 2320 static int
2321 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2321 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2322 nodemask_t *nodemask) 2322 nodemask_t *nodemask)
2323 { 2323 {
2324 struct reclaim_state reclaim_state; 2324 struct reclaim_state reclaim_state;
2325 int progress; 2325 int progress;
2326 2326
2327 cond_resched(); 2327 cond_resched();
2328 2328
2329 /* We now go into synchronous reclaim */ 2329 /* We now go into synchronous reclaim */
2330 cpuset_memory_pressure_bump(); 2330 cpuset_memory_pressure_bump();
2331 current->flags |= PF_MEMALLOC; 2331 current->flags |= PF_MEMALLOC;
2332 lockdep_set_current_reclaim_state(gfp_mask); 2332 lockdep_set_current_reclaim_state(gfp_mask);
2333 reclaim_state.reclaimed_slab = 0; 2333 reclaim_state.reclaimed_slab = 0;
2334 current->reclaim_state = &reclaim_state; 2334 current->reclaim_state = &reclaim_state;
2335 2335
2336 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2336 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2337 2337
2338 current->reclaim_state = NULL; 2338 current->reclaim_state = NULL;
2339 lockdep_clear_current_reclaim_state(); 2339 lockdep_clear_current_reclaim_state();
2340 current->flags &= ~PF_MEMALLOC; 2340 current->flags &= ~PF_MEMALLOC;
2341 2341
2342 cond_resched(); 2342 cond_resched();
2343 2343
2344 return progress; 2344 return progress;
2345 } 2345 }
2346 2346
2347 /* The really slow allocator path where we enter direct reclaim */ 2347 /* The really slow allocator path where we enter direct reclaim */
2348 static inline struct page * 2348 static inline struct page *
2349 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2349 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2350 struct zonelist *zonelist, enum zone_type high_zoneidx, 2350 struct zonelist *zonelist, enum zone_type high_zoneidx,
2351 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2351 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2352 int classzone_idx, int migratetype, unsigned long *did_some_progress) 2352 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2353 { 2353 {
2354 struct page *page = NULL; 2354 struct page *page = NULL;
2355 bool drained = false; 2355 bool drained = false;
2356 2356
2357 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2357 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2358 nodemask); 2358 nodemask);
2359 if (unlikely(!(*did_some_progress))) 2359 if (unlikely(!(*did_some_progress)))
2360 return NULL; 2360 return NULL;
2361 2361
2362 /* After successful reclaim, reconsider all zones for allocation */ 2362 /* After successful reclaim, reconsider all zones for allocation */
2363 if (IS_ENABLED(CONFIG_NUMA)) 2363 if (IS_ENABLED(CONFIG_NUMA))
2364 zlc_clear_zones_full(zonelist); 2364 zlc_clear_zones_full(zonelist);
2365 2365
2366 retry: 2366 retry:
2367 page = get_page_from_freelist(gfp_mask, nodemask, order, 2367 page = get_page_from_freelist(gfp_mask, nodemask, order,
2368 zonelist, high_zoneidx, 2368 zonelist, high_zoneidx,
2369 alloc_flags & ~ALLOC_NO_WATERMARKS, 2369 alloc_flags & ~ALLOC_NO_WATERMARKS,
2370 preferred_zone, classzone_idx, 2370 preferred_zone, classzone_idx,
2371 migratetype); 2371 migratetype);
2372 2372
2373 /* 2373 /*
2374 * If an allocation failed after direct reclaim, it could be because 2374 * If an allocation failed after direct reclaim, it could be because
2375 * pages are pinned on the per-cpu lists. Drain them and try again 2375 * pages are pinned on the per-cpu lists. Drain them and try again
2376 */ 2376 */
2377 if (!page && !drained) { 2377 if (!page && !drained) {
2378 drain_all_pages(); 2378 drain_all_pages();
2379 drained = true; 2379 drained = true;
2380 goto retry; 2380 goto retry;
2381 } 2381 }
2382 2382
2383 return page; 2383 return page;
2384 } 2384 }
2385 2385
2386 /* 2386 /*
2387 * This is called in the allocator slow-path if the allocation request is of 2387 * This is called in the allocator slow-path if the allocation request is of
2388 * sufficient urgency to ignore watermarks and take other desperate measures 2388 * sufficient urgency to ignore watermarks and take other desperate measures
2389 */ 2389 */
2390 static inline struct page * 2390 static inline struct page *
2391 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2391 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2392 struct zonelist *zonelist, enum zone_type high_zoneidx, 2392 struct zonelist *zonelist, enum zone_type high_zoneidx,
2393 nodemask_t *nodemask, struct zone *preferred_zone, 2393 nodemask_t *nodemask, struct zone *preferred_zone,
2394 int classzone_idx, int migratetype) 2394 int classzone_idx, int migratetype)
2395 { 2395 {
2396 struct page *page; 2396 struct page *page;
2397 2397
2398 do { 2398 do {
2399 page = get_page_from_freelist(gfp_mask, nodemask, order, 2399 page = get_page_from_freelist(gfp_mask, nodemask, order,
2400 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2400 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2401 preferred_zone, classzone_idx, migratetype); 2401 preferred_zone, classzone_idx, migratetype);
2402 2402
2403 if (!page && gfp_mask & __GFP_NOFAIL) 2403 if (!page && gfp_mask & __GFP_NOFAIL)
2404 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2404 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2405 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2405 } while (!page && (gfp_mask & __GFP_NOFAIL));
2406 2406
2407 return page; 2407 return page;
2408 } 2408 }
2409 2409
2410 static void reset_alloc_batches(struct zonelist *zonelist, 2410 static void reset_alloc_batches(struct zonelist *zonelist,
2411 enum zone_type high_zoneidx, 2411 enum zone_type high_zoneidx,
2412 struct zone *preferred_zone) 2412 struct zone *preferred_zone)
2413 { 2413 {
2414 struct zoneref *z; 2414 struct zoneref *z;
2415 struct zone *zone; 2415 struct zone *zone;
2416 2416
2417 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2417 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2418 /* 2418 /*
2419 * Only reset the batches of zones that were actually 2419 * Only reset the batches of zones that were actually
2420 * considered in the fairness pass, we don't want to 2420 * considered in the fairness pass, we don't want to
2421 * trash fairness information for zones that are not 2421 * trash fairness information for zones that are not
2422 * actually part of this zonelist's round-robin cycle. 2422 * actually part of this zonelist's round-robin cycle.
2423 */ 2423 */
2424 if (!zone_local(preferred_zone, zone)) 2424 if (!zone_local(preferred_zone, zone))
2425 continue; 2425 continue;
2426 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2426 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2427 high_wmark_pages(zone) - low_wmark_pages(zone) - 2427 high_wmark_pages(zone) - low_wmark_pages(zone) -
2428 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2428 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2429 } 2429 }
2430 } 2430 }
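
The delta passed to mod_zone_page_state() above is chosen so that, whatever NR_ALLOC_BATCH currently holds, the counter lands back at high_wmark - low_wmark. A minimal arithmetic sketch with invented watermark values (not kernel code):

#include <stdio.h>

int main(void)
{
	long high_wmark = 1200, low_wmark = 1000;
	long nr_alloc_batch = -37;	/* zone overshot its fair share */

	/* same shape as the mod_zone_page_state() call above */
	nr_alloc_batch += high_wmark - low_wmark - nr_alloc_batch;

	printf("batch after reset: %ld\n", nr_alloc_batch);	/* prints 200 */
	return 0;
}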
2431 2431
2432 static void wake_all_kswapds(unsigned int order, 2432 static void wake_all_kswapds(unsigned int order,
2433 struct zonelist *zonelist, 2433 struct zonelist *zonelist,
2434 enum zone_type high_zoneidx, 2434 enum zone_type high_zoneidx,
2435 struct zone *preferred_zone) 2435 struct zone *preferred_zone)
2436 { 2436 {
2437 struct zoneref *z; 2437 struct zoneref *z;
2438 struct zone *zone; 2438 struct zone *zone;
2439 2439
2440 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2440 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2441 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2441 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2442 } 2442 }
2443 2443
2444 static inline int 2444 static inline int
2445 gfp_to_alloc_flags(gfp_t gfp_mask) 2445 gfp_to_alloc_flags(gfp_t gfp_mask)
2446 { 2446 {
2447 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2447 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2448 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); 2448 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
2449 2449
2450 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2450 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2451 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2451 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2452 2452
2453 /* 2453 /*
2454 * The caller may dip into page reserves a bit more if the caller 2454 * The caller may dip into page reserves a bit more if the caller
2455 * cannot run direct reclaim, or if the caller has realtime scheduling 2455 * cannot run direct reclaim, or if the caller has realtime scheduling
2456 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2456 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2457 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 2457 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
2458 */ 2458 */
2459 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2459 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2460 2460
2461 if (atomic) { 2461 if (atomic) {
2462 /* 2462 /*
2463 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 2463 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
2464 * if it can't schedule. 2464 * if it can't schedule.
2465 */ 2465 */
2466 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2466 if (!(gfp_mask & __GFP_NOMEMALLOC))
2467 alloc_flags |= ALLOC_HARDER; 2467 alloc_flags |= ALLOC_HARDER;
2468 /* 2468 /*
2469 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2469 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2470 * comment for __cpuset_node_allowed_softwall(). 2470 * comment for __cpuset_node_allowed_softwall().
2471 */ 2471 */
2472 alloc_flags &= ~ALLOC_CPUSET; 2472 alloc_flags &= ~ALLOC_CPUSET;
2473 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2473 } else if (unlikely(rt_task(current)) && !in_interrupt())
2474 alloc_flags |= ALLOC_HARDER; 2474 alloc_flags |= ALLOC_HARDER;
2475 2475
2476 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2476 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2477 if (gfp_mask & __GFP_MEMALLOC) 2477 if (gfp_mask & __GFP_MEMALLOC)
2478 alloc_flags |= ALLOC_NO_WATERMARKS; 2478 alloc_flags |= ALLOC_NO_WATERMARKS;
2479 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2479 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2480 alloc_flags |= ALLOC_NO_WATERMARKS; 2480 alloc_flags |= ALLOC_NO_WATERMARKS;
2481 else if (!in_interrupt() && 2481 else if (!in_interrupt() &&
2482 ((current->flags & PF_MEMALLOC) || 2482 ((current->flags & PF_MEMALLOC) ||
2483 unlikely(test_thread_flag(TIF_MEMDIE)))) 2483 unlikely(test_thread_flag(TIF_MEMDIE))))
2484 alloc_flags |= ALLOC_NO_WATERMARKS; 2484 alloc_flags |= ALLOC_NO_WATERMARKS;
2485 } 2485 }
2486 #ifdef CONFIG_CMA 2486 #ifdef CONFIG_CMA
2487 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2487 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2488 alloc_flags |= ALLOC_CMA; 2488 alloc_flags |= ALLOC_CMA;
2489 #endif 2489 #endif
2490 return alloc_flags; 2490 return alloc_flags;
2491 } 2491 }
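
A minimal userspace sketch of the decisions above for two common cases, with invented stand-in bit values; only the branch structure mirrors gfp_to_alloc_flags(), and the real flag values, the rt_task() case and the ALLOC_NO_WATERMARKS block live in the function itself.

#include <stdio.h>
#include <stdbool.h>

enum { WMARK_MIN = 1, HIGH = 2, HARDER = 4, CPUSET = 8 };	/* stand-ins */

static int to_alloc_flags(bool can_wait, bool gfp_high, bool nomemalloc)
{
	int flags = WMARK_MIN | CPUSET;
	bool atomic = !can_wait;	/* __GFP_NO_KSWAPD ignored in this sketch */

	if (gfp_high)
		flags |= HIGH;		/* mirrors __GFP_HIGH -> ALLOC_HIGH */
	if (atomic) {
		if (!nomemalloc)
			flags |= HARDER;
		flags &= ~CPUSET;	/* don't fail GFP_ATOMIC on cpusets */
	}
	return flags;
}

int main(void)
{
	/* GFP_KERNEL-like: can wait, __GFP_HIGH clear */
	printf("GFP_KERNEL-like -> 0x%x\n", to_alloc_flags(true, false, false));
	/* GFP_ATOMIC-like: cannot wait, __GFP_HIGH set */
	printf("GFP_ATOMIC-like -> 0x%x\n", to_alloc_flags(false, true, false));
	return 0;
}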
2492 2492
2493 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2493 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2494 { 2494 {
2495 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2495 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2496 } 2496 }
2497 2497
2498 static inline struct page * 2498 static inline struct page *
2499 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2499 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2500 struct zonelist *zonelist, enum zone_type high_zoneidx, 2500 struct zonelist *zonelist, enum zone_type high_zoneidx,
2501 nodemask_t *nodemask, struct zone *preferred_zone, 2501 nodemask_t *nodemask, struct zone *preferred_zone,
2502 int classzone_idx, int migratetype) 2502 int classzone_idx, int migratetype)
2503 { 2503 {
2504 const gfp_t wait = gfp_mask & __GFP_WAIT; 2504 const gfp_t wait = gfp_mask & __GFP_WAIT;
2505 struct page *page = NULL; 2505 struct page *page = NULL;
2506 int alloc_flags; 2506 int alloc_flags;
2507 unsigned long pages_reclaimed = 0; 2507 unsigned long pages_reclaimed = 0;
2508 unsigned long did_some_progress; 2508 unsigned long did_some_progress;
2509 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2509 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2510 bool deferred_compaction = false; 2510 bool deferred_compaction = false;
2511 bool contended_compaction = false; 2511 bool contended_compaction = false;
2512 2512
2513 /* 2513 /*
2514 * In the slowpath, we sanity check order to avoid ever trying to 2514 * In the slowpath, we sanity check order to avoid ever trying to
2515 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2515 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2516 * be using allocators in order of preference for an area that is 2516 * be using allocators in order of preference for an area that is
2517 * too large. 2517 * too large.
2518 */ 2518 */
2519 if (order >= MAX_ORDER) { 2519 if (order >= MAX_ORDER) {
2520 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2520 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2521 return NULL; 2521 return NULL;
2522 } 2522 }
2523 2523
2524 /* 2524 /*
2525 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2525 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2526 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2526 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2527 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2527 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2528 * using a larger set of nodes after it has established that the 2528 * using a larger set of nodes after it has established that the
2529 * allowed per node queues are empty and that nodes are 2529 * allowed per node queues are empty and that nodes are
2530 * over allocated. 2530 * over allocated.
2531 */ 2531 */
2532 if (IS_ENABLED(CONFIG_NUMA) && 2532 if (IS_ENABLED(CONFIG_NUMA) &&
2533 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2533 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2534 goto nopage; 2534 goto nopage;
2535 2535
2536 restart: 2536 restart:
2537 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2537 if (!(gfp_mask & __GFP_NO_KSWAPD))
2538 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2538 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
2539 2539
2540 /* 2540 /*
2541 * OK, we're below the kswapd watermark and have kicked background 2541 * OK, we're below the kswapd watermark and have kicked background
2542 * reclaim. Now things get more complex, so set up alloc_flags according 2542 * reclaim. Now things get more complex, so set up alloc_flags according
2543 * to how we want to proceed. 2543 * to how we want to proceed.
2544 */ 2544 */
2545 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2545 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2546 2546
2547 /* 2547 /*
2548 * Find the true preferred zone if the allocation is unconstrained by 2548 * Find the true preferred zone if the allocation is unconstrained by
2549 * cpusets. 2549 * cpusets.
2550 */ 2550 */
2551 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { 2551 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
2552 struct zoneref *preferred_zoneref; 2552 struct zoneref *preferred_zoneref;
2553 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2553 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2554 NULL, 2554 NULL,
2555 &preferred_zone); 2555 &preferred_zone);
2556 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2556 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2557 } 2557 }
2558 2558
2559 rebalance: 2559 rebalance:
2560 /* This is the last chance, in general, before the goto nopage. */ 2560 /* This is the last chance, in general, before the goto nopage. */
2561 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2561 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2562 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2562 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2563 preferred_zone, classzone_idx, migratetype); 2563 preferred_zone, classzone_idx, migratetype);
2564 if (page) 2564 if (page)
2565 goto got_pg; 2565 goto got_pg;
2566 2566
2567 /* Allocate without watermarks if the context allows */ 2567 /* Allocate without watermarks if the context allows */
2568 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2568 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2569 /* 2569 /*
2570 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2570 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2571 * the allocation is high priority and these types of 2571 * the allocation is high priority and these types of
2572 * allocations are system rather than user oriented 2572 * allocations are system rather than user oriented
2573 */ 2573 */
2574 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2574 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2575 2575
2576 page = __alloc_pages_high_priority(gfp_mask, order, 2576 page = __alloc_pages_high_priority(gfp_mask, order,
2577 zonelist, high_zoneidx, nodemask, 2577 zonelist, high_zoneidx, nodemask,
2578 preferred_zone, classzone_idx, migratetype); 2578 preferred_zone, classzone_idx, migratetype);
2579 if (page) { 2579 if (page) {
2580 goto got_pg; 2580 goto got_pg;
2581 } 2581 }
2582 } 2582 }
2583 2583
2584 /* Atomic allocations - we can't balance anything */ 2584 /* Atomic allocations - we can't balance anything */
2585 if (!wait) 2585 if (!wait)
2586 goto nopage; 2586 goto nopage;
2587 2587
2588 /* Avoid recursion of direct reclaim */ 2588 /* Avoid recursion of direct reclaim */
2589 if (current->flags & PF_MEMALLOC) 2589 if (current->flags & PF_MEMALLOC)
2590 goto nopage; 2590 goto nopage;
2591 2591
2592 /* Avoid allocations with no watermarks from looping endlessly */ 2592 /* Avoid allocations with no watermarks from looping endlessly */
2593 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2593 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2594 goto nopage; 2594 goto nopage;
2595 2595
2596 /* 2596 /*
2597 * Try direct compaction. The first pass is asynchronous. Subsequent 2597 * Try direct compaction. The first pass is asynchronous. Subsequent
2598 * attempts after direct reclaim are synchronous 2598 * attempts after direct reclaim are synchronous
2599 */ 2599 */
2600 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2600 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2601 high_zoneidx, nodemask, alloc_flags, 2601 high_zoneidx, nodemask, alloc_flags,
2602 preferred_zone, 2602 preferred_zone,
2603 classzone_idx, migratetype, 2603 classzone_idx, migratetype,
2604 migration_mode, &contended_compaction, 2604 migration_mode, &contended_compaction,
2605 &deferred_compaction, 2605 &deferred_compaction,
2606 &did_some_progress); 2606 &did_some_progress);
2607 if (page) 2607 if (page)
2608 goto got_pg; 2608 goto got_pg;
2609 migration_mode = MIGRATE_SYNC_LIGHT; 2609 migration_mode = MIGRATE_SYNC_LIGHT;
2610 2610
2611 /* 2611 /*
2612 * If compaction is deferred for high-order allocations, it is because 2612 * If compaction is deferred for high-order allocations, it is because
2613 * sync compaction recently failed. If this is the case and the caller 2613 * sync compaction recently failed. If this is the case and the caller
2614 * requested a movable allocation that does not heavily disrupt the 2614 * requested a movable allocation that does not heavily disrupt the
2615 * system then fail the allocation instead of entering direct reclaim. 2615 * system then fail the allocation instead of entering direct reclaim.
2616 */ 2616 */
2617 if ((deferred_compaction || contended_compaction) && 2617 if ((deferred_compaction || contended_compaction) &&
2618 (gfp_mask & __GFP_NO_KSWAPD)) 2618 (gfp_mask & __GFP_NO_KSWAPD))
2619 goto nopage; 2619 goto nopage;
2620 2620
2621 /* Try direct reclaim and then allocating */ 2621 /* Try direct reclaim and then allocating */
2622 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2622 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2623 zonelist, high_zoneidx, 2623 zonelist, high_zoneidx,
2624 nodemask, 2624 nodemask,
2625 alloc_flags, preferred_zone, 2625 alloc_flags, preferred_zone,
2626 classzone_idx, migratetype, 2626 classzone_idx, migratetype,
2627 &did_some_progress); 2627 &did_some_progress);
2628 if (page) 2628 if (page)
2629 goto got_pg; 2629 goto got_pg;
2630 2630
2631 /* 2631 /*
2632 * If we failed to make any progress reclaiming, then we are 2632 * If we failed to make any progress reclaiming, then we are
2633 * running out of options and have to consider going OOM 2633 * running out of options and have to consider going OOM
2634 */ 2634 */
2635 if (!did_some_progress) { 2635 if (!did_some_progress) {
2636 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2636 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2637 if (oom_killer_disabled) 2637 if (oom_killer_disabled)
2638 goto nopage; 2638 goto nopage;
2639 /* Coredumps can quickly deplete all memory reserves */ 2639 /* Coredumps can quickly deplete all memory reserves */
2640 if ((current->flags & PF_DUMPCORE) && 2640 if ((current->flags & PF_DUMPCORE) &&
2641 !(gfp_mask & __GFP_NOFAIL)) 2641 !(gfp_mask & __GFP_NOFAIL))
2642 goto nopage; 2642 goto nopage;
2643 page = __alloc_pages_may_oom(gfp_mask, order, 2643 page = __alloc_pages_may_oom(gfp_mask, order,
2644 zonelist, high_zoneidx, 2644 zonelist, high_zoneidx,
2645 nodemask, preferred_zone, 2645 nodemask, preferred_zone,
2646 classzone_idx, migratetype); 2646 classzone_idx, migratetype);
2647 if (page) 2647 if (page)
2648 goto got_pg; 2648 goto got_pg;
2649 2649
2650 if (!(gfp_mask & __GFP_NOFAIL)) { 2650 if (!(gfp_mask & __GFP_NOFAIL)) {
2651 /* 2651 /*
2652 * The oom killer is not called for high-order 2652 * The oom killer is not called for high-order
2653 * allocations that may fail, so if no progress 2653 * allocations that may fail, so if no progress
2654 * is being made, there are no other options and 2654 * is being made, there are no other options and
2655 * retrying is unlikely to help. 2655 * retrying is unlikely to help.
2656 */ 2656 */
2657 if (order > PAGE_ALLOC_COSTLY_ORDER) 2657 if (order > PAGE_ALLOC_COSTLY_ORDER)
2658 goto nopage; 2658 goto nopage;
2659 /* 2659 /*
2660 * The oom killer is not called for lowmem 2660 * The oom killer is not called for lowmem
2661 * allocations to prevent needlessly killing 2661 * allocations to prevent needlessly killing
2662 * innocent tasks. 2662 * innocent tasks.
2663 */ 2663 */
2664 if (high_zoneidx < ZONE_NORMAL) 2664 if (high_zoneidx < ZONE_NORMAL)
2665 goto nopage; 2665 goto nopage;
2666 } 2666 }
2667 2667
2668 goto restart; 2668 goto restart;
2669 } 2669 }
2670 } 2670 }
2671 2671
2672 /* Check if we should retry the allocation */ 2672 /* Check if we should retry the allocation */
2673 pages_reclaimed += did_some_progress; 2673 pages_reclaimed += did_some_progress;
2674 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2674 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2675 pages_reclaimed)) { 2675 pages_reclaimed)) {
2676 /* Wait for some write requests to complete then retry */ 2676 /* Wait for some write requests to complete then retry */
2677 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2677 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2678 goto rebalance; 2678 goto rebalance;
2679 } else { 2679 } else {
2680 /* 2680 /*
2681 * High-order allocations do not necessarily loop after 2681 * High-order allocations do not necessarily loop after
2682 * direct reclaim and reclaim/compaction depends on compaction 2682 * direct reclaim and reclaim/compaction depends on compaction
2683 * being called after reclaim so call directly if necessary 2683 * being called after reclaim so call directly if necessary
2684 */ 2684 */
2685 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2685 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2686 high_zoneidx, nodemask, alloc_flags, 2686 high_zoneidx, nodemask, alloc_flags,
2687 preferred_zone, 2687 preferred_zone,
2688 classzone_idx, migratetype, 2688 classzone_idx, migratetype,
2689 migration_mode, &contended_compaction, 2689 migration_mode, &contended_compaction,
2690 &deferred_compaction, 2690 &deferred_compaction,
2691 &did_some_progress); 2691 &did_some_progress);
2692 if (page) 2692 if (page)
2693 goto got_pg; 2693 goto got_pg;
2694 } 2694 }
2695 2695
2696 nopage: 2696 nopage:
2697 warn_alloc_failed(gfp_mask, order, NULL); 2697 warn_alloc_failed(gfp_mask, order, NULL);
2698 return page; 2698 return page;
2699 got_pg: 2699 got_pg:
2700 if (kmemcheck_enabled) 2700 if (kmemcheck_enabled)
2701 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2701 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2702 2702
2703 return page; 2703 return page;
2704 } 2704 }
2705 2705
2706 /* 2706 /*
2707 * This is the 'heart' of the zoned buddy allocator. 2707 * This is the 'heart' of the zoned buddy allocator.
2708 */ 2708 */
2709 struct page * 2709 struct page *
2710 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2710 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2711 struct zonelist *zonelist, nodemask_t *nodemask) 2711 struct zonelist *zonelist, nodemask_t *nodemask)
2712 { 2712 {
2713 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2713 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2714 struct zone *preferred_zone; 2714 struct zone *preferred_zone;
2715 struct zoneref *preferred_zoneref; 2715 struct zoneref *preferred_zoneref;
2716 struct page *page = NULL; 2716 struct page *page = NULL;
2717 int migratetype = allocflags_to_migratetype(gfp_mask); 2717 int migratetype = allocflags_to_migratetype(gfp_mask);
2718 unsigned int cpuset_mems_cookie; 2718 unsigned int cpuset_mems_cookie;
2719 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2719 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2720 struct mem_cgroup *memcg = NULL; 2720 struct mem_cgroup *memcg = NULL;
2721 int classzone_idx; 2721 int classzone_idx;
2722 2722
2723 gfp_mask &= gfp_allowed_mask; 2723 gfp_mask &= gfp_allowed_mask;
2724 2724
2725 lockdep_trace_alloc(gfp_mask); 2725 lockdep_trace_alloc(gfp_mask);
2726 2726
2727 might_sleep_if(gfp_mask & __GFP_WAIT); 2727 might_sleep_if(gfp_mask & __GFP_WAIT);
2728 2728
2729 if (should_fail_alloc_page(gfp_mask, order)) 2729 if (should_fail_alloc_page(gfp_mask, order))
2730 return NULL; 2730 return NULL;
2731 2731
2732 /* 2732 /*
2733 * Check the zones suitable for the gfp_mask contain at least one 2733 * Check the zones suitable for the gfp_mask contain at least one
2734 * valid zone. It's possible to have an empty zonelist as a result 2734 * valid zone. It's possible to have an empty zonelist as a result
2735 * of GFP_THISNODE and a memoryless node 2735 * of GFP_THISNODE and a memoryless node
2736 */ 2736 */
2737 if (unlikely(!zonelist->_zonerefs->zone)) 2737 if (unlikely(!zonelist->_zonerefs->zone))
2738 return NULL; 2738 return NULL;
2739 2739
2740 /* 2740 /*
2741 * Will only have any effect when __GFP_KMEMCG is set. This is 2741 * Will only have any effect when __GFP_KMEMCG is set. This is
2742 * verified in the (always inline) callee 2742 * verified in the (always inline) callee
2743 */ 2743 */
2744 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2744 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2745 return NULL; 2745 return NULL;
2746 2746
2747 retry_cpuset: 2747 retry_cpuset:
2748 cpuset_mems_cookie = read_mems_allowed_begin(); 2748 cpuset_mems_cookie = read_mems_allowed_begin();
2749 2749
2750 /* The preferred zone is used for statistics later */ 2750 /* The preferred zone is used for statistics later */
2751 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2751 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2752 nodemask ? : &cpuset_current_mems_allowed, 2752 nodemask ? : &cpuset_current_mems_allowed,
2753 &preferred_zone); 2753 &preferred_zone);
2754 if (!preferred_zone) 2754 if (!preferred_zone)
2755 goto out; 2755 goto out;
2756 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2756 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2757 2757
2758 #ifdef CONFIG_CMA 2758 #ifdef CONFIG_CMA
2759 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2759 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2760 alloc_flags |= ALLOC_CMA; 2760 alloc_flags |= ALLOC_CMA;
2761 #endif 2761 #endif
2762 retry: 2762 retry:
2763 /* First allocation attempt */ 2763 /* First allocation attempt */
2764 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2764 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2765 zonelist, high_zoneidx, alloc_flags, 2765 zonelist, high_zoneidx, alloc_flags,
2766 preferred_zone, classzone_idx, migratetype); 2766 preferred_zone, classzone_idx, migratetype);
2767 if (unlikely(!page)) { 2767 if (unlikely(!page)) {
2768 /* 2768 /*
2769 * The first pass makes sure allocations are spread 2769 * The first pass makes sure allocations are spread
2770 * fairly within the local node. However, the local 2770 * fairly within the local node. However, the local
2771 * node might have free pages left after the fairness 2771 * node might have free pages left after the fairness
2772 * batches are exhausted, and remote zones haven't 2772 * batches are exhausted, and remote zones haven't
2773 * even been considered yet. Try once more without 2773 * even been considered yet. Try once more without
2774 * fairness, and include remote zones now, before 2774 * fairness, and include remote zones now, before
2775 * entering the slowpath and waking kswapd: prefer 2775 * entering the slowpath and waking kswapd: prefer
2776 * spilling to a remote zone over swapping locally. 2776 * spilling to a remote zone over swapping locally.
2777 */ 2777 */
2778 if (alloc_flags & ALLOC_FAIR) { 2778 if (alloc_flags & ALLOC_FAIR) {
2779 reset_alloc_batches(zonelist, high_zoneidx, 2779 reset_alloc_batches(zonelist, high_zoneidx,
2780 preferred_zone); 2780 preferred_zone);
2781 alloc_flags &= ~ALLOC_FAIR; 2781 alloc_flags &= ~ALLOC_FAIR;
2782 goto retry; 2782 goto retry;
2783 } 2783 }
2784 /* 2784 /*
2785 * Runtime PM, block IO and its error handling path 2785 * Runtime PM, block IO and its error handling path
2786 * can deadlock because I/O on the device might not 2786 * can deadlock because I/O on the device might not
2787 * complete. 2787 * complete.
2788 */ 2788 */
2789 gfp_mask = memalloc_noio_flags(gfp_mask); 2789 gfp_mask = memalloc_noio_flags(gfp_mask);
2790 page = __alloc_pages_slowpath(gfp_mask, order, 2790 page = __alloc_pages_slowpath(gfp_mask, order,
2791 zonelist, high_zoneidx, nodemask, 2791 zonelist, high_zoneidx, nodemask,
2792 preferred_zone, classzone_idx, migratetype); 2792 preferred_zone, classzone_idx, migratetype);
2793 } 2793 }
2794 2794
2795 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2795 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2796 2796
2797 out: 2797 out:
2798 /* 2798 /*
2799 * When updating a task's mems_allowed, it is possible to race with 2799 * When updating a task's mems_allowed, it is possible to race with
2800 * parallel threads in such a way that an allocation can fail while 2800 * parallel threads in such a way that an allocation can fail while
2801 * the mask is being updated. If a page allocation is about to fail, 2801 * the mask is being updated. If a page allocation is about to fail,
2802 * check if the cpuset changed during allocation and if so, retry. 2802 * check if the cpuset changed during allocation and if so, retry.
2803 */ 2803 */
2804 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2804 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2805 goto retry_cpuset; 2805 goto retry_cpuset;
2806 2806
2807 memcg_kmem_commit_charge(page, memcg, order); 2807 memcg_kmem_commit_charge(page, memcg, order);
2808 2808
2809 return page; 2809 return page;
2810 } 2810 }
2811 EXPORT_SYMBOL(__alloc_pages_nodemask); 2811 EXPORT_SYMBOL(__alloc_pages_nodemask);
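The retry in __alloc_pages_nodemask() above resets the per-zone fairness batches and clears ALLOC_FAIR before trying the freelists again, so remote zones get considered before kswapd is woken. Below is a minimal userspace sketch of that two-pass idea (hypothetical struct and function names, not kernel code): pass one only takes from zones that still have batch credit, pass two ignores the batches.

    #include <stdio.h>
    #include <stdbool.h>

    /* Hypothetical model: each "zone" has free pages and a fairness batch. */
    struct zone_model { const char *name; long free_pages; long alloc_batch; };

    /* Pass 1 honours the batch; pass 2 (fair == false) ignores it. */
    static struct zone_model *pick_zone(struct zone_model *z, int nr, bool fair)
    {
        for (int i = 0; i < nr; i++) {
            if (z[i].free_pages <= 0)
                continue;
            if (fair && z[i].alloc_batch <= 0)
                continue;
            return &z[i];
        }
        return NULL;
    }

    int main(void)
    {
        struct zone_model zones[] = {
            { "Normal", 100, 0 },   /* batch exhausted but pages left */
            { "DMA32",   50, 0 },
        };
        struct zone_model *z = pick_zone(zones, 2, true);
        if (!z) {
            /* Mirrors clearing ALLOC_FAIR and retrying before the slowpath. */
            for (int i = 0; i < 2; i++)
                zones[i].alloc_batch = zones[i].free_pages; /* reset_alloc_batches analogue */
            z = pick_zone(zones, 2, false);
        }
        printf("allocated from %s\n", z ? z->name : "nowhere");
        return 0;
    }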
2812 2812
2813 /* 2813 /*
2814 * Common helper functions. 2814 * Common helper functions.
2815 */ 2815 */
2816 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2816 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2817 { 2817 {
2818 struct page *page; 2818 struct page *page;
2819 2819
2820 /* 2820 /*
2821 * __get_free_pages() returns a 32-bit address, which cannot represent 2821 * __get_free_pages() returns a 32-bit address, which cannot represent
2822 * a highmem page 2822 * a highmem page
2823 */ 2823 */
2824 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2824 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2825 2825
2826 page = alloc_pages(gfp_mask, order); 2826 page = alloc_pages(gfp_mask, order);
2827 if (!page) 2827 if (!page)
2828 return 0; 2828 return 0;
2829 return (unsigned long) page_address(page); 2829 return (unsigned long) page_address(page);
2830 } 2830 }
2831 EXPORT_SYMBOL(__get_free_pages); 2831 EXPORT_SYMBOL(__get_free_pages);
2832 2832
2833 unsigned long get_zeroed_page(gfp_t gfp_mask) 2833 unsigned long get_zeroed_page(gfp_t gfp_mask)
2834 { 2834 {
2835 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2835 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2836 } 2836 }
2837 EXPORT_SYMBOL(get_zeroed_page); 2837 EXPORT_SYMBOL(get_zeroed_page);
2838 2838
2839 void __free_pages(struct page *page, unsigned int order) 2839 void __free_pages(struct page *page, unsigned int order)
2840 { 2840 {
2841 if (put_page_testzero(page)) { 2841 if (put_page_testzero(page)) {
2842 if (order == 0) 2842 if (order == 0)
2843 free_hot_cold_page(page, false); 2843 free_hot_cold_page(page, false);
2844 else 2844 else
2845 __free_pages_ok(page, order); 2845 __free_pages_ok(page, order);
2846 } 2846 }
2847 } 2847 }
2848 2848
2849 EXPORT_SYMBOL(__free_pages); 2849 EXPORT_SYMBOL(__free_pages);
2850 2850
2851 void free_pages(unsigned long addr, unsigned int order) 2851 void free_pages(unsigned long addr, unsigned int order)
2852 { 2852 {
2853 if (addr != 0) { 2853 if (addr != 0) {
2854 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2854 VM_BUG_ON(!virt_addr_valid((void *)addr));
2855 __free_pages(virt_to_page((void *)addr), order); 2855 __free_pages(virt_to_page((void *)addr), order);
2856 } 2856 }
2857 } 2857 }
2858 2858
2859 EXPORT_SYMBOL(free_pages); 2859 EXPORT_SYMBOL(free_pages);
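Because __get_free_pages() hands back a kernel virtual address rather than a struct page, the matching release is free_pages() with the same order. A short illustrative in-kernel usage sketch, with error handling trimmed and a made-up function name:

    #include <linux/errno.h>
    #include <linux/gfp.h>

    /* Allocate and release two contiguous pages (order 1) by address. */
    static int example_buffer_roundtrip(void)
    {
        unsigned long addr = __get_free_pages(GFP_KERNEL, 1);

        if (!addr)
            return -ENOMEM;

        /* ... use the 2 * PAGE_SIZE buffer at (void *)addr ... */

        free_pages(addr, 1);    /* the order must match the allocation */
        return 0;
    }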
2860 2860
2861 /* 2861 /*
2862 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2862 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2863 * pages allocated with __GFP_KMEMCG. 2863 * pages allocated with __GFP_KMEMCG.
2864 * 2864 *
2865 * Those pages are accounted to a particular memcg, embedded in the 2865 * Those pages are accounted to a particular memcg, embedded in the
2866 * corresponding page_cgroup. To avoid adding a hit in the allocator to search 2866 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2867 * for that information only to find out that it is NULL for users who have no 2867 * for that information only to find out that it is NULL for users who have no
2868 * interest in that whatsoever, we provide these functions. 2868 * interest in that whatsoever, we provide these functions.
2869 * 2869 *
2870 * The caller knows better which flags it relies on. 2870 * The caller knows better which flags it relies on.
2871 */ 2871 */
2872 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2872 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2873 { 2873 {
2874 memcg_kmem_uncharge_pages(page, order); 2874 memcg_kmem_uncharge_pages(page, order);
2875 __free_pages(page, order); 2875 __free_pages(page, order);
2876 } 2876 }
2877 2877
2878 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2878 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2879 { 2879 {
2880 if (addr != 0) { 2880 if (addr != 0) {
2881 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2881 VM_BUG_ON(!virt_addr_valid((void *)addr));
2882 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2882 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2883 } 2883 }
2884 } 2884 }
2885 2885
2886 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2886 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2887 { 2887 {
2888 if (addr) { 2888 if (addr) {
2889 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2889 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2890 unsigned long used = addr + PAGE_ALIGN(size); 2890 unsigned long used = addr + PAGE_ALIGN(size);
2891 2891
2892 split_page(virt_to_page((void *)addr), order); 2892 split_page(virt_to_page((void *)addr), order);
2893 while (used < alloc_end) { 2893 while (used < alloc_end) {
2894 free_page(used); 2894 free_page(used);
2895 used += PAGE_SIZE; 2895 used += PAGE_SIZE;
2896 } 2896 }
2897 } 2897 }
2898 return (void *)addr; 2898 return (void *)addr;
2899 } 2899 }
2900 2900
2901 /** 2901 /**
2902 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2902 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2903 * @size: the number of bytes to allocate 2903 * @size: the number of bytes to allocate
2904 * @gfp_mask: GFP flags for the allocation 2904 * @gfp_mask: GFP flags for the allocation
2905 * 2905 *
2906 * This function is similar to alloc_pages(), except that it allocates the 2906 * This function is similar to alloc_pages(), except that it allocates the
2907 * minimum number of pages to satisfy the request. alloc_pages() can only 2907 * minimum number of pages to satisfy the request. alloc_pages() can only
2908 * allocate memory in power-of-two pages. 2908 * allocate memory in power-of-two pages.
2909 * 2909 *
2910 * This function is also limited by MAX_ORDER. 2910 * This function is also limited by MAX_ORDER.
2911 * 2911 *
2912 * Memory allocated by this function must be released by free_pages_exact(). 2912 * Memory allocated by this function must be released by free_pages_exact().
2913 */ 2913 */
2914 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2914 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2915 { 2915 {
2916 unsigned int order = get_order(size); 2916 unsigned int order = get_order(size);
2917 unsigned long addr; 2917 unsigned long addr;
2918 2918
2919 addr = __get_free_pages(gfp_mask, order); 2919 addr = __get_free_pages(gfp_mask, order);
2920 return make_alloc_exact(addr, order, size); 2920 return make_alloc_exact(addr, order, size);
2921 } 2921 }
2922 EXPORT_SYMBOL(alloc_pages_exact); 2922 EXPORT_SYMBOL(alloc_pages_exact);
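alloc_pages_exact() rounds the request up to a power-of-two order, and make_alloc_exact() then splits the block and gives back every page beyond PAGE_ALIGN(size). A small userspace model of that arithmetic, assuming 4 KiB pages and reimplementing get_order() locally for illustration:

    #include <stdio.h>

    #define MODEL_PAGE_SHIFT 12                     /* assume 4 KiB pages */
    #define MODEL_PAGE_SIZE  (1UL << MODEL_PAGE_SHIFT)

    /* Smallest order whose block (2^order pages) covers 'size' bytes. */
    static unsigned int model_get_order(unsigned long size)
    {
        unsigned int order = 0;
        while ((MODEL_PAGE_SIZE << order) < size)
            order++;
        return order;
    }

    int main(void)
    {
        unsigned long size = 5 * MODEL_PAGE_SIZE + 123;      /* wants 6 pages */
        unsigned int order = model_get_order(size);          /* order 3 = 8 pages */
        unsigned long used = (size + MODEL_PAGE_SIZE - 1) & ~(MODEL_PAGE_SIZE - 1);
        unsigned long allocated = MODEL_PAGE_SIZE << order;

        printf("order %u block: %lu pages, kept %lu, freed back %lu tail pages\n",
               order, allocated >> MODEL_PAGE_SHIFT,
               used >> MODEL_PAGE_SHIFT,
               (allocated - used) >> MODEL_PAGE_SHIFT);
        return 0;
    }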
2923 2923
2924 /** 2924 /**
2925 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2925 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2926 * pages on a node. 2926 * pages on a node.
2927 * @nid: the preferred node ID where memory should be allocated 2927 * @nid: the preferred node ID where memory should be allocated
2928 * @size: the number of bytes to allocate 2928 * @size: the number of bytes to allocate
2929 * @gfp_mask: GFP flags for the allocation 2929 * @gfp_mask: GFP flags for the allocation
2930 * 2930 *
2931 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2931 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2932 * back. 2932 * back.
2933 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2933 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2934 * but is not exact. 2934 * but is not exact.
2935 */ 2935 */
2936 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2936 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2937 { 2937 {
2938 unsigned order = get_order(size); 2938 unsigned order = get_order(size);
2939 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2939 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2940 if (!p) 2940 if (!p)
2941 return NULL; 2941 return NULL;
2942 return make_alloc_exact((unsigned long)page_address(p), order, size); 2942 return make_alloc_exact((unsigned long)page_address(p), order, size);
2943 } 2943 }
2944 EXPORT_SYMBOL(alloc_pages_exact_nid); 2944 EXPORT_SYMBOL(alloc_pages_exact_nid);
2945 2945
2946 /** 2946 /**
2947 * free_pages_exact - release memory allocated via alloc_pages_exact() 2947 * free_pages_exact - release memory allocated via alloc_pages_exact()
2948 * @virt: the value returned by alloc_pages_exact. 2948 * @virt: the value returned by alloc_pages_exact.
2949 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2949 * @size: size of allocation, same value as passed to alloc_pages_exact().
2950 * 2950 *
2951 * Release the memory allocated by a previous call to alloc_pages_exact. 2951 * Release the memory allocated by a previous call to alloc_pages_exact.
2952 */ 2952 */
2953 void free_pages_exact(void *virt, size_t size) 2953 void free_pages_exact(void *virt, size_t size)
2954 { 2954 {
2955 unsigned long addr = (unsigned long)virt; 2955 unsigned long addr = (unsigned long)virt;
2956 unsigned long end = addr + PAGE_ALIGN(size); 2956 unsigned long end = addr + PAGE_ALIGN(size);
2957 2957
2958 while (addr < end) { 2958 while (addr < end) {
2959 free_page(addr); 2959 free_page(addr);
2960 addr += PAGE_SIZE; 2960 addr += PAGE_SIZE;
2961 } 2961 }
2962 } 2962 }
2963 EXPORT_SYMBOL(free_pages_exact); 2963 EXPORT_SYMBOL(free_pages_exact);
2964 2964
2965 /** 2965 /**
2966 * nr_free_zone_pages - count number of pages beyond high watermark 2966 * nr_free_zone_pages - count number of pages beyond high watermark
2967 * @offset: The zone index of the highest zone 2967 * @offset: The zone index of the highest zone
2968 * 2968 *
2969 * nr_free_zone_pages() counts the number of pages which are beyond the 2969 * nr_free_zone_pages() counts the number of pages which are beyond the
2970 * high watermark within all zones at or below a given zone index. For each 2970 * high watermark within all zones at or below a given zone index. For each
2971 * zone, the number of pages is calculated as: 2971 * zone, the number of pages is calculated as:
2972 * managed_pages - high_pages 2972 * managed_pages - high_pages
2973 */ 2973 */
2974 static unsigned long nr_free_zone_pages(int offset) 2974 static unsigned long nr_free_zone_pages(int offset)
2975 { 2975 {
2976 struct zoneref *z; 2976 struct zoneref *z;
2977 struct zone *zone; 2977 struct zone *zone;
2978 2978
2979 /* Just pick one node, since fallback list is circular */ 2979 /* Just pick one node, since fallback list is circular */
2980 unsigned long sum = 0; 2980 unsigned long sum = 0;
2981 2981
2982 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2982 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2983 2983
2984 for_each_zone_zonelist(zone, z, zonelist, offset) { 2984 for_each_zone_zonelist(zone, z, zonelist, offset) {
2985 unsigned long size = zone->managed_pages; 2985 unsigned long size = zone->managed_pages;
2986 unsigned long high = high_wmark_pages(zone); 2986 unsigned long high = high_wmark_pages(zone);
2987 if (size > high) 2987 if (size > high)
2988 sum += size - high; 2988 sum += size - high;
2989 } 2989 }
2990 2990
2991 return sum; 2991 return sum;
2992 } 2992 }
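nr_free_zone_pages() is effectively a clamped sum of managed_pages minus the high watermark over the zones the zonelist covers. A userspace model of the calculation, with made-up per-zone numbers:

    #include <stdio.h>

    /* Hypothetical per-zone figures, in pages. */
    struct zone_model { const char *name; unsigned long managed; unsigned long high_wmark; };

    int main(void)
    {
        struct zone_model zones[] = {
            { "DMA32",   491520,  1200 },
            { "Normal", 3670016,  9000 },
        };
        unsigned long sum = 0;

        for (unsigned i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
            /* Only count what lies beyond the high watermark. */
            if (zones[i].managed > zones[i].high_wmark)
                sum += zones[i].managed - zones[i].high_wmark;
        }
        printf("pages beyond the high watermark: %lu\n", sum);
        return 0;
    }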
2993 2993
2994 /** 2994 /**
2995 * nr_free_buffer_pages - count number of pages beyond high watermark 2995 * nr_free_buffer_pages - count number of pages beyond high watermark
2996 * 2996 *
2997 * nr_free_buffer_pages() counts the number of pages which are beyond the high 2997 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2998 * watermark within ZONE_DMA and ZONE_NORMAL. 2998 * watermark within ZONE_DMA and ZONE_NORMAL.
2999 */ 2999 */
3000 unsigned long nr_free_buffer_pages(void) 3000 unsigned long nr_free_buffer_pages(void)
3001 { 3001 {
3002 return nr_free_zone_pages(gfp_zone(GFP_USER)); 3002 return nr_free_zone_pages(gfp_zone(GFP_USER));
3003 } 3003 }
3004 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 3004 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
3005 3005
3006 /** 3006 /**
3007 * nr_free_pagecache_pages - count number of pages beyond high watermark 3007 * nr_free_pagecache_pages - count number of pages beyond high watermark
3008 * 3008 *
3009 * nr_free_pagecache_pages() counts the number of pages which are beyond the 3009 * nr_free_pagecache_pages() counts the number of pages which are beyond the
3010 * high watermark within all zones. 3010 * high watermark within all zones.
3011 */ 3011 */
3012 unsigned long nr_free_pagecache_pages(void) 3012 unsigned long nr_free_pagecache_pages(void)
3013 { 3013 {
3014 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 3014 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
3015 } 3015 }
3016 3016
3017 static inline void show_node(struct zone *zone) 3017 static inline void show_node(struct zone *zone)
3018 { 3018 {
3019 if (IS_ENABLED(CONFIG_NUMA)) 3019 if (IS_ENABLED(CONFIG_NUMA))
3020 printk("Node %d ", zone_to_nid(zone)); 3020 printk("Node %d ", zone_to_nid(zone));
3021 } 3021 }
3022 3022
3023 void si_meminfo(struct sysinfo *val) 3023 void si_meminfo(struct sysinfo *val)
3024 { 3024 {
3025 val->totalram = totalram_pages; 3025 val->totalram = totalram_pages;
3026 val->sharedram = 0; 3026 val->sharedram = 0;
3027 val->freeram = global_page_state(NR_FREE_PAGES); 3027 val->freeram = global_page_state(NR_FREE_PAGES);
3028 val->bufferram = nr_blockdev_pages(); 3028 val->bufferram = nr_blockdev_pages();
3029 val->totalhigh = totalhigh_pages; 3029 val->totalhigh = totalhigh_pages;
3030 val->freehigh = nr_free_highpages(); 3030 val->freehigh = nr_free_highpages();
3031 val->mem_unit = PAGE_SIZE; 3031 val->mem_unit = PAGE_SIZE;
3032 } 3032 }
3033 3033
3034 EXPORT_SYMBOL(si_meminfo); 3034 EXPORT_SYMBOL(si_meminfo);
3035 3035
3036 #ifdef CONFIG_NUMA 3036 #ifdef CONFIG_NUMA
3037 void si_meminfo_node(struct sysinfo *val, int nid) 3037 void si_meminfo_node(struct sysinfo *val, int nid)
3038 { 3038 {
3039 int zone_type; /* needs to be signed */ 3039 int zone_type; /* needs to be signed */
3040 unsigned long managed_pages = 0; 3040 unsigned long managed_pages = 0;
3041 pg_data_t *pgdat = NODE_DATA(nid); 3041 pg_data_t *pgdat = NODE_DATA(nid);
3042 3042
3043 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3043 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3044 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3044 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3045 val->totalram = managed_pages; 3045 val->totalram = managed_pages;
3046 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3046 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3047 #ifdef CONFIG_HIGHMEM 3047 #ifdef CONFIG_HIGHMEM
3048 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3048 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
3049 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 3049 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
3050 NR_FREE_PAGES); 3050 NR_FREE_PAGES);
3051 #else 3051 #else
3052 val->totalhigh = 0; 3052 val->totalhigh = 0;
3053 val->freehigh = 0; 3053 val->freehigh = 0;
3054 #endif 3054 #endif
3055 val->mem_unit = PAGE_SIZE; 3055 val->mem_unit = PAGE_SIZE;
3056 } 3056 }
3057 #endif 3057 #endif
3058 3058
3059 /* 3059 /*
3060 * Determine whether the node should be displayed or not, depending on whether 3060 * Determine whether the node should be displayed or not, depending on whether
3061 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 3061 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
3062 */ 3062 */
3063 bool skip_free_areas_node(unsigned int flags, int nid) 3063 bool skip_free_areas_node(unsigned int flags, int nid)
3064 { 3064 {
3065 bool ret = false; 3065 bool ret = false;
3066 unsigned int cpuset_mems_cookie; 3066 unsigned int cpuset_mems_cookie;
3067 3067
3068 if (!(flags & SHOW_MEM_FILTER_NODES)) 3068 if (!(flags & SHOW_MEM_FILTER_NODES))
3069 goto out; 3069 goto out;
3070 3070
3071 do { 3071 do {
3072 cpuset_mems_cookie = read_mems_allowed_begin(); 3072 cpuset_mems_cookie = read_mems_allowed_begin();
3073 ret = !node_isset(nid, cpuset_current_mems_allowed); 3073 ret = !node_isset(nid, cpuset_current_mems_allowed);
3074 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 3074 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3075 out: 3075 out:
3076 return ret; 3076 return ret;
3077 } 3077 }
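skip_free_areas_node() uses the read_mems_allowed_begin()/read_mems_allowed_retry() pair the same way __alloc_pages_nodemask() does earlier in the file: snapshot a cookie, perform the racy read, and redo it if the cpuset's mems_allowed changed meanwhile. A minimal seqcount-style sketch of that retry pattern (userspace, hypothetical names; the real primitives also provide memory ordering):

    #include <stdio.h>

    /* Writer bumps the version on every update; readers retry if it moved. */
    static unsigned int mems_version;
    static int mems_allowed_mask = 0x3;   /* pretend nodes 0 and 1 are allowed */

    static unsigned int read_begin(void) { return mems_version; }
    static int read_retry(unsigned int cookie) { return cookie != mems_version; }

    int main(void)
    {
        unsigned int cookie;
        int allowed;

        do {
            cookie = read_begin();
            allowed = (mems_allowed_mask >> 1) & 1;   /* is node 1 allowed? */
            /* A concurrent cpuset update would bump mems_version here. */
        } while (read_retry(cookie));

        printf("node 1 allowed: %d\n", allowed);
        return 0;
    }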
3078 3078
3079 #define K(x) ((x) << (PAGE_SHIFT-10)) 3079 #define K(x) ((x) << (PAGE_SHIFT-10))
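The K() macro converts a page count to kibibytes: shifting left by PAGE_SHIFT multiplies by the page size and shifting right by 10 divides by 1024, so the combined shift is PAGE_SHIFT - 10. With 4 KiB pages (PAGE_SHIFT = 12), for example, K(300) = 300 << 2 = 1200 kB, the same as 300 * 4096 / 1024.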
3080 3080
3081 static void show_migration_types(unsigned char type) 3081 static void show_migration_types(unsigned char type)
3082 { 3082 {
3083 static const char types[MIGRATE_TYPES] = { 3083 static const char types[MIGRATE_TYPES] = {
3084 [MIGRATE_UNMOVABLE] = 'U', 3084 [MIGRATE_UNMOVABLE] = 'U',
3085 [MIGRATE_RECLAIMABLE] = 'E', 3085 [MIGRATE_RECLAIMABLE] = 'E',
3086 [MIGRATE_MOVABLE] = 'M', 3086 [MIGRATE_MOVABLE] = 'M',
3087 [MIGRATE_RESERVE] = 'R', 3087 [MIGRATE_RESERVE] = 'R',
3088 #ifdef CONFIG_CMA 3088 #ifdef CONFIG_CMA
3089 [MIGRATE_CMA] = 'C', 3089 [MIGRATE_CMA] = 'C',
3090 #endif 3090 #endif
3091 #ifdef CONFIG_MEMORY_ISOLATION 3091 #ifdef CONFIG_MEMORY_ISOLATION
3092 [MIGRATE_ISOLATE] = 'I', 3092 [MIGRATE_ISOLATE] = 'I',
3093 #endif 3093 #endif
3094 }; 3094 };
3095 char tmp[MIGRATE_TYPES + 1]; 3095 char tmp[MIGRATE_TYPES + 1];
3096 char *p = tmp; 3096 char *p = tmp;
3097 int i; 3097 int i;
3098 3098
3099 for (i = 0; i < MIGRATE_TYPES; i++) { 3099 for (i = 0; i < MIGRATE_TYPES; i++) {
3100 if (type & (1 << i)) 3100 if (type & (1 << i))
3101 *p++ = types[i]; 3101 *p++ = types[i];
3102 } 3102 }
3103 3103
3104 *p = '\0'; 3104 *p = '\0';
3105 printk("(%s) ", tmp); 3105 printk("(%s) ", tmp);
3106 } 3106 }
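show_migration_types() packs one bit per migrate type into a byte and prints one letter per set bit. A tiny userspace sketch of the same decode, with the letter table mirroring the one above:

    #include <stdio.h>

    int main(void)
    {
        static const char letters[] = { 'U', 'E', 'M', 'R', 'C', 'I' };
        unsigned char mask = (1 << 0) | (1 << 2);   /* UNMOVABLE | MOVABLE */
        char out[sizeof(letters) + 1];
        char *p = out;

        for (unsigned i = 0; i < sizeof(letters); i++)
            if (mask & (1u << i))
                *p++ = letters[i];
        *p = '\0';

        printf("(%s)\n", out);   /* prints "(UM)" */
        return 0;
    }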
3107 3107
3108 /* 3108 /*
3109 * Show free area list (used inside shift_scroll-lock stuff) 3109 * Show free area list (used inside shift_scroll-lock stuff)
3110 * We also calculate the percentage fragmentation. We do this by counting the 3110 * We also calculate the percentage fragmentation. We do this by counting the
3111 * memory on each free list with the exception of the first item on the list. 3111 * memory on each free list with the exception of the first item on the list.
3112 * Suppresses nodes that are not allowed by current's cpuset if 3112 * Suppresses nodes that are not allowed by current's cpuset if
3113 * SHOW_MEM_FILTER_NODES is passed. 3113 * SHOW_MEM_FILTER_NODES is passed.
3114 */ 3114 */
3115 void show_free_areas(unsigned int filter) 3115 void show_free_areas(unsigned int filter)
3116 { 3116 {
3117 int cpu; 3117 int cpu;
3118 struct zone *zone; 3118 struct zone *zone;
3119 3119
3120 for_each_populated_zone(zone) { 3120 for_each_populated_zone(zone) {
3121 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3121 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3122 continue; 3122 continue;
3123 show_node(zone); 3123 show_node(zone);
3124 printk("%s per-cpu:\n", zone->name); 3124 printk("%s per-cpu:\n", zone->name);
3125 3125
3126 for_each_online_cpu(cpu) { 3126 for_each_online_cpu(cpu) {
3127 struct per_cpu_pageset *pageset; 3127 struct per_cpu_pageset *pageset;
3128 3128
3129 pageset = per_cpu_ptr(zone->pageset, cpu); 3129 pageset = per_cpu_ptr(zone->pageset, cpu);
3130 3130
3131 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 3131 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3132 cpu, pageset->pcp.high, 3132 cpu, pageset->pcp.high,
3133 pageset->pcp.batch, pageset->pcp.count); 3133 pageset->pcp.batch, pageset->pcp.count);
3134 } 3134 }
3135 } 3135 }
3136 3136
3137 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3137 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3138 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3138 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3139 " unevictable:%lu" 3139 " unevictable:%lu"
3140 " dirty:%lu writeback:%lu unstable:%lu\n" 3140 " dirty:%lu writeback:%lu unstable:%lu\n"
3141 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 3141 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3142 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3142 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3143 " free_cma:%lu\n", 3143 " free_cma:%lu\n",
3144 global_page_state(NR_ACTIVE_ANON), 3144 global_page_state(NR_ACTIVE_ANON),
3145 global_page_state(NR_INACTIVE_ANON), 3145 global_page_state(NR_INACTIVE_ANON),
3146 global_page_state(NR_ISOLATED_ANON), 3146 global_page_state(NR_ISOLATED_ANON),
3147 global_page_state(NR_ACTIVE_FILE), 3147 global_page_state(NR_ACTIVE_FILE),
3148 global_page_state(NR_INACTIVE_FILE), 3148 global_page_state(NR_INACTIVE_FILE),
3149 global_page_state(NR_ISOLATED_FILE), 3149 global_page_state(NR_ISOLATED_FILE),
3150 global_page_state(NR_UNEVICTABLE), 3150 global_page_state(NR_UNEVICTABLE),
3151 global_page_state(NR_FILE_DIRTY), 3151 global_page_state(NR_FILE_DIRTY),
3152 global_page_state(NR_WRITEBACK), 3152 global_page_state(NR_WRITEBACK),
3153 global_page_state(NR_UNSTABLE_NFS), 3153 global_page_state(NR_UNSTABLE_NFS),
3154 global_page_state(NR_FREE_PAGES), 3154 global_page_state(NR_FREE_PAGES),
3155 global_page_state(NR_SLAB_RECLAIMABLE), 3155 global_page_state(NR_SLAB_RECLAIMABLE),
3156 global_page_state(NR_SLAB_UNRECLAIMABLE), 3156 global_page_state(NR_SLAB_UNRECLAIMABLE),
3157 global_page_state(NR_FILE_MAPPED), 3157 global_page_state(NR_FILE_MAPPED),
3158 global_page_state(NR_SHMEM), 3158 global_page_state(NR_SHMEM),
3159 global_page_state(NR_PAGETABLE), 3159 global_page_state(NR_PAGETABLE),
3160 global_page_state(NR_BOUNCE), 3160 global_page_state(NR_BOUNCE),
3161 global_page_state(NR_FREE_CMA_PAGES)); 3161 global_page_state(NR_FREE_CMA_PAGES));
3162 3162
3163 for_each_populated_zone(zone) { 3163 for_each_populated_zone(zone) {
3164 int i; 3164 int i;
3165 3165
3166 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3166 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3167 continue; 3167 continue;
3168 show_node(zone); 3168 show_node(zone);
3169 printk("%s" 3169 printk("%s"
3170 " free:%lukB" 3170 " free:%lukB"
3171 " min:%lukB" 3171 " min:%lukB"
3172 " low:%lukB" 3172 " low:%lukB"
3173 " high:%lukB" 3173 " high:%lukB"
3174 " active_anon:%lukB" 3174 " active_anon:%lukB"
3175 " inactive_anon:%lukB" 3175 " inactive_anon:%lukB"
3176 " active_file:%lukB" 3176 " active_file:%lukB"
3177 " inactive_file:%lukB" 3177 " inactive_file:%lukB"
3178 " unevictable:%lukB" 3178 " unevictable:%lukB"
3179 " isolated(anon):%lukB" 3179 " isolated(anon):%lukB"
3180 " isolated(file):%lukB" 3180 " isolated(file):%lukB"
3181 " present:%lukB" 3181 " present:%lukB"
3182 " managed:%lukB" 3182 " managed:%lukB"
3183 " mlocked:%lukB" 3183 " mlocked:%lukB"
3184 " dirty:%lukB" 3184 " dirty:%lukB"
3185 " writeback:%lukB" 3185 " writeback:%lukB"
3186 " mapped:%lukB" 3186 " mapped:%lukB"
3187 " shmem:%lukB" 3187 " shmem:%lukB"
3188 " slab_reclaimable:%lukB" 3188 " slab_reclaimable:%lukB"
3189 " slab_unreclaimable:%lukB" 3189 " slab_unreclaimable:%lukB"
3190 " kernel_stack:%lukB" 3190 " kernel_stack:%lukB"
3191 " pagetables:%lukB" 3191 " pagetables:%lukB"
3192 " unstable:%lukB" 3192 " unstable:%lukB"
3193 " bounce:%lukB" 3193 " bounce:%lukB"
3194 " free_cma:%lukB" 3194 " free_cma:%lukB"
3195 " writeback_tmp:%lukB" 3195 " writeback_tmp:%lukB"
3196 " pages_scanned:%lu" 3196 " pages_scanned:%lu"
3197 " all_unreclaimable? %s" 3197 " all_unreclaimable? %s"
3198 "\n", 3198 "\n",
3199 zone->name, 3199 zone->name,
3200 K(zone_page_state(zone, NR_FREE_PAGES)), 3200 K(zone_page_state(zone, NR_FREE_PAGES)),
3201 K(min_wmark_pages(zone)), 3201 K(min_wmark_pages(zone)),
3202 K(low_wmark_pages(zone)), 3202 K(low_wmark_pages(zone)),
3203 K(high_wmark_pages(zone)), 3203 K(high_wmark_pages(zone)),
3204 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3204 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3205 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3205 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3206 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3206 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3207 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3207 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3208 K(zone_page_state(zone, NR_UNEVICTABLE)), 3208 K(zone_page_state(zone, NR_UNEVICTABLE)),
3209 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3209 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3210 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3210 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3211 K(zone->present_pages), 3211 K(zone->present_pages),
3212 K(zone->managed_pages), 3212 K(zone->managed_pages),
3213 K(zone_page_state(zone, NR_MLOCK)), 3213 K(zone_page_state(zone, NR_MLOCK)),
3214 K(zone_page_state(zone, NR_FILE_DIRTY)), 3214 K(zone_page_state(zone, NR_FILE_DIRTY)),
3215 K(zone_page_state(zone, NR_WRITEBACK)), 3215 K(zone_page_state(zone, NR_WRITEBACK)),
3216 K(zone_page_state(zone, NR_FILE_MAPPED)), 3216 K(zone_page_state(zone, NR_FILE_MAPPED)),
3217 K(zone_page_state(zone, NR_SHMEM)), 3217 K(zone_page_state(zone, NR_SHMEM)),
3218 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3218 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3219 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3219 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3220 zone_page_state(zone, NR_KERNEL_STACK) * 3220 zone_page_state(zone, NR_KERNEL_STACK) *
3221 THREAD_SIZE / 1024, 3221 THREAD_SIZE / 1024,
3222 K(zone_page_state(zone, NR_PAGETABLE)), 3222 K(zone_page_state(zone, NR_PAGETABLE)),
3223 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3223 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3224 K(zone_page_state(zone, NR_BOUNCE)), 3224 K(zone_page_state(zone, NR_BOUNCE)),
3225 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3225 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3226 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3226 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3227 K(zone_page_state(zone, NR_PAGES_SCANNED)), 3227 K(zone_page_state(zone, NR_PAGES_SCANNED)),
3228 (!zone_reclaimable(zone) ? "yes" : "no") 3228 (!zone_reclaimable(zone) ? "yes" : "no")
3229 ); 3229 );
3230 printk("lowmem_reserve[]:"); 3230 printk("lowmem_reserve[]:");
3231 for (i = 0; i < MAX_NR_ZONES; i++) 3231 for (i = 0; i < MAX_NR_ZONES; i++)
3232 printk(" %ld", zone->lowmem_reserve[i]); 3232 printk(" %ld", zone->lowmem_reserve[i]);
3233 printk("\n"); 3233 printk("\n");
3234 } 3234 }
3235 3235
3236 for_each_populated_zone(zone) { 3236 for_each_populated_zone(zone) {
3237 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3237 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3238 unsigned char types[MAX_ORDER]; 3238 unsigned char types[MAX_ORDER];
3239 3239
3240 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3240 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3241 continue; 3241 continue;
3242 show_node(zone); 3242 show_node(zone);
3243 printk("%s: ", zone->name); 3243 printk("%s: ", zone->name);
3244 3244
3245 spin_lock_irqsave(&zone->lock, flags); 3245 spin_lock_irqsave(&zone->lock, flags);
3246 for (order = 0; order < MAX_ORDER; order++) { 3246 for (order = 0; order < MAX_ORDER; order++) {
3247 struct free_area *area = &zone->free_area[order]; 3247 struct free_area *area = &zone->free_area[order];
3248 int type; 3248 int type;
3249 3249
3250 nr[order] = area->nr_free; 3250 nr[order] = area->nr_free;
3251 total += nr[order] << order; 3251 total += nr[order] << order;
3252 3252
3253 types[order] = 0; 3253 types[order] = 0;
3254 for (type = 0; type < MIGRATE_TYPES; type++) { 3254 for (type = 0; type < MIGRATE_TYPES; type++) {
3255 if (!list_empty(&area->free_list[type])) 3255 if (!list_empty(&area->free_list[type]))
3256 types[order] |= 1 << type; 3256 types[order] |= 1 << type;
3257 } 3257 }
3258 } 3258 }
3259 spin_unlock_irqrestore(&zone->lock, flags); 3259 spin_unlock_irqrestore(&zone->lock, flags);
3260 for (order = 0; order < MAX_ORDER; order++) { 3260 for (order = 0; order < MAX_ORDER; order++) {
3261 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3261 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3262 if (nr[order]) 3262 if (nr[order])
3263 show_migration_types(types[order]); 3263 show_migration_types(types[order]);
3264 } 3264 }
3265 printk("= %lukB\n", K(total)); 3265 printk("= %lukB\n", K(total));
3266 } 3266 }
3267 3267
3268 hugetlb_show_meminfo(); 3268 hugetlb_show_meminfo();
3269 3269
3270 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3270 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3271 3271
3272 show_swap_cache_info(); 3272 show_swap_cache_info();
3273 } 3273 }
3274 3274
3275 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3275 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3276 { 3276 {
3277 zoneref->zone = zone; 3277 zoneref->zone = zone;
3278 zoneref->zone_idx = zone_idx(zone); 3278 zoneref->zone_idx = zone_idx(zone);
3279 } 3279 }
3280 3280
3281 /* 3281 /*
3282 * Builds allocation fallback zone lists. 3282 * Builds allocation fallback zone lists.
3283 * 3283 *
3284 * Add all populated zones of a node to the zonelist. 3284 * Add all populated zones of a node to the zonelist.
3285 */ 3285 */
3286 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3286 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3287 int nr_zones) 3287 int nr_zones)
3288 { 3288 {
3289 struct zone *zone; 3289 struct zone *zone;
3290 enum zone_type zone_type = MAX_NR_ZONES; 3290 enum zone_type zone_type = MAX_NR_ZONES;
3291 3291
3292 do { 3292 do {
3293 zone_type--; 3293 zone_type--;
3294 zone = pgdat->node_zones + zone_type; 3294 zone = pgdat->node_zones + zone_type;
3295 if (populated_zone(zone)) { 3295 if (populated_zone(zone)) {
3296 zoneref_set_zone(zone, 3296 zoneref_set_zone(zone,
3297 &zonelist->_zonerefs[nr_zones++]); 3297 &zonelist->_zonerefs[nr_zones++]);
3298 check_highest_zone(zone_type); 3298 check_highest_zone(zone_type);
3299 } 3299 }
3300 } while (zone_type); 3300 } while (zone_type);
3301 3301
3302 return nr_zones; 3302 return nr_zones;
3303 } 3303 }
3304 3304
3305 3305
3306 /* 3306 /*
3307 * zonelist_order: 3307 * zonelist_order:
3308 * 0 = automatic detection of better ordering. 3308 * 0 = automatic detection of better ordering.
3309 * 1 = order by ([node] distance, -zonetype) 3309 * 1 = order by ([node] distance, -zonetype)
3310 * 2 = order by (-zonetype, [node] distance) 3310 * 2 = order by (-zonetype, [node] distance)
3311 * 3311 *
3312 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3312 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3313 * the same zonelist. So only NUMA can configure this param. 3313 * the same zonelist. So only NUMA can configure this param.
3314 */ 3314 */
3315 #define ZONELIST_ORDER_DEFAULT 0 3315 #define ZONELIST_ORDER_DEFAULT 0
3316 #define ZONELIST_ORDER_NODE 1 3316 #define ZONELIST_ORDER_NODE 1
3317 #define ZONELIST_ORDER_ZONE 2 3317 #define ZONELIST_ORDER_ZONE 2
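The difference between the two orderings is easiest to see on a concrete layout. The sketch below assumes a hypothetical two-node machine where only node 0 has a DMA32 zone: node order exhausts all of node 0, including its low zone, before spilling to node 1, while zone order keeps the low zone last even though it is local.

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical layout: node 0 = { Normal, DMA32 }, node 1 = { Normal }. */
        const char *node_order[] = { "Node0/Normal", "Node0/DMA32", "Node1/Normal" };
        const char *zone_order[] = { "Node0/Normal", "Node1/Normal", "Node0/DMA32" };

        printf("numa_zonelist_order=node: ");
        for (unsigned i = 0; i < 3; i++)
            printf("%s ", node_order[i]);
        printf("\nnuma_zonelist_order=zone: ");
        for (unsigned i = 0; i < 3; i++)
            printf("%s ", zone_order[i]);
        printf("\n");
        return 0;
    }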
3318 3318
3319 /* zonelist order in the kernel. 3319 /* zonelist order in the kernel.
3320 * set_zonelist_order() will set this to NODE or ZONE. 3320 * set_zonelist_order() will set this to NODE or ZONE.
3321 */ 3321 */
3322 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3322 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3323 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3323 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3324 3324
3325 3325
3326 #ifdef CONFIG_NUMA 3326 #ifdef CONFIG_NUMA
3327 /* The value the user specified; may be changed by config */ 3327 /* The value the user specified; may be changed by config */
3328 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3328 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3329 /* string for sysctl */ 3329 /* string for sysctl */
3330 #define NUMA_ZONELIST_ORDER_LEN 16 3330 #define NUMA_ZONELIST_ORDER_LEN 16
3331 char numa_zonelist_order[16] = "default"; 3331 char numa_zonelist_order[16] = "default";
3332 3332
3333 /* 3333 /*
3334 * interface for configuring zonelist ordering. 3334 * interface for configuring zonelist ordering.
3335 * command line option "numa_zonelist_order" 3335 * command line option "numa_zonelist_order"
3336 * = "[dD]efault - default, automatic configuration. 3336 * = "[dD]efault - default, automatic configuration.
3337 * = "[nN]ode - order by node locality, then by zone within node 3337 * = "[nN]ode - order by node locality, then by zone within node
3338 * = "[zZ]one - order by zone, then by locality within zone 3338 * = "[zZ]one - order by zone, then by locality within zone
3339 */ 3339 */
3340 3340
3341 static int __parse_numa_zonelist_order(char *s) 3341 static int __parse_numa_zonelist_order(char *s)
3342 { 3342 {
3343 if (*s == 'd' || *s == 'D') { 3343 if (*s == 'd' || *s == 'D') {
3344 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3344 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3345 } else if (*s == 'n' || *s == 'N') { 3345 } else if (*s == 'n' || *s == 'N') {
3346 user_zonelist_order = ZONELIST_ORDER_NODE; 3346 user_zonelist_order = ZONELIST_ORDER_NODE;
3347 } else if (*s == 'z' || *s == 'Z') { 3347 } else if (*s == 'z' || *s == 'Z') {
3348 user_zonelist_order = ZONELIST_ORDER_ZONE; 3348 user_zonelist_order = ZONELIST_ORDER_ZONE;
3349 } else { 3349 } else {
3350 printk(KERN_WARNING 3350 printk(KERN_WARNING
3351 "Ignoring invalid numa_zonelist_order value: " 3351 "Ignoring invalid numa_zonelist_order value: "
3352 "%s\n", s); 3352 "%s\n", s);
3353 return -EINVAL; 3353 return -EINVAL;
3354 } 3354 }
3355 return 0; 3355 return 0;
3356 } 3356 }
3357 3357
3358 static __init int setup_numa_zonelist_order(char *s) 3358 static __init int setup_numa_zonelist_order(char *s)
3359 { 3359 {
3360 int ret; 3360 int ret;
3361 3361
3362 if (!s) 3362 if (!s)
3363 return 0; 3363 return 0;
3364 3364
3365 ret = __parse_numa_zonelist_order(s); 3365 ret = __parse_numa_zonelist_order(s);
3366 if (ret == 0) 3366 if (ret == 0)
3367 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3367 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3368 3368
3369 return ret; 3369 return ret;
3370 } 3370 }
3371 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3371 early_param("numa_zonelist_order", setup_numa_zonelist_order);
3372 3372
3373 /* 3373 /*
3374 * sysctl handler for numa_zonelist_order 3374 * sysctl handler for numa_zonelist_order
3375 */ 3375 */
3376 int numa_zonelist_order_handler(ctl_table *table, int write, 3376 int numa_zonelist_order_handler(ctl_table *table, int write,
3377 void __user *buffer, size_t *length, 3377 void __user *buffer, size_t *length,
3378 loff_t *ppos) 3378 loff_t *ppos)
3379 { 3379 {
3380 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3380 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3381 int ret; 3381 int ret;
3382 static DEFINE_MUTEX(zl_order_mutex); 3382 static DEFINE_MUTEX(zl_order_mutex);
3383 3383
3384 mutex_lock(&zl_order_mutex); 3384 mutex_lock(&zl_order_mutex);
3385 if (write) { 3385 if (write) {
3386 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 3386 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3387 ret = -EINVAL; 3387 ret = -EINVAL;
3388 goto out; 3388 goto out;
3389 } 3389 }
3390 strcpy(saved_string, (char *)table->data); 3390 strcpy(saved_string, (char *)table->data);
3391 } 3391 }
3392 ret = proc_dostring(table, write, buffer, length, ppos); 3392 ret = proc_dostring(table, write, buffer, length, ppos);
3393 if (ret) 3393 if (ret)
3394 goto out; 3394 goto out;
3395 if (write) { 3395 if (write) {
3396 int oldval = user_zonelist_order; 3396 int oldval = user_zonelist_order;
3397 3397
3398 ret = __parse_numa_zonelist_order((char *)table->data); 3398 ret = __parse_numa_zonelist_order((char *)table->data);
3399 if (ret) { 3399 if (ret) {
3400 /* 3400 /*
3401 * bogus value. restore saved string 3401 * bogus value. restore saved string
3402 */ 3402 */
3403 strncpy((char *)table->data, saved_string, 3403 strncpy((char *)table->data, saved_string,
3404 NUMA_ZONELIST_ORDER_LEN); 3404 NUMA_ZONELIST_ORDER_LEN);
3405 user_zonelist_order = oldval; 3405 user_zonelist_order = oldval;
3406 } else if (oldval != user_zonelist_order) { 3406 } else if (oldval != user_zonelist_order) {
3407 mutex_lock(&zonelists_mutex); 3407 mutex_lock(&zonelists_mutex);
3408 build_all_zonelists(NULL, NULL); 3408 build_all_zonelists(NULL, NULL);
3409 mutex_unlock(&zonelists_mutex); 3409 mutex_unlock(&zonelists_mutex);
3410 } 3410 }
3411 } 3411 }
3412 out: 3412 out:
3413 mutex_unlock(&zl_order_mutex); 3413 mutex_unlock(&zl_order_mutex);
3414 return ret; 3414 return ret;
3415 } 3415 }
3416 3416
3417 3417
3418 #define MAX_NODE_LOAD (nr_online_nodes) 3418 #define MAX_NODE_LOAD (nr_online_nodes)
3419 static int node_load[MAX_NUMNODES]; 3419 static int node_load[MAX_NUMNODES];
3420 3420
3421 /** 3421 /**
3422 * find_next_best_node - find the next node that should appear in a given node's fallback list 3422 * find_next_best_node - find the next node that should appear in a given node's fallback list
3423 * @node: node whose fallback list we're appending 3423 * @node: node whose fallback list we're appending
3424 * @used_node_mask: nodemask_t of already used nodes 3424 * @used_node_mask: nodemask_t of already used nodes
3425 * 3425 *
3426 * We use a number of factors to determine which is the next node that should 3426 * We use a number of factors to determine which is the next node that should
3427 * appear on a given node's fallback list. The node should not have appeared 3427 * appear on a given node's fallback list. The node should not have appeared
3428 * already in @node's fallback list, and it should be the next closest node 3428 * already in @node's fallback list, and it should be the next closest node
3429 * according to the distance array (which contains arbitrary distance values 3429 * according to the distance array (which contains arbitrary distance values
3430 * from each node to each node in the system), and should also prefer nodes 3430 * from each node to each node in the system), and should also prefer nodes
3431 * with no CPUs, since presumably they'll have very little allocation pressure 3431 * with no CPUs, since presumably they'll have very little allocation pressure
3432 * on them otherwise. 3432 * on them otherwise.
3433 * It returns -1 if no node is found. 3433 * It returns -1 if no node is found.
3434 */ 3434 */
3435 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3435 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3436 { 3436 {
3437 int n, val; 3437 int n, val;
3438 int min_val = INT_MAX; 3438 int min_val = INT_MAX;
3439 int best_node = NUMA_NO_NODE; 3439 int best_node = NUMA_NO_NODE;
3440 const struct cpumask *tmp = cpumask_of_node(0); 3440 const struct cpumask *tmp = cpumask_of_node(0);
3441 3441
3442 /* Use the local node if we haven't already */ 3442 /* Use the local node if we haven't already */
3443 if (!node_isset(node, *used_node_mask)) { 3443 if (!node_isset(node, *used_node_mask)) {
3444 node_set(node, *used_node_mask); 3444 node_set(node, *used_node_mask);
3445 return node; 3445 return node;
3446 } 3446 }
3447 3447
3448 for_each_node_state(n, N_MEMORY) { 3448 for_each_node_state(n, N_MEMORY) {
3449 3449
3450 /* Don't want a node to appear more than once */ 3450 /* Don't want a node to appear more than once */
3451 if (node_isset(n, *used_node_mask)) 3451 if (node_isset(n, *used_node_mask))
3452 continue; 3452 continue;
3453 3453
3454 /* Use the distance array to find the distance */ 3454 /* Use the distance array to find the distance */
3455 val = node_distance(node, n); 3455 val = node_distance(node, n);
3456 3456
3457 /* Penalize nodes under us ("prefer the next node") */ 3457 /* Penalize nodes under us ("prefer the next node") */
3458 val += (n < node); 3458 val += (n < node);
3459 3459
3460 /* Give preference to headless and unused nodes */ 3460 /* Give preference to headless and unused nodes */
3461 tmp = cpumask_of_node(n); 3461 tmp = cpumask_of_node(n);
3462 if (!cpumask_empty(tmp)) 3462 if (!cpumask_empty(tmp))
3463 val += PENALTY_FOR_NODE_WITH_CPUS; 3463 val += PENALTY_FOR_NODE_WITH_CPUS;
3464 3464
3465 /* Slight preference for less loaded node */ 3465 /* Slight preference for less loaded node */
3466 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3466 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3467 val += node_load[n]; 3467 val += node_load[n];
3468 3468
3469 if (val < min_val) { 3469 if (val < min_val) {
3470 min_val = val; 3470 min_val = val;
3471 best_node = n; 3471 best_node = n;
3472 } 3472 }
3473 } 3473 }
3474 3474
3475 if (best_node >= 0) 3475 if (best_node >= 0)
3476 node_set(best_node, *used_node_mask); 3476 node_set(best_node, *used_node_mask);
3477 3477
3478 return best_node; 3478 return best_node;
3479 } 3479 }
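find_next_best_node() turns each candidate into a score: the node distance, plus one if the candidate is numerically below the current node, plus a penalty if it has CPUs, scaled so that node_load only breaks ties between otherwise equal candidates; the lowest score wins. A small userspace simulation with made-up distances and penalty values (SCALE and CPU_PENALTY stand in for the kernel constants):

    #include <stdio.h>
    #include <limits.h>

    #define NR_NODES    3
    #define CPU_PENALTY 1                /* stand-in for PENALTY_FOR_NODE_WITH_CPUS */
    #define SCALE       (NR_NODES * NR_NODES)

    int main(void)
    {
        /* Hypothetical SLIT: distance[from][to]. */
        int distance[NR_NODES][NR_NODES] = {
            { 10, 20, 20 },
            { 20, 10, 20 },
            { 20, 20, 10 },
        };
        int has_cpus[NR_NODES]  = { 1, 1, 0 };   /* node 2 is memory-only */
        int node_load[NR_NODES] = { 0, 0, 0 };
        int from = 0, best = -1, min_val = INT_MAX;

        for (int n = 0; n < NR_NODES; n++) {
            if (n == from)                 /* "already used" local node */
                continue;
            int val = distance[from][n];
            val += (n < from);             /* prefer the next node, not earlier ones */
            if (has_cpus[n])
                val += CPU_PENALTY;        /* prefer headless nodes */
            val = val * SCALE + node_load[n];
            if (val < min_val) {
                min_val = val;
                best = n;
            }
        }
        printf("next best node after %d: %d\n", from, best);   /* node 2 wins */
        return 0;
    }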
3480 3480
3481 3481
3482 /* 3482 /*
3483 * Build zonelists ordered by node and zones within node. 3483 * Build zonelists ordered by node and zones within node.
3484 * This results in maximum locality--normal zone overflows into local 3484 * This results in maximum locality--normal zone overflows into local
3485 * DMA zone, if any--but risks exhausting DMA zone. 3485 * DMA zone, if any--but risks exhausting DMA zone.
3486 */ 3486 */
3487 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3487 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3488 { 3488 {
3489 int j; 3489 int j;
3490 struct zonelist *zonelist; 3490 struct zonelist *zonelist;
3491 3491
3492 zonelist = &pgdat->node_zonelists[0]; 3492 zonelist = &pgdat->node_zonelists[0];
3493 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3493 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3494 ; 3494 ;
3495 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3495 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3496 zonelist->_zonerefs[j].zone = NULL; 3496 zonelist->_zonerefs[j].zone = NULL;
3497 zonelist->_zonerefs[j].zone_idx = 0; 3497 zonelist->_zonerefs[j].zone_idx = 0;
3498 } 3498 }
3499 3499
3500 /* 3500 /*
3501 * Build gfp_thisnode zonelists 3501 * Build gfp_thisnode zonelists
3502 */ 3502 */
3503 static void build_thisnode_zonelists(pg_data_t *pgdat) 3503 static void build_thisnode_zonelists(pg_data_t *pgdat)
3504 { 3504 {
3505 int j; 3505 int j;
3506 struct zonelist *zonelist; 3506 struct zonelist *zonelist;
3507 3507
3508 zonelist = &pgdat->node_zonelists[1]; 3508 zonelist = &pgdat->node_zonelists[1];
3509 j = build_zonelists_node(pgdat, zonelist, 0); 3509 j = build_zonelists_node(pgdat, zonelist, 0);
3510 zonelist->_zonerefs[j].zone = NULL; 3510 zonelist->_zonerefs[j].zone = NULL;
3511 zonelist->_zonerefs[j].zone_idx = 0; 3511 zonelist->_zonerefs[j].zone_idx = 0;
3512 } 3512 }
3513 3513
3514 /* 3514 /*
3515 * Build zonelists ordered by zone and nodes within zones. 3515 * Build zonelists ordered by zone and nodes within zones.
3516 * This results in conserving DMA zone[s] until all Normal memory is 3516 * This results in conserving DMA zone[s] until all Normal memory is
3517 * exhausted, but results in overflowing to remote node while memory 3517 * exhausted, but results in overflowing to remote node while memory
3518 * may still exist in local DMA zone. 3518 * may still exist in local DMA zone.
3519 */ 3519 */
3520 static int node_order[MAX_NUMNODES]; 3520 static int node_order[MAX_NUMNODES];
3521 3521
3522 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3522 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3523 { 3523 {
3524 int pos, j, node; 3524 int pos, j, node;
3525 int zone_type; /* needs to be signed */ 3525 int zone_type; /* needs to be signed */
3526 struct zone *z; 3526 struct zone *z;
3527 struct zonelist *zonelist; 3527 struct zonelist *zonelist;
3528 3528
3529 zonelist = &pgdat->node_zonelists[0]; 3529 zonelist = &pgdat->node_zonelists[0];
3530 pos = 0; 3530 pos = 0;
3531 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3531 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3532 for (j = 0; j < nr_nodes; j++) { 3532 for (j = 0; j < nr_nodes; j++) {
3533 node = node_order[j]; 3533 node = node_order[j];
3534 z = &NODE_DATA(node)->node_zones[zone_type]; 3534 z = &NODE_DATA(node)->node_zones[zone_type];
3535 if (populated_zone(z)) { 3535 if (populated_zone(z)) {
3536 zoneref_set_zone(z, 3536 zoneref_set_zone(z,
3537 &zonelist->_zonerefs[pos++]); 3537 &zonelist->_zonerefs[pos++]);
3538 check_highest_zone(zone_type); 3538 check_highest_zone(zone_type);
3539 } 3539 }
3540 } 3540 }
3541 } 3541 }
3542 zonelist->_zonerefs[pos].zone = NULL; 3542 zonelist->_zonerefs[pos].zone = NULL;
3543 zonelist->_zonerefs[pos].zone_idx = 0; 3543 zonelist->_zonerefs[pos].zone_idx = 0;
3544 } 3544 }
3545 3545
3546 static int default_zonelist_order(void) 3546 static int default_zonelist_order(void)
3547 { 3547 {
3548 int nid, zone_type; 3548 int nid, zone_type;
3549 unsigned long low_kmem_size, total_size; 3549 unsigned long low_kmem_size, total_size;
3550 struct zone *z; 3550 struct zone *z;
3551 int average_size; 3551 int average_size;
3552 /* 3552 /*
3553 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. 3553 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
3554 * If they are really small and used heavily, the system can fall 3554 * If they are really small and used heavily, the system can fall
3555 * into OOM very easily. 3555 * into OOM very easily.
3556 * This function detects the ZONE_DMA/DMA32 size and configures the zone order. 3556 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
3557 */ 3557 */
3558 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 3558 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3559 low_kmem_size = 0; 3559 low_kmem_size = 0;
3560 total_size = 0; 3560 total_size = 0;
3561 for_each_online_node(nid) { 3561 for_each_online_node(nid) {
3562 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3562 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3563 z = &NODE_DATA(nid)->node_zones[zone_type]; 3563 z = &NODE_DATA(nid)->node_zones[zone_type];
3564 if (populated_zone(z)) { 3564 if (populated_zone(z)) {
3565 if (zone_type < ZONE_NORMAL) 3565 if (zone_type < ZONE_NORMAL)
3566 low_kmem_size += z->managed_pages; 3566 low_kmem_size += z->managed_pages;
3567 total_size += z->managed_pages; 3567 total_size += z->managed_pages;
3568 } else if (zone_type == ZONE_NORMAL) { 3568 } else if (zone_type == ZONE_NORMAL) {
3569 /* 3569 /*
3570 * If any node has only lowmem, then node order 3570 * If any node has only lowmem, then node order
3571 * is preferred to allow kernel allocations 3571 * is preferred to allow kernel allocations
3572 * locally; otherwise, they can easily infringe 3572 * locally; otherwise, they can easily infringe
3573 * on other nodes when there is an abundance of 3573 * on other nodes when there is an abundance of
3574 * lowmem available to allocate from. 3574 * lowmem available to allocate from.
3575 */ 3575 */
3576 return ZONELIST_ORDER_NODE; 3576 return ZONELIST_ORDER_NODE;
3577 } 3577 }
3578 } 3578 }
3579 } 3579 }
3580 if (!low_kmem_size || /* there is no DMA area. */ 3580 if (!low_kmem_size || /* there is no DMA area. */
3581 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3581 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3582 return ZONELIST_ORDER_NODE; 3582 return ZONELIST_ORDER_NODE;
3583 /* 3583 /*
3584 * look into each node's config. 3584 * look into each node's config.
3585 * If there is a node whose DMA/DMA32 memory is a very big share of 3585 * If there is a node whose DMA/DMA32 memory is a very big share of
3586 * its local memory, NODE_ORDER may be suitable. 3586 * its local memory, NODE_ORDER may be suitable.
3587 */ 3587 */
3588 average_size = total_size / 3588 average_size = total_size /
3589 (nodes_weight(node_states[N_MEMORY]) + 1); 3589 (nodes_weight(node_states[N_MEMORY]) + 1);
3590 for_each_online_node(nid) { 3590 for_each_online_node(nid) {
3591 low_kmem_size = 0; 3591 low_kmem_size = 0;
3592 total_size = 0; 3592 total_size = 0;
3593 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3593 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3594 z = &NODE_DATA(nid)->node_zones[zone_type]; 3594 z = &NODE_DATA(nid)->node_zones[zone_type];
3595 if (populated_zone(z)) { 3595 if (populated_zone(z)) {
3596 if (zone_type < ZONE_NORMAL) 3596 if (zone_type < ZONE_NORMAL)
3597 low_kmem_size += z->present_pages; 3597 low_kmem_size += z->present_pages;
3598 total_size += z->present_pages; 3598 total_size += z->present_pages;
3599 } 3599 }
3600 } 3600 }
3601 if (low_kmem_size && 3601 if (low_kmem_size &&
3602 total_size > average_size && /* ignore small node */ 3602 total_size > average_size && /* ignore small node */
3603 low_kmem_size > total_size * 70/100) 3603 low_kmem_size > total_size * 70/100)
3604 return ZONELIST_ORDER_NODE; 3604 return ZONELIST_ORDER_NODE;
3605 } 3605 }
3606 return ZONELIST_ORDER_ZONE; 3606 return ZONELIST_ORDER_ZONE;
3607 } 3607 }
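default_zonelist_order() reduces to two ratio checks: if lowmem (DMA/DMA32) is absent or makes up more than half of all memory, node order is chosen; otherwise, if any sufficiently large node is more than 70% lowmem, node order is chosen as well; only then does it fall back to zone order. A worked example of those checks with made-up sizes:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical machine: pages of lowmem (DMA/DMA32) vs total, per node. */
        unsigned long low[]   = { 1048576,  0 };        /* node 0 has 4 GiB DMA32 */
        unsigned long total[] = { 16777216, 16777216 }; /* 64 GiB per node */
        unsigned long low_sum = low[0] + low[1];
        unsigned long total_sum = total[0] + total[1];
        unsigned long average = total_sum / (2 + 1);
        const char *order = "zone";

        if (!low_sum || low_sum > total_sum / 2) {
            order = "node";
        } else {
            for (int nid = 0; nid < 2; nid++) {
                /* A big node that is mostly lowmem wants node order. */
                if (low[nid] && total[nid] > average &&
                    low[nid] > total[nid] * 70 / 100) {
                    order = "node";
                    break;
                }
            }
        }
        printf("default numa_zonelist_order: %s\n", order);    /* "zone" here */
        return 0;
    }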
3608 3608
3609 static void set_zonelist_order(void) 3609 static void set_zonelist_order(void)
3610 { 3610 {
3611 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3611 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3612 current_zonelist_order = default_zonelist_order(); 3612 current_zonelist_order = default_zonelist_order();
3613 else 3613 else
3614 current_zonelist_order = user_zonelist_order; 3614 current_zonelist_order = user_zonelist_order;
3615 } 3615 }
3616 3616
3617 static void build_zonelists(pg_data_t *pgdat) 3617 static void build_zonelists(pg_data_t *pgdat)
3618 { 3618 {
3619 int j, node, load; 3619 int j, node, load;
3620 enum zone_type i; 3620 enum zone_type i;
3621 nodemask_t used_mask; 3621 nodemask_t used_mask;
3622 int local_node, prev_node; 3622 int local_node, prev_node;
3623 struct zonelist *zonelist; 3623 struct zonelist *zonelist;
3624 int order = current_zonelist_order; 3624 int order = current_zonelist_order;
3625 3625
3626 /* initialize zonelists */ 3626 /* initialize zonelists */
3627 for (i = 0; i < MAX_ZONELISTS; i++) { 3627 for (i = 0; i < MAX_ZONELISTS; i++) {
3628 zonelist = pgdat->node_zonelists + i; 3628 zonelist = pgdat->node_zonelists + i;
3629 zonelist->_zonerefs[0].zone = NULL; 3629 zonelist->_zonerefs[0].zone = NULL;
3630 zonelist->_zonerefs[0].zone_idx = 0; 3630 zonelist->_zonerefs[0].zone_idx = 0;
3631 } 3631 }
3632 3632
3633 /* NUMA-aware ordering of nodes */ 3633 /* NUMA-aware ordering of nodes */
3634 local_node = pgdat->node_id; 3634 local_node = pgdat->node_id;
3635 load = nr_online_nodes; 3635 load = nr_online_nodes;
3636 prev_node = local_node; 3636 prev_node = local_node;
3637 nodes_clear(used_mask); 3637 nodes_clear(used_mask);
3638 3638
3639 memset(node_order, 0, sizeof(node_order)); 3639 memset(node_order, 0, sizeof(node_order));
3640 j = 0; 3640 j = 0;
3641 3641
3642 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3642 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3643 /* 3643 /*
3644 * We don't want to pressure a particular node. 3644 * We don't want to pressure a particular node.
3645 * So we add a penalty to the first node in the same 3645 * So we add a penalty to the first node in the same
3646 * distance group to make the ordering round-robin. 3646 * distance group to make the ordering round-robin.
3647 */ 3647 */
3648 if (node_distance(local_node, node) != 3648 if (node_distance(local_node, node) !=
3649 node_distance(local_node, prev_node)) 3649 node_distance(local_node, prev_node))
3650 node_load[node] = load; 3650 node_load[node] = load;
3651 3651
3652 prev_node = node; 3652 prev_node = node;
3653 load--; 3653 load--;
3654 if (order == ZONELIST_ORDER_NODE) 3654 if (order == ZONELIST_ORDER_NODE)
3655 build_zonelists_in_node_order(pgdat, node); 3655 build_zonelists_in_node_order(pgdat, node);
3656 else 3656 else
3657 node_order[j++] = node; /* remember order */ 3657 node_order[j++] = node; /* remember order */
3658 } 3658 }
3659 3659
3660 if (order == ZONELIST_ORDER_ZONE) { 3660 if (order == ZONELIST_ORDER_ZONE) {
3661 /* calculate node order -- i.e., DMA last! */ 3661 /* calculate node order -- i.e., DMA last! */
3662 build_zonelists_in_zone_order(pgdat, j); 3662 build_zonelists_in_zone_order(pgdat, j);
3663 } 3663 }
3664 3664
3665 build_thisnode_zonelists(pgdat); 3665 build_thisnode_zonelists(pgdat);
3666 } 3666 }
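Editor's note: the node_load penalty in the loop above is what keeps equidistant nodes from all landing in the same position of every zonelist. The following userspace sketch replays just that penalty rule on a made-up distance table and visit order (both hypothetical, as is the choice of local node 0), to show that only the first node of each new distance group is charged:

#include <stdio.h>

/* Illustrative sketch, not kernel code: replay the penalty rule from
 * build_zonelists() above on a hypothetical node ordering. */
#define NR_NODES 4

static const int distance[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 40 },
        { 20, 10, 40, 20 },
        { 20, 40, 10, 20 },
        { 40, 20, 20, 10 },
};

int main(void)
{
        /* pretend find_next_best_node() returned nodes in this order
         * for local node 0 */
        const int best_order[NR_NODES] = { 0, 1, 2, 3 };
        int node_load[NR_NODES] = { 0 };
        int local_node = 0, prev_node = 0, load = NR_NODES;

        for (int i = 0; i < NR_NODES; i++) {
                int node = best_order[i];

                /* same rule as above: the first node of each new
                 * distance group takes the penalty */
                if (distance[local_node][node] !=
                    distance[local_node][prev_node])
                        node_load[node] = load;
                prev_node = node;
                load--;
        }

        for (int n = 0; n < NR_NODES; n++)
                printf("node %d: load penalty %d\n", n, node_load[n]);
        return 0;
}

Compiled and run, this prints a non-zero penalty only for nodes 1 and 3, the first members of the distance-20 and distance-40 groups.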
3667 3667
3668 /* Construct the zonelist performance cache - see further mmzone.h */ 3668 /* Construct the zonelist performance cache - see further mmzone.h */
3669 static void build_zonelist_cache(pg_data_t *pgdat) 3669 static void build_zonelist_cache(pg_data_t *pgdat)
3670 { 3670 {
3671 struct zonelist *zonelist; 3671 struct zonelist *zonelist;
3672 struct zonelist_cache *zlc; 3672 struct zonelist_cache *zlc;
3673 struct zoneref *z; 3673 struct zoneref *z;
3674 3674
3675 zonelist = &pgdat->node_zonelists[0]; 3675 zonelist = &pgdat->node_zonelists[0];
3676 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3676 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3677 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3677 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3678 for (z = zonelist->_zonerefs; z->zone; z++) 3678 for (z = zonelist->_zonerefs; z->zone; z++)
3679 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3679 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3680 } 3680 }
3681 3681
3682 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3682 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3683 /* 3683 /*
3684 * Return node id of node used for "local" allocations. 3684 * Return node id of node used for "local" allocations.
3685 * I.e., first node id of first zone in arg node's generic zonelist. 3685 * I.e., first node id of first zone in arg node's generic zonelist.
3686 * Used for initializing percpu 'numa_mem', which is used primarily 3686 * Used for initializing percpu 'numa_mem', which is used primarily
3687 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3687 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3688 */ 3688 */
3689 int local_memory_node(int node) 3689 int local_memory_node(int node)
3690 { 3690 {
3691 struct zone *zone; 3691 struct zone *zone;
3692 3692
3693 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3693 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3694 gfp_zone(GFP_KERNEL), 3694 gfp_zone(GFP_KERNEL),
3695 NULL, 3695 NULL,
3696 &zone); 3696 &zone);
3697 return zone->node; 3697 return zone->node;
3698 } 3698 }
3699 #endif 3699 #endif
3700 3700
3701 #else /* CONFIG_NUMA */ 3701 #else /* CONFIG_NUMA */
3702 3702
3703 static void set_zonelist_order(void) 3703 static void set_zonelist_order(void)
3704 { 3704 {
3705 current_zonelist_order = ZONELIST_ORDER_ZONE; 3705 current_zonelist_order = ZONELIST_ORDER_ZONE;
3706 } 3706 }
3707 3707
3708 static void build_zonelists(pg_data_t *pgdat) 3708 static void build_zonelists(pg_data_t *pgdat)
3709 { 3709 {
3710 int node, local_node; 3710 int node, local_node;
3711 enum zone_type j; 3711 enum zone_type j;
3712 struct zonelist *zonelist; 3712 struct zonelist *zonelist;
3713 3713
3714 local_node = pgdat->node_id; 3714 local_node = pgdat->node_id;
3715 3715
3716 zonelist = &pgdat->node_zonelists[0]; 3716 zonelist = &pgdat->node_zonelists[0];
3717 j = build_zonelists_node(pgdat, zonelist, 0); 3717 j = build_zonelists_node(pgdat, zonelist, 0);
3718 3718
3719 /* 3719 /*
3720 * Now we build the zonelist so that it contains the zones 3720 * Now we build the zonelist so that it contains the zones
3721 * of all the other nodes. 3721 * of all the other nodes.
3722 * We don't want to pressure a particular node, so when 3722 * We don't want to pressure a particular node, so when
3723 * building the zones for node N, we make sure that the 3723 * building the zones for node N, we make sure that the
3724 * zones coming right after the local ones are those from 3724 * zones coming right after the local ones are those from
3725 * node N+1 (wrapping around past the last node) 3725 * node N+1 (wrapping around past the last node)
3726 */ 3726 */
3727 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3727 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3728 if (!node_online(node)) 3728 if (!node_online(node))
3729 continue; 3729 continue;
3730 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3730 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3731 } 3731 }
3732 for (node = 0; node < local_node; node++) { 3732 for (node = 0; node < local_node; node++) {
3733 if (!node_online(node)) 3733 if (!node_online(node))
3734 continue; 3734 continue;
3735 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3735 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3736 } 3736 }
3737 3737
3738 zonelist->_zonerefs[j].zone = NULL; 3738 zonelist->_zonerefs[j].zone = NULL;
3739 zonelist->_zonerefs[j].zone_idx = 0; 3739 zonelist->_zonerefs[j].zone_idx = 0;
3740 } 3740 }
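Editor's note: for the non-NUMA build above, the two loops visit the remaining nodes in a simple wrapped order starting just after the local node. A tiny sketch of that visiting order, assuming four possible nodes that are all online (the node count and local node id are made up):

#include <stdio.h>

/* Sketch only: print the order in which the two loops above would
 * visit remote nodes for a given local node. */
#define MAX_NUMNODES 4

int main(void)
{
        int local_node = 2;

        printf("local node %d, remote order:", local_node);
        for (int node = local_node + 1; node < MAX_NUMNODES; node++)
                printf(" %d", node);
        for (int node = 0; node < local_node; node++)
                printf(" %d", node);
        printf("\n");   /* -> local node 2, remote order: 3 0 1 */
        return 0;
}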
3741 3741
3742 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3742 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3743 static void build_zonelist_cache(pg_data_t *pgdat) 3743 static void build_zonelist_cache(pg_data_t *pgdat)
3744 { 3744 {
3745 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3745 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3746 } 3746 }
3747 3747
3748 #endif /* CONFIG_NUMA */ 3748 #endif /* CONFIG_NUMA */
3749 3749
3750 /* 3750 /*
3751 * Boot pageset table. One per cpu which is going to be used for all 3751 * Boot pageset table. One per cpu which is going to be used for all
3752 * zones and all nodes. The parameters will be set in such a way 3752 * zones and all nodes. The parameters will be set in such a way
3753 * that an item put on a list will immediately be handed over to 3753 * that an item put on a list will immediately be handed over to
3754 * the buddy list. This is safe since pageset manipulation is done 3754 * the buddy list. This is safe since pageset manipulation is done
3755 * with interrupts disabled. 3755 * with interrupts disabled.
3756 * 3756 *
3757 * The boot_pagesets must be kept even after bootup is complete for 3757 * The boot_pagesets must be kept even after bootup is complete for
3758 * unused processors and/or zones. They do play a role for bootstrapping 3758 * unused processors and/or zones. They do play a role for bootstrapping
3759 * hotplugged processors. 3759 * hotplugged processors.
3760 * 3760 *
3761 * zoneinfo_show() and maybe other functions do 3761 * zoneinfo_show() and maybe other functions do
3762 * not check if the processor is online before following the pageset pointer. 3762 * not check if the processor is online before following the pageset pointer.
3763 * Other parts of the kernel may not check if the zone is available. 3763 * Other parts of the kernel may not check if the zone is available.
3764 */ 3764 */
3765 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3765 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3766 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3766 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3767 static void setup_zone_pageset(struct zone *zone); 3767 static void setup_zone_pageset(struct zone *zone);
3768 3768
3769 /* 3769 /*
3770 * Global mutex to protect against size modification of zonelists 3770 * Global mutex to protect against size modification of zonelists
3771 * as well as to serialize pageset setup for the new populated zone. 3771 * as well as to serialize pageset setup for the new populated zone.
3772 */ 3772 */
3773 DEFINE_MUTEX(zonelists_mutex); 3773 DEFINE_MUTEX(zonelists_mutex);
3774 3774
3775 /* the return value is an int only to match stop_machine() */ 3775 /* the return value is an int only to match stop_machine() */
3776 static int __build_all_zonelists(void *data) 3776 static int __build_all_zonelists(void *data)
3777 { 3777 {
3778 int nid; 3778 int nid;
3779 int cpu; 3779 int cpu;
3780 pg_data_t *self = data; 3780 pg_data_t *self = data;
3781 3781
3782 #ifdef CONFIG_NUMA 3782 #ifdef CONFIG_NUMA
3783 memset(node_load, 0, sizeof(node_load)); 3783 memset(node_load, 0, sizeof(node_load));
3784 #endif 3784 #endif
3785 3785
3786 if (self && !node_online(self->node_id)) { 3786 if (self && !node_online(self->node_id)) {
3787 build_zonelists(self); 3787 build_zonelists(self);
3788 build_zonelist_cache(self); 3788 build_zonelist_cache(self);
3789 } 3789 }
3790 3790
3791 for_each_online_node(nid) { 3791 for_each_online_node(nid) {
3792 pg_data_t *pgdat = NODE_DATA(nid); 3792 pg_data_t *pgdat = NODE_DATA(nid);
3793 3793
3794 build_zonelists(pgdat); 3794 build_zonelists(pgdat);
3795 build_zonelist_cache(pgdat); 3795 build_zonelist_cache(pgdat);
3796 } 3796 }
3797 3797
3798 /* 3798 /*
3799 * Initialize the boot_pagesets that are going to be used 3799 * Initialize the boot_pagesets that are going to be used
3800 * for bootstrapping processors. The real pagesets for 3800 * for bootstrapping processors. The real pagesets for
3801 * each zone will be allocated later when the per cpu 3801 * each zone will be allocated later when the per cpu
3802 * allocator is available. 3802 * allocator is available.
3803 * 3803 *
3804 * boot_pagesets are used also for bootstrapping offline 3804 * boot_pagesets are used also for bootstrapping offline
3805 * cpus if the system is already booted because the pagesets 3805 * cpus if the system is already booted because the pagesets
3806 * are needed to initialize allocators on a specific cpu too. 3806 * are needed to initialize allocators on a specific cpu too.
3807 * F.e. the percpu allocator needs the page allocator which 3807 * F.e. the percpu allocator needs the page allocator which
3808 * needs the percpu allocator in order to allocate its pagesets 3808 * needs the percpu allocator in order to allocate its pagesets
3809 * (a chicken-egg dilemma). 3809 * (a chicken-egg dilemma).
3810 */ 3810 */
3811 for_each_possible_cpu(cpu) { 3811 for_each_possible_cpu(cpu) {
3812 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3812 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3813 3813
3814 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3814 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3815 /* 3815 /*
3816 * We now know the "local memory node" for each node-- 3816 * We now know the "local memory node" for each node--
3817 * i.e., the node of the first zone in the generic zonelist. 3817 * i.e., the node of the first zone in the generic zonelist.
3818 * Set up numa_mem percpu variable for on-line cpus. During 3818 * Set up numa_mem percpu variable for on-line cpus. During
3819 * boot, only the boot cpu should be on-line; we'll init the 3819 * boot, only the boot cpu should be on-line; we'll init the
3820 * secondary cpus' numa_mem as they come on-line. During 3820 * secondary cpus' numa_mem as they come on-line. During
3821 * node/memory hotplug, we'll fixup all on-line cpus. 3821 * node/memory hotplug, we'll fixup all on-line cpus.
3822 */ 3822 */
3823 if (cpu_online(cpu)) 3823 if (cpu_online(cpu))
3824 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3824 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3825 #endif 3825 #endif
3826 } 3826 }
3827 3827
3828 return 0; 3828 return 0;
3829 } 3829 }
3830 3830
3831 /* 3831 /*
3832 * Called with zonelists_mutex held always 3832 * Called with zonelists_mutex held always
3833 * unless system_state == SYSTEM_BOOTING. 3833 * unless system_state == SYSTEM_BOOTING.
3834 */ 3834 */
3835 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3835 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3836 { 3836 {
3837 set_zonelist_order(); 3837 set_zonelist_order();
3838 3838
3839 if (system_state == SYSTEM_BOOTING) { 3839 if (system_state == SYSTEM_BOOTING) {
3840 __build_all_zonelists(NULL); 3840 __build_all_zonelists(NULL);
3841 mminit_verify_zonelist(); 3841 mminit_verify_zonelist();
3842 cpuset_init_current_mems_allowed(); 3842 cpuset_init_current_mems_allowed();
3843 } else { 3843 } else {
3844 #ifdef CONFIG_MEMORY_HOTPLUG 3844 #ifdef CONFIG_MEMORY_HOTPLUG
3845 if (zone) 3845 if (zone)
3846 setup_zone_pageset(zone); 3846 setup_zone_pageset(zone);
3847 #endif 3847 #endif
3848 /* we have to stop all cpus to guarantee there is no user 3848 /* we have to stop all cpus to guarantee there is no user
3849 of the zonelist */ 3849 of the zonelist */
3850 stop_machine(__build_all_zonelists, pgdat, NULL); 3850 stop_machine(__build_all_zonelists, pgdat, NULL);
3851 /* cpuset refresh routine should be here */ 3851 /* cpuset refresh routine should be here */
3852 } 3852 }
3853 vm_total_pages = nr_free_pagecache_pages(); 3853 vm_total_pages = nr_free_pagecache_pages();
3854 /* 3854 /*
3855 * Disable grouping by mobility if the number of pages in the 3855 * Disable grouping by mobility if the number of pages in the
3856 * system is too low to allow the mechanism to work. It would be 3856 * system is too low to allow the mechanism to work. It would be
3857 * more accurate, but expensive to check per-zone. This check is 3857 * more accurate, but expensive to check per-zone. This check is
3858 * made on memory-hotadd so a system can start with mobility 3858 * made on memory-hotadd so a system can start with mobility
3859 * disabled and enable it later 3859 * disabled and enable it later
3860 */ 3860 */
3861 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3861 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3862 page_group_by_mobility_disabled = 1; 3862 page_group_by_mobility_disabled = 1;
3863 else 3863 else
3864 page_group_by_mobility_disabled = 0; 3864 page_group_by_mobility_disabled = 0;
3865 3865
3866 printk("Built %i zonelists in %s order, mobility grouping %s. " 3866 printk("Built %i zonelists in %s order, mobility grouping %s. "
3867 "Total pages: %ld\n", 3867 "Total pages: %ld\n",
3868 nr_online_nodes, 3868 nr_online_nodes,
3869 zonelist_order_name[current_zonelist_order], 3869 zonelist_order_name[current_zonelist_order],
3870 page_group_by_mobility_disabled ? "off" : "on", 3870 page_group_by_mobility_disabled ? "off" : "on",
3871 vm_total_pages); 3871 vm_total_pages);
3872 #ifdef CONFIG_NUMA 3872 #ifdef CONFIG_NUMA
3873 printk("Policy zone: %s\n", zone_names[policy_zone]); 3873 printk("Policy zone: %s\n", zone_names[policy_zone]);
3874 #endif 3874 #endif
3875 } 3875 }
3876 3876
3877 /* 3877 /*
3878 * Helper functions to size the waitqueue hash table. 3878 * Helper functions to size the waitqueue hash table.
3879 * Essentially these want to choose hash table sizes sufficiently 3879 * Essentially these want to choose hash table sizes sufficiently
3880 * large so that collisions trying to wait on pages are rare. 3880 * large so that collisions trying to wait on pages are rare.
3881 * But in fact, the number of active page waitqueues on typical 3881 * But in fact, the number of active page waitqueues on typical
3882 * systems is ridiculously low, less than 200. So this is 3882 * systems is ridiculously low, less than 200. So this is
3883 * conservative, even though it seems large. 3883 * conservative, even though it seems large.
3884 * 3884 *
3885 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3885 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3886 * waitqueues, i.e. the size of the waitq table given the number of pages. 3886 * waitqueues, i.e. the size of the waitq table given the number of pages.
3887 */ 3887 */
3888 #define PAGES_PER_WAITQUEUE 256 3888 #define PAGES_PER_WAITQUEUE 256
3889 3889
3890 #ifndef CONFIG_MEMORY_HOTPLUG 3890 #ifndef CONFIG_MEMORY_HOTPLUG
3891 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3891 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3892 { 3892 {
3893 unsigned long size = 1; 3893 unsigned long size = 1;
3894 3894
3895 pages /= PAGES_PER_WAITQUEUE; 3895 pages /= PAGES_PER_WAITQUEUE;
3896 3896
3897 while (size < pages) 3897 while (size < pages)
3898 size <<= 1; 3898 size <<= 1;
3899 3899
3900 /* 3900 /*
3901 * Once we have dozens or even hundreds of threads sleeping 3901 * Once we have dozens or even hundreds of threads sleeping
3902 * on IO we've got bigger problems than wait queue collision. 3902 * on IO we've got bigger problems than wait queue collision.
3903 * Limit the size of the wait table to a reasonable size. 3903 * Limit the size of the wait table to a reasonable size.
3904 */ 3904 */
3905 size = min(size, 4096UL); 3905 size = min(size, 4096UL);
3906 3906
3907 return max(size, 4UL); 3907 return max(size, 4UL);
3908 } 3908 }
3909 #else 3909 #else
3910 /* 3910 /*
3911 * A zone's size might be changed by hot-add, so it is not possible to determine 3911 * A zone's size might be changed by hot-add, so it is not possible to determine
3912 * a suitable size for its wait_table. So we use the maximum size now. 3912 * a suitable size for its wait_table. So we use the maximum size now.
3913 * 3913 *
3914 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3914 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3915 * 3915 *
3916 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3916 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3917 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3917 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3918 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3918 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3919 * 3919 *
3920 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3920 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3921 * or more by the traditional way. (See above). It equals: 3921 * or more by the traditional way. (See above). It equals:
3922 * 3922 *
3923 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3923 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3924 * ia64(16K page size) : = ( 8G + 4M)byte. 3924 * ia64(16K page size) : = ( 8G + 4M)byte.
3925 * powerpc (64K page size) : = (32G +16M)byte. 3925 * powerpc (64K page size) : = (32G +16M)byte.
3926 */ 3926 */
3927 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3927 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3928 { 3928 {
3929 return 4096UL; 3929 return 4096UL;
3930 } 3930 }
3931 #endif 3931 #endif
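Editor's note: as a concrete check of the !CONFIG_MEMORY_HOTPLUG sizing above, this userspace sketch recomputes wait_table_hash_nr_entries() for a few example zone sizes; the zone sizes are arbitrary and min()/max() are spelled out as plain comparisons:

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* Userspace copy of the non-hotplug variant above, for illustration only. */
static unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
        unsigned long size = 1;

        pages /= PAGES_PER_WAITQUEUE;
        while (size < pages)
                size <<= 1;
        if (size > 4096UL)
                size = 4096UL;
        return size < 4UL ? 4UL : size;
}

int main(void)
{
        /* e.g. 128MB, 1GB and 4GB zones with 4KB pages (assumed) */
        unsigned long zones[] = { 32768, 262144, 1048576 };

        for (int i = 0; i < 3; i++)
                printf("%lu pages -> %lu waitqueues\n",
                       zones[i], wait_table_hash_nr_entries(zones[i]));
        return 0;       /* prints 128, 1024, 4096 */
}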
3932 3932
3933 /* 3933 /*
3934 * This is an integer logarithm so that shifts can be used later 3934 * This is an integer logarithm so that shifts can be used later
3935 * to extract the more random high bits from the multiplicative 3935 * to extract the more random high bits from the multiplicative
3936 * hash function before the remainder is taken. 3936 * hash function before the remainder is taken.
3937 */ 3937 */
3938 static inline unsigned long wait_table_bits(unsigned long size) 3938 static inline unsigned long wait_table_bits(unsigned long size)
3939 { 3939 {
3940 return ffz(~size); 3940 return ffz(~size);
3941 } 3941 }
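Editor's note: for the power-of-two sizes produced above, ffz(~size) is simply log2(size): ~size has exactly one zero bit, at the position of size's single one bit. A one-line userspace check, using the GCC builtin as a stand-in for ffz():

#include <stdio.h>

int main(void)
{
        unsigned long size = 4096;      /* a power-of-two table size */

        /* for a power of two, ffz(~size) == __builtin_ctzl(size) == log2(size) */
        printf("wait_table_bits(%lu) = %d\n", size, __builtin_ctzl(size));
        return 0;       /* prints: wait_table_bits(4096) = 12 */
}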
3942 3942
3943 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3943 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3944 3944
3945 /* 3945 /*
3946 * Check if a pageblock contains reserved pages 3946 * Check if a pageblock contains reserved pages
3947 */ 3947 */
3948 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3948 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3949 { 3949 {
3950 unsigned long pfn; 3950 unsigned long pfn;
3951 3951
3952 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3952 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3953 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3953 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3954 return 1; 3954 return 1;
3955 } 3955 }
3956 return 0; 3956 return 0;
3957 } 3957 }
3958 3958
3959 /* 3959 /*
3960 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3960 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3961 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3961 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3962 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3962 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3963 * higher will lead to a bigger reserve which will get freed as contiguous 3963 * higher will lead to a bigger reserve which will get freed as contiguous
3964 * blocks as reclaim kicks in 3964 * blocks as reclaim kicks in
3965 */ 3965 */
3966 static void setup_zone_migrate_reserve(struct zone *zone) 3966 static void setup_zone_migrate_reserve(struct zone *zone)
3967 { 3967 {
3968 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3968 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3969 struct page *page; 3969 struct page *page;
3970 unsigned long block_migratetype; 3970 unsigned long block_migratetype;
3971 int reserve; 3971 int reserve;
3972 int old_reserve; 3972 int old_reserve;
3973 3973
3974 /* 3974 /*
3975 * Get the start pfn, end pfn and the number of blocks to reserve 3975 * Get the start pfn, end pfn and the number of blocks to reserve
3976 * We have to be careful to be aligned to pageblock_nr_pages to 3976 * We have to be careful to be aligned to pageblock_nr_pages to
3977 * make sure that we always check pfn_valid for the first page in 3977 * make sure that we always check pfn_valid for the first page in
3978 * the block. 3978 * the block.
3979 */ 3979 */
3980 start_pfn = zone->zone_start_pfn; 3980 start_pfn = zone->zone_start_pfn;
3981 end_pfn = zone_end_pfn(zone); 3981 end_pfn = zone_end_pfn(zone);
3982 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3982 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3983 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3983 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3984 pageblock_order; 3984 pageblock_order;
3985 3985
3986 /* 3986 /*
3987 * Reserve blocks are generally in place to help high-order atomic 3987 * Reserve blocks are generally in place to help high-order atomic
3988 * allocations that are short-lived. A min_free_kbytes value that 3988 * allocations that are short-lived. A min_free_kbytes value that
3989 * would result in more than 2 reserve blocks for atomic allocations 3989 * would result in more than 2 reserve blocks for atomic allocations
3990 * is assumed to be in place to help anti-fragmentation for the 3990 * is assumed to be in place to help anti-fragmentation for the
3991 * future allocation of hugepages at runtime. 3991 * future allocation of hugepages at runtime.
3992 */ 3992 */
3993 reserve = min(2, reserve); 3993 reserve = min(2, reserve);
3994 old_reserve = zone->nr_migrate_reserve_block; 3994 old_reserve = zone->nr_migrate_reserve_block;
3995 3995
3996 /* On memory hot-add, we almost always need to do nothing */ 3996 /* On memory hot-add, we almost always need to do nothing */
3997 if (reserve == old_reserve) 3997 if (reserve == old_reserve)
3998 return; 3998 return;
3999 zone->nr_migrate_reserve_block = reserve; 3999 zone->nr_migrate_reserve_block = reserve;
4000 4000
4001 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 4001 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
4002 if (!pfn_valid(pfn)) 4002 if (!pfn_valid(pfn))
4003 continue; 4003 continue;
4004 page = pfn_to_page(pfn); 4004 page = pfn_to_page(pfn);
4005 4005
4006 /* Watch out for overlapping nodes */ 4006 /* Watch out for overlapping nodes */
4007 if (page_to_nid(page) != zone_to_nid(zone)) 4007 if (page_to_nid(page) != zone_to_nid(zone))
4008 continue; 4008 continue;
4009 4009
4010 block_migratetype = get_pageblock_migratetype(page); 4010 block_migratetype = get_pageblock_migratetype(page);
4011 4011
4012 /* Only test what is necessary when the reserves are not met */ 4012 /* Only test what is necessary when the reserves are not met */
4013 if (reserve > 0) { 4013 if (reserve > 0) {
4014 /* 4014 /*
4015 * Blocks with reserved pages will never be freed, so 4015 * Blocks with reserved pages will never be freed, so
4016 * skip them. 4016 * skip them.
4017 */ 4017 */
4018 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 4018 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
4019 if (pageblock_is_reserved(pfn, block_end_pfn)) 4019 if (pageblock_is_reserved(pfn, block_end_pfn))
4020 continue; 4020 continue;
4021 4021
4022 /* If this block is reserved, account for it */ 4022 /* If this block is reserved, account for it */
4023 if (block_migratetype == MIGRATE_RESERVE) { 4023 if (block_migratetype == MIGRATE_RESERVE) {
4024 reserve--; 4024 reserve--;
4025 continue; 4025 continue;
4026 } 4026 }
4027 4027
4028 /* Suitable for reserving if this block is movable */ 4028 /* Suitable for reserving if this block is movable */
4029 if (block_migratetype == MIGRATE_MOVABLE) { 4029 if (block_migratetype == MIGRATE_MOVABLE) {
4030 set_pageblock_migratetype(page, 4030 set_pageblock_migratetype(page,
4031 MIGRATE_RESERVE); 4031 MIGRATE_RESERVE);
4032 move_freepages_block(zone, page, 4032 move_freepages_block(zone, page,
4033 MIGRATE_RESERVE); 4033 MIGRATE_RESERVE);
4034 reserve--; 4034 reserve--;
4035 continue; 4035 continue;
4036 } 4036 }
4037 } else if (!old_reserve) { 4037 } else if (!old_reserve) {
4038 /* 4038 /*
4039 * At boot time we don't need to scan the whole zone 4039 * At boot time we don't need to scan the whole zone
4040 * for turning off MIGRATE_RESERVE. 4040 * for turning off MIGRATE_RESERVE.
4041 */ 4041 */
4042 break; 4042 break;
4043 } 4043 }
4044 4044
4045 /* 4045 /*
4046 * If the reserve is met and this is a previous reserved block, 4046 * If the reserve is met and this is a previous reserved block,
4047 * take it back 4047 * take it back
4048 */ 4048 */
4049 if (block_migratetype == MIGRATE_RESERVE) { 4049 if (block_migratetype == MIGRATE_RESERVE) {
4050 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4050 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4051 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4051 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4052 } 4052 }
4053 } 4053 }
4054 } 4054 }
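Editor's note: to make the reserve sizing at the top of setup_zone_migrate_reserve() concrete, here is a small hedged calculation. The pageblock_order of 9 (512 pages per block) and the example watermarks are assumptions for illustration, not values taken from this patch:

#include <stdio.h>

#define PAGEBLOCK_ORDER     9                       /* assumed */
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

int main(void)
{
        unsigned long min_wmarks[] = { 100, 1000, 10000 };  /* example watermarks */

        for (int i = 0; i < 3; i++) {
                unsigned long reserve =
                        roundup_ul(min_wmarks[i], PAGEBLOCK_NR_PAGES)
                                >> PAGEBLOCK_ORDER;
                if (reserve > 2)        /* reserve = min(2, reserve) */
                        reserve = 2;
                printf("min_wmark %5lu pages -> %lu reserve block(s)\n",
                       min_wmarks[i], reserve);
        }
        return 0;       /* 100 -> 1, 1000 -> 2, 10000 -> capped at 2 */
}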
4055 4055
4056 /* 4056 /*
4057 * Initially all pages are reserved - free ones are freed 4057 * Initially all pages are reserved - free ones are freed
4058 * up by free_all_bootmem() once the early boot process is 4058 * up by free_all_bootmem() once the early boot process is
4059 * done. Non-atomic initialization, single-pass. 4059 * done. Non-atomic initialization, single-pass.
4060 */ 4060 */
4061 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4061 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4062 unsigned long start_pfn, enum memmap_context context) 4062 unsigned long start_pfn, enum memmap_context context)
4063 { 4063 {
4064 struct page *page; 4064 struct page *page;
4065 unsigned long end_pfn = start_pfn + size; 4065 unsigned long end_pfn = start_pfn + size;
4066 unsigned long pfn; 4066 unsigned long pfn;
4067 struct zone *z; 4067 struct zone *z;
4068 4068
4069 if (highest_memmap_pfn < end_pfn - 1) 4069 if (highest_memmap_pfn < end_pfn - 1)
4070 highest_memmap_pfn = end_pfn - 1; 4070 highest_memmap_pfn = end_pfn - 1;
4071 4071
4072 z = &NODE_DATA(nid)->node_zones[zone]; 4072 z = &NODE_DATA(nid)->node_zones[zone];
4073 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4073 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4074 /* 4074 /*
4075 * There can be holes in boot-time mem_map[]s 4075 * There can be holes in boot-time mem_map[]s
4076 * handed to this function. They do not 4076 * handed to this function. They do not
4077 * exist on hotplugged memory. 4077 * exist on hotplugged memory.
4078 */ 4078 */
4079 if (context == MEMMAP_EARLY) { 4079 if (context == MEMMAP_EARLY) {
4080 if (!early_pfn_valid(pfn)) 4080 if (!early_pfn_valid(pfn))
4081 continue; 4081 continue;
4082 if (!early_pfn_in_nid(pfn, nid)) 4082 if (!early_pfn_in_nid(pfn, nid))
4083 continue; 4083 continue;
4084 } 4084 }
4085 page = pfn_to_page(pfn); 4085 page = pfn_to_page(pfn);
4086 set_page_links(page, zone, nid, pfn); 4086 set_page_links(page, zone, nid, pfn);
4087 mminit_verify_page_links(page, zone, nid, pfn); 4087 mminit_verify_page_links(page, zone, nid, pfn);
4088 init_page_count(page); 4088 init_page_count(page);
4089 page_mapcount_reset(page); 4089 page_mapcount_reset(page);
4090 page_nid_reset_last(page); 4090 page_nid_reset_last(page);
4091 SetPageReserved(page); 4091 SetPageReserved(page);
4092 /* 4092 /*
4093 * Mark the block movable so that blocks are reserved for 4093 * Mark the block movable so that blocks are reserved for
4094 * movable at startup. This will force kernel allocations 4094 * movable at startup. This will force kernel allocations
4095 * to reserve their blocks rather than leaking throughout 4095 * to reserve their blocks rather than leaking throughout
4096 * the address space during boot when many long-lived 4096 * the address space during boot when many long-lived
4097 * kernel allocations are made. Later some blocks near 4097 * kernel allocations are made. Later some blocks near
4098 * the start are marked MIGRATE_RESERVE by 4098 * the start are marked MIGRATE_RESERVE by
4099 * setup_zone_migrate_reserve() 4099 * setup_zone_migrate_reserve()
4100 * 4100 *
4101 * The bitmap is created for the zone's valid pfn range, but the 4101 * The bitmap is created for the zone's valid pfn range, but the
4102 * memmap can be created for invalid pages (for alignment), so 4102 * memmap can be created for invalid pages (for alignment), so
4103 * check here that set_pageblock_migratetype() is not called for 4103 * check here that set_pageblock_migratetype() is not called for
4104 * a pfn outside the zone. 4104 * a pfn outside the zone.
4105 */ 4105 */
4106 if ((z->zone_start_pfn <= pfn) 4106 if ((z->zone_start_pfn <= pfn)
4107 && (pfn < zone_end_pfn(z)) 4107 && (pfn < zone_end_pfn(z))
4108 && !(pfn & (pageblock_nr_pages - 1))) 4108 && !(pfn & (pageblock_nr_pages - 1)))
4109 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4109 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4110 4110
4111 INIT_LIST_HEAD(&page->lru); 4111 INIT_LIST_HEAD(&page->lru);
4112 #ifdef WANT_PAGE_VIRTUAL 4112 #ifdef WANT_PAGE_VIRTUAL
4113 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4113 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4114 if (!is_highmem_idx(zone)) 4114 if (!is_highmem_idx(zone))
4115 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4115 set_page_address(page, __va(pfn << PAGE_SHIFT));
4116 #endif 4116 #endif
4117 } 4117 }
4118 } 4118 }
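Editor's note: the !(pfn & (pageblock_nr_pages - 1)) test in the loop above fires only on the first pfn of each pageblock, so the migratetype bitmap is written once per block rather than once per page. A quick illustration, assuming a pageblock of 512 pages:

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL        /* assumed, i.e. pageblock_order == 9 */

int main(void)
{
        /* print the pfns in a small range that the check in
         * memmap_init_zone() would treat as pageblock starts */
        for (unsigned long pfn = 1000; pfn < 2200; pfn++)
                if (!(pfn & (PAGEBLOCK_NR_PAGES - 1)))
                        printf("pfn %lu starts a pageblock\n", pfn);
        return 0;       /* prints 1024, 1536, 2048 */
}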
4119 4119
4120 static void __meminit zone_init_free_lists(struct zone *zone) 4120 static void __meminit zone_init_free_lists(struct zone *zone)
4121 { 4121 {
4122 unsigned int order, t; 4122 unsigned int order, t;
4123 for_each_migratetype_order(order, t) { 4123 for_each_migratetype_order(order, t) {
4124 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4124 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4125 zone->free_area[order].nr_free = 0; 4125 zone->free_area[order].nr_free = 0;
4126 } 4126 }
4127 } 4127 }
4128 4128
4129 #ifndef __HAVE_ARCH_MEMMAP_INIT 4129 #ifndef __HAVE_ARCH_MEMMAP_INIT
4130 #define memmap_init(size, nid, zone, start_pfn) \ 4130 #define memmap_init(size, nid, zone, start_pfn) \
4131 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4131 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4132 #endif 4132 #endif
4133 4133
4134 static int zone_batchsize(struct zone *zone) 4134 static int zone_batchsize(struct zone *zone)
4135 { 4135 {
4136 #ifdef CONFIG_MMU 4136 #ifdef CONFIG_MMU
4137 int batch; 4137 int batch;
4138 4138
4139 /* 4139 /*
4140 * The per-cpu-pages pools are set to around 1000th of the 4140 * The per-cpu-pages pools are set to around 1000th of the
4141 * size of the zone. But no more than 1/2 of a meg. 4141 * size of the zone. But no more than 1/2 of a meg.
4142 * 4142 *
4143 * OK, so we don't know how big the cache is. So guess. 4143 * OK, so we don't know how big the cache is. So guess.
4144 */ 4144 */
4145 batch = zone->managed_pages / 1024; 4145 batch = zone->managed_pages / 1024;
4146 if (batch * PAGE_SIZE > 512 * 1024) 4146 if (batch * PAGE_SIZE > 512 * 1024)
4147 batch = (512 * 1024) / PAGE_SIZE; 4147 batch = (512 * 1024) / PAGE_SIZE;
4148 batch /= 4; /* We effectively *= 4 below */ 4148 batch /= 4; /* We effectively *= 4 below */
4149 if (batch < 1) 4149 if (batch < 1)
4150 batch = 1; 4150 batch = 1;
4151 4151
4152 /* 4152 /*
4153 * Clamp the batch to a 2^n - 1 value. Having a power 4153 * Clamp the batch to a 2^n - 1 value. Having a power
4154 * of 2 value was found to be more likely to have 4154 * of 2 value was found to be more likely to have
4155 * suboptimal cache aliasing properties in some cases. 4155 * suboptimal cache aliasing properties in some cases.
4156 * 4156 *
4157 * For example if 2 tasks are alternately allocating 4157 * For example if 2 tasks are alternately allocating
4158 * batches of pages, one task can end up with a lot 4158 * batches of pages, one task can end up with a lot
4159 * of pages of one half of the possible page colors 4159 * of pages of one half of the possible page colors
4160 * and the other with pages of the other colors. 4160 * and the other with pages of the other colors.
4161 */ 4161 */
4162 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4162 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4163 4163
4164 return batch; 4164 return batch;
4165 4165
4166 #else 4166 #else
4167 /* The deferral and batching of frees should be suppressed under NOMMU 4167 /* The deferral and batching of frees should be suppressed under NOMMU
4168 * conditions. 4168 * conditions.
4169 * 4169 *
4170 * The problem is that NOMMU needs to be able to allocate large chunks 4170 * The problem is that NOMMU needs to be able to allocate large chunks
4171 * of contiguous memory as there's no hardware page translation to 4171 * of contiguous memory as there's no hardware page translation to
4172 * assemble apparent contiguous memory from discontiguous pages. 4172 * assemble apparent contiguous memory from discontiguous pages.
4173 * 4173 *
4174 * Queueing large contiguous runs of pages for batching, however, 4174 * Queueing large contiguous runs of pages for batching, however,
4175 * causes the pages to actually be freed in smaller chunks. As there 4175 * causes the pages to actually be freed in smaller chunks. As there
4176 * can be a significant delay between the individual batches being 4176 * can be a significant delay between the individual batches being
4177 * recycled, this leads to the once large chunks of space being 4177 * recycled, this leads to the once large chunks of space being
4178 * fragmented and becoming unavailable for high-order allocations. 4178 * fragmented and becoming unavailable for high-order allocations.
4179 */ 4179 */
4180 return 0; 4180 return 0;
4181 #endif 4181 #endif
4182 } 4182 }
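Editor's note: the comment in zone_batchsize() compresses several steps; the sketch below redoes the CONFIG_MMU arithmetic for one example zone so the two clamps and the 2^n - 1 rounding are visible. The managed_pages value and the 4KB PAGE_SIZE are assumptions:

#include <stdio.h>

#define PAGE_SIZE 4096UL        /* assumed */

/* minimal round-down-to-power-of-two helper for the sketch */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
        unsigned long p = 1;
        while (p * 2 <= n)
                p *= 2;
        return p;
}

int main(void)
{
        unsigned long managed_pages = 1000000;  /* ~4GB zone, example */
        unsigned long batch;

        batch = managed_pages / 1024;           /* ~1/1000th of the zone */
        if (batch * PAGE_SIZE > 512 * 1024)     /* cap at 1/2MB worth */
                batch = (512 * 1024) / PAGE_SIZE;
        batch /= 4;                             /* effectively *= 4 later */
        if (batch < 1)
                batch = 1;
        batch = rounddown_pow_of_two(batch + batch / 2) - 1;

        printf("managed_pages=%lu -> batch=%lu\n", managed_pages, batch);
        return 0;       /* prints batch=31 for this example */
}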
4183 4183
4184 /* 4184 /*
4185 * pcp->high and pcp->batch values are related and dependent on one another: 4185 * pcp->high and pcp->batch values are related and dependent on one another:
4186 * ->batch must never be higher than ->high. 4186 * ->batch must never be higher than ->high.
4187 * The following function updates them in a safe manner without read side 4187 * The following function updates them in a safe manner without read side
4188 * locking. 4188 * locking.
4189 * 4189 *
4190 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4190 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4191 * those fields changing asynchronously (according to the above rule). 4191 * those fields changing asynchronously (according to the above rule).
4192 * 4192 *
4193 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 4193 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4194 * outside of boot time (or some other assurance that no concurrent updaters 4194 * outside of boot time (or some other assurance that no concurrent updaters
4195 * exist). 4195 * exist).
4196 */ 4196 */
4197 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4197 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4198 unsigned long batch) 4198 unsigned long batch)
4199 { 4199 {
4200 /* start with a fail safe value for batch */ 4200 /* start with a fail safe value for batch */
4201 pcp->batch = 1; 4201 pcp->batch = 1;
4202 smp_wmb(); 4202 smp_wmb();
4203 4203
4204 /* Update high, then batch, in order */ 4204 /* Update high, then batch, in order */
4205 pcp->high = high; 4205 pcp->high = high;
4206 smp_wmb(); 4206 smp_wmb();
4207 4207
4208 pcp->batch = batch; 4208 pcp->batch = batch;
4209 } 4209 }
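Editor's note: the three ordered stores in pageset_update() are arranged so that, at each intermediate point, the pair a reader could observe still satisfies the rule that ->batch never exceeds ->high. The sketch below walks the same store sequence with example values and asserts that after each step; the single-threaded asserts stand in for a concurrent reader, so this illustrates the ordering argument, not the barriers themselves:

#include <assert.h>
#include <stdio.h>

struct pcp { unsigned long high, batch; };

static void check(const struct pcp *p)
{
        assert(p->batch <= p->high);    /* the rule described above */
}

int main(void)
{
        struct pcp pcp = { .high = 6 * 31, .batch = 31 };   /* example old values */
        unsigned long new_high = 96, new_batch = 24;        /* example new values */

        /* 1. fail-safe batch first: 1 is <= any plausible high */
        pcp.batch = 1;
        check(&pcp);

        /* 2. then the new high */
        pcp.high = new_high;
        check(&pcp);

        /* 3. finally the new batch, which is <= the new high */
        pcp.batch = new_batch;
        check(&pcp);

        printf("high=%lu batch=%lu, rule held after every store\n",
               pcp.high, pcp.batch);
        return 0;
}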
4210 4210
4211 /* a companion to pageset_set_high() */ 4211 /* a companion to pageset_set_high() */
4212 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4212 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4213 { 4213 {
4214 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4214 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4215 } 4215 }
4216 4216
4217 static void pageset_init(struct per_cpu_pageset *p) 4217 static void pageset_init(struct per_cpu_pageset *p)
4218 { 4218 {
4219 struct per_cpu_pages *pcp; 4219 struct per_cpu_pages *pcp;
4220 int migratetype; 4220 int migratetype;
4221 4221
4222 memset(p, 0, sizeof(*p)); 4222 memset(p, 0, sizeof(*p));
4223 4223
4224 pcp = &p->pcp; 4224 pcp = &p->pcp;
4225 pcp->count = 0; 4225 pcp->count = 0;
4226 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4226 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4227 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4227 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4228 } 4228 }
4229 4229
4230 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4230 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4231 { 4231 {
4232 pageset_init(p); 4232 pageset_init(p);
4233 pageset_set_batch(p, batch); 4233 pageset_set_batch(p, batch);
4234 } 4234 }
4235 4235
4236 /* 4236 /*
4237 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4237 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4238 * to the value high for the pageset p. 4238 * to the value high for the pageset p.
4239 */ 4239 */
4240 static void pageset_set_high(struct per_cpu_pageset *p, 4240 static void pageset_set_high(struct per_cpu_pageset *p,
4241 unsigned long high) 4241 unsigned long high)
4242 { 4242 {
4243 unsigned long batch = max(1UL, high / 4); 4243 unsigned long batch = max(1UL, high / 4);
4244 if ((high / 4) > (PAGE_SHIFT * 8)) 4244 if ((high / 4) > (PAGE_SHIFT * 8))
4245 batch = PAGE_SHIFT * 8; 4245 batch = PAGE_SHIFT * 8;
4246 4246
4247 pageset_update(&p->pcp, high, batch); 4247 pageset_update(&p->pcp, high, batch);
4248 } 4248 }
4249 4249
4250 static void pageset_set_high_and_batch(struct zone *zone, 4250 static void pageset_set_high_and_batch(struct zone *zone,
4251 struct per_cpu_pageset *pcp) 4251 struct per_cpu_pageset *pcp)
4252 { 4252 {
4253 if (percpu_pagelist_fraction) 4253 if (percpu_pagelist_fraction)
4254 pageset_set_high(pcp, 4254 pageset_set_high(pcp,
4255 (zone->managed_pages / 4255 (zone->managed_pages /
4256 percpu_pagelist_fraction)); 4256 percpu_pagelist_fraction));
4257 else 4257 else
4258 pageset_set_batch(pcp, zone_batchsize(zone)); 4258 pageset_set_batch(pcp, zone_batchsize(zone));
4259 } 4259 }
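Editor's note: when percpu_pagelist_fraction is non-zero, pageset_set_high() derives both values from the zone size. A numeric example with assumed zone size, sysctl value and a PAGE_SHIFT of 12:

#include <stdio.h>

#define PAGE_SHIFT 12UL         /* assumed, 4KB pages */

int main(void)
{
        unsigned long managed_pages = 1000000;  /* example zone */
        unsigned long fraction = 100;           /* example sysctl value */
        unsigned long high, batch;

        high = managed_pages / fraction;        /* 10000 pages per cpu list */
        batch = high / 4;
        if (batch < 1)
                batch = 1;
        if (high / 4 > PAGE_SHIFT * 8)          /* clamp batch to 96 here */
                batch = PAGE_SHIFT * 8;

        printf("high=%lu batch=%lu\n", high, batch);
        return 0;       /* prints high=10000 batch=96 */
}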
4260 4260
4261 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4261 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4262 { 4262 {
4263 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4263 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4264 4264
4265 pageset_init(pcp); 4265 pageset_init(pcp);
4266 pageset_set_high_and_batch(zone, pcp); 4266 pageset_set_high_and_batch(zone, pcp);
4267 } 4267 }
4268 4268
4269 static void __meminit setup_zone_pageset(struct zone *zone) 4269 static void __meminit setup_zone_pageset(struct zone *zone)
4270 { 4270 {
4271 int cpu; 4271 int cpu;
4272 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4272 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4273 for_each_possible_cpu(cpu) 4273 for_each_possible_cpu(cpu)
4274 zone_pageset_init(zone, cpu); 4274 zone_pageset_init(zone, cpu);
4275 } 4275 }
4276 4276
4277 /* 4277 /*
4278 * Allocate per cpu pagesets and initialize them. 4278 * Allocate per cpu pagesets and initialize them.
4279 * Before this call only boot pagesets were available. 4279 * Before this call only boot pagesets were available.
4280 */ 4280 */
4281 void __init setup_per_cpu_pageset(void) 4281 void __init setup_per_cpu_pageset(void)
4282 { 4282 {
4283 struct zone *zone; 4283 struct zone *zone;
4284 4284
4285 for_each_populated_zone(zone) 4285 for_each_populated_zone(zone)
4286 setup_zone_pageset(zone); 4286 setup_zone_pageset(zone);
4287 } 4287 }
4288 4288
4289 static noinline __init_refok 4289 static noinline __init_refok
4290 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4290 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4291 { 4291 {
4292 int i; 4292 int i;
4293 struct pglist_data *pgdat = zone->zone_pgdat; 4293 struct pglist_data *pgdat = zone->zone_pgdat;
4294 size_t alloc_size; 4294 size_t alloc_size;
4295 4295
4296 /* 4296 /*
4297 * The per-page waitqueue mechanism uses hashed waitqueues 4297 * The per-page waitqueue mechanism uses hashed waitqueues
4298 * per zone. 4298 * per zone.
4299 */ 4299 */
4300 zone->wait_table_hash_nr_entries = 4300 zone->wait_table_hash_nr_entries =
4301 wait_table_hash_nr_entries(zone_size_pages); 4301 wait_table_hash_nr_entries(zone_size_pages);
4302 zone->wait_table_bits = 4302 zone->wait_table_bits =
4303 wait_table_bits(zone->wait_table_hash_nr_entries); 4303 wait_table_bits(zone->wait_table_hash_nr_entries);
4304 alloc_size = zone->wait_table_hash_nr_entries 4304 alloc_size = zone->wait_table_hash_nr_entries
4305 * sizeof(wait_queue_head_t); 4305 * sizeof(wait_queue_head_t);
4306 4306
4307 if (!slab_is_available()) { 4307 if (!slab_is_available()) {
4308 zone->wait_table = (wait_queue_head_t *) 4308 zone->wait_table = (wait_queue_head_t *)
4309 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4309 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4310 } else { 4310 } else {
4311 /* 4311 /*
4312 * This case means that a zone whose size was 0 gets new memory 4312 * This case means that a zone whose size was 0 gets new memory
4313 * via memory hot-add. 4313 * via memory hot-add.
4314 * But it may be the case that a new node was hot-added. In 4314 * But it may be the case that a new node was hot-added. In
4315 * this case vmalloc() will not be able to use this new node's 4315 * this case vmalloc() will not be able to use this new node's
4316 * memory - this wait_table must be initialized to use this new 4316 * memory - this wait_table must be initialized to use this new
4317 * node itself as well. 4317 * node itself as well.
4318 * To use this new node's memory, further consideration will be 4318 * To use this new node's memory, further consideration will be
4319 * necessary. 4319 * necessary.
4320 */ 4320 */
4321 zone->wait_table = vmalloc(alloc_size); 4321 zone->wait_table = vmalloc(alloc_size);
4322 } 4322 }
4323 if (!zone->wait_table) 4323 if (!zone->wait_table)
4324 return -ENOMEM; 4324 return -ENOMEM;
4325 4325
4326 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4326 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4327 init_waitqueue_head(zone->wait_table + i); 4327 init_waitqueue_head(zone->wait_table + i);
4328 4328
4329 return 0; 4329 return 0;
4330 } 4330 }
4331 4331
4332 static __meminit void zone_pcp_init(struct zone *zone) 4332 static __meminit void zone_pcp_init(struct zone *zone)
4333 { 4333 {
4334 /* 4334 /*
4335 * per cpu subsystem is not up at this point. The following code 4335 * per cpu subsystem is not up at this point. The following code
4336 * relies on the ability of the linker to provide the 4336 * relies on the ability of the linker to provide the
4337 * offset of a (static) per cpu variable into the per cpu area. 4337 * offset of a (static) per cpu variable into the per cpu area.
4338 */ 4338 */
4339 zone->pageset = &boot_pageset; 4339 zone->pageset = &boot_pageset;
4340 4340
4341 if (zone->present_pages) 4341 if (zone->present_pages)
4342 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4342 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4343 zone->name, zone->present_pages, 4343 zone->name, zone->present_pages,
4344 zone_batchsize(zone)); 4344 zone_batchsize(zone));
4345 } 4345 }
4346 4346
4347 int __meminit init_currently_empty_zone(struct zone *zone, 4347 int __meminit init_currently_empty_zone(struct zone *zone,
4348 unsigned long zone_start_pfn, 4348 unsigned long zone_start_pfn,
4349 unsigned long size, 4349 unsigned long size,
4350 enum memmap_context context) 4350 enum memmap_context context)
4351 { 4351 {
4352 struct pglist_data *pgdat = zone->zone_pgdat; 4352 struct pglist_data *pgdat = zone->zone_pgdat;
4353 int ret; 4353 int ret;
4354 ret = zone_wait_table_init(zone, size); 4354 ret = zone_wait_table_init(zone, size);
4355 if (ret) 4355 if (ret)
4356 return ret; 4356 return ret;
4357 pgdat->nr_zones = zone_idx(zone) + 1; 4357 pgdat->nr_zones = zone_idx(zone) + 1;
4358 4358
4359 zone->zone_start_pfn = zone_start_pfn; 4359 zone->zone_start_pfn = zone_start_pfn;
4360 4360
4361 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4361 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4362 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4362 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4363 pgdat->node_id, 4363 pgdat->node_id,
4364 (unsigned long)zone_idx(zone), 4364 (unsigned long)zone_idx(zone),
4365 zone_start_pfn, (zone_start_pfn + size)); 4365 zone_start_pfn, (zone_start_pfn + size));
4366 4366
4367 zone_init_free_lists(zone); 4367 zone_init_free_lists(zone);
4368 4368
4369 return 0; 4369 return 0;
4370 } 4370 }
4371 4371
4372 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4372 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4373 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4373 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4374 /* 4374 /*
4375 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4375 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4376 * Architectures may implement their own version but if add_active_range() 4376 * Architectures may implement their own version but if add_active_range()
4377 * was used and there are no special requirements, this is a convenient 4377 * was used and there are no special requirements, this is a convenient
4378 * alternative 4378 * alternative
4379 */ 4379 */
4380 int __meminit __early_pfn_to_nid(unsigned long pfn) 4380 int __meminit __early_pfn_to_nid(unsigned long pfn)
4381 { 4381 {
4382 unsigned long start_pfn, end_pfn; 4382 unsigned long start_pfn, end_pfn;
4383 int nid; 4383 int nid;
4384 /* 4384 /*
4385 * NOTE: The following SMP-unsafe globals are only used early in boot 4385 * NOTE: The following SMP-unsafe globals are only used early in boot
4386 * when the kernel is running single-threaded. 4386 * when the kernel is running single-threaded.
4387 */ 4387 */
4388 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4388 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4389 static int __meminitdata last_nid; 4389 static int __meminitdata last_nid;
4390 4390
4391 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4391 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4392 return last_nid; 4392 return last_nid;
4393 4393
4394 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4394 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4395 if (nid != -1) { 4395 if (nid != -1) {
4396 last_start_pfn = start_pfn; 4396 last_start_pfn = start_pfn;
4397 last_end_pfn = end_pfn; 4397 last_end_pfn = end_pfn;
4398 last_nid = nid; 4398 last_nid = nid;
4399 } 4399 }
4400 4400
4401 return nid; 4401 return nid;
4402 } 4402 }
4403 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4403 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
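Editor's note: the last_start_pfn/last_end_pfn/last_nid trio above is a single-entry range cache: consecutive queries for neighbouring pfns skip the memblock search entirely. A generic userspace sketch of the same pattern, with a made-up lookup table standing in for memblock_search_pfn_nid():

#include <stdio.h>

/* hypothetical backing data: pfn ranges and the node that owns them */
struct range { unsigned long start, end; int nid; };
static const struct range ranges[] = {
        { 0,      0x8000,  0 },
        { 0x8000, 0x10000, 1 },
};

static int slow_lookups;

static int slow_pfn_to_nid(unsigned long pfn,
                           unsigned long *start, unsigned long *end)
{
        slow_lookups++;
        for (unsigned i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
                if (pfn >= ranges[i].start && pfn < ranges[i].end) {
                        *start = ranges[i].start;
                        *end = ranges[i].end;
                        return ranges[i].nid;
                }
        return -1;
}

static int cached_pfn_to_nid(unsigned long pfn)
{
        static unsigned long last_start, last_end;  /* empty cache: 0..0 */
        static int last_nid = -1;

        if (last_start <= pfn && pfn < last_end)
                return last_nid;                    /* cache hit */

        last_nid = slow_pfn_to_nid(pfn, &last_start, &last_end);
        return last_nid;
}

int main(void)
{
        /* mostly-sequential queries, as during early memmap init */
        for (unsigned long pfn = 0; pfn < 0x10000; pfn += 0x100)
                cached_pfn_to_nid(pfn);
        printf("256 queries, %d slow lookups\n", slow_lookups);
        return 0;       /* only 2 slow lookups */
}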
4404 4404
4405 int __meminit early_pfn_to_nid(unsigned long pfn) 4405 int __meminit early_pfn_to_nid(unsigned long pfn)
4406 { 4406 {
4407 int nid; 4407 int nid;
4408 4408
4409 nid = __early_pfn_to_nid(pfn); 4409 nid = __early_pfn_to_nid(pfn);
4410 if (nid >= 0) 4410 if (nid >= 0)
4411 return nid; 4411 return nid;
4412 /* just returns 0 */ 4412 /* just returns 0 */
4413 return 0; 4413 return 0;
4414 } 4414 }
4415 4415
4416 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4416 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4417 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4417 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4418 { 4418 {
4419 int nid; 4419 int nid;
4420 4420
4421 nid = __early_pfn_to_nid(pfn); 4421 nid = __early_pfn_to_nid(pfn);
4422 if (nid >= 0 && nid != node) 4422 if (nid >= 0 && nid != node)
4423 return false; 4423 return false;
4424 return true; 4424 return true;
4425 } 4425 }
4426 #endif 4426 #endif
4427 4427
4428 /** 4428 /**
4429 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4429 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4430 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4430 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4431 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4431 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4432 * 4432 *
4433 * If an architecture guarantees that all ranges registered with 4433 * If an architecture guarantees that all ranges registered with
4434 * add_active_ranges() contain no holes and may be freed, this 4434 * add_active_ranges() contain no holes and may be freed, this
4435 * function may be used instead of calling free_bootmem() manually. 4435 * function may be used instead of calling free_bootmem() manually.
4436 */ 4436 */
4437 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4437 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4438 { 4438 {
4439 unsigned long start_pfn, end_pfn; 4439 unsigned long start_pfn, end_pfn;
4440 int i, this_nid; 4440 int i, this_nid;
4441 4441
4442 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4442 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4443 start_pfn = min(start_pfn, max_low_pfn); 4443 start_pfn = min(start_pfn, max_low_pfn);
4444 end_pfn = min(end_pfn, max_low_pfn); 4444 end_pfn = min(end_pfn, max_low_pfn);
4445 4445
4446 if (start_pfn < end_pfn) 4446 if (start_pfn < end_pfn)
4447 free_bootmem_node(NODE_DATA(this_nid), 4447 free_bootmem_node(NODE_DATA(this_nid),
4448 PFN_PHYS(start_pfn), 4448 PFN_PHYS(start_pfn),
4449 (end_pfn - start_pfn) << PAGE_SHIFT); 4449 (end_pfn - start_pfn) << PAGE_SHIFT);
4450 } 4450 }
4451 } 4451 }
4452 4452
4453 /** 4453 /**
4454 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4454 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4455 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4455 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4456 * 4456 *
4457 * If an architecture guarantees that all ranges registered with 4457 * If an architecture guarantees that all ranges registered with
4458 * add_active_ranges() contain no holes and may be freed, this 4458 * add_active_ranges() contain no holes and may be freed, this
4459 * function may be used instead of calling memory_present() manually. 4459 * function may be used instead of calling memory_present() manually.
4460 */ 4460 */
4461 void __init sparse_memory_present_with_active_regions(int nid) 4461 void __init sparse_memory_present_with_active_regions(int nid)
4462 { 4462 {
4463 unsigned long start_pfn, end_pfn; 4463 unsigned long start_pfn, end_pfn;
4464 int i, this_nid; 4464 int i, this_nid;
4465 4465
4466 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4466 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4467 memory_present(this_nid, start_pfn, end_pfn); 4467 memory_present(this_nid, start_pfn, end_pfn);
4468 } 4468 }
4469 4469
4470 /** 4470 /**
4471 * get_pfn_range_for_nid - Return the start and end page frames for a node 4471 * get_pfn_range_for_nid - Return the start and end page frames for a node
4472 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4472 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4473 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4473 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4474 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4474 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4475 * 4475 *
4476 * It returns the start and end page frame of a node based on information 4476 * It returns the start and end page frame of a node based on information
4477 * provided by an arch calling add_active_range(). If called for a node 4477 * provided by an arch calling add_active_range(). If called for a node
4478 * with no available memory, a warning is printed and the start and end 4478 * with no available memory, a warning is printed and the start and end
4479 * PFNs will be 0. 4479 * PFNs will be 0.
4480 */ 4480 */
4481 void __meminit get_pfn_range_for_nid(unsigned int nid, 4481 void __meminit get_pfn_range_for_nid(unsigned int nid,
4482 unsigned long *start_pfn, unsigned long *end_pfn) 4482 unsigned long *start_pfn, unsigned long *end_pfn)
4483 { 4483 {
4484 unsigned long this_start_pfn, this_end_pfn; 4484 unsigned long this_start_pfn, this_end_pfn;
4485 int i; 4485 int i;
4486 4486
4487 *start_pfn = -1UL; 4487 *start_pfn = -1UL;
4488 *end_pfn = 0; 4488 *end_pfn = 0;
4489 4489
4490 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4490 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4491 *start_pfn = min(*start_pfn, this_start_pfn); 4491 *start_pfn = min(*start_pfn, this_start_pfn);
4492 *end_pfn = max(*end_pfn, this_end_pfn); 4492 *end_pfn = max(*end_pfn, this_end_pfn);
4493 } 4493 }
4494 4494
4495 if (*start_pfn == -1UL) 4495 if (*start_pfn == -1UL)
4496 *start_pfn = 0; 4496 *start_pfn = 0;
4497 } 4497 }
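Editor's note: get_pfn_range_for_nid() is a min/max fold over the registered ranges, with -1UL as the "nothing seen yet" sentinel. A short example with two made-up ranges shows that holes between ranges are included in the reported span:

#include <stdio.h>

int main(void)
{
        /* hypothetical active ranges for one node */
        unsigned long starts[] = { 0x1000, 0x9000 };
        unsigned long ends[]   = { 0x5000, 0xc000 };
        unsigned long start_pfn = -1UL, end_pfn = 0;

        for (int i = 0; i < 2; i++) {
                start_pfn = starts[i] < start_pfn ? starts[i] : start_pfn;
                end_pfn   = ends[i] > end_pfn ? ends[i] : end_pfn;
        }
        if (start_pfn == -1UL)          /* node had no memory at all */
                start_pfn = 0;

        printf("node spans pfns 0x%lx -> 0x%lx\n", start_pfn, end_pfn);
        return 0;       /* 0x1000 -> 0xc000, holes included */
}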
4498 4498
4499 /* 4499 /*
4500 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4500 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4501 * assumption is made that zones within a node are ordered in monotonic 4501 * assumption is made that zones within a node are ordered in monotonic
4502 * increasing memory addresses so that the "highest" populated zone is used 4502 * increasing memory addresses so that the "highest" populated zone is used
4503 */ 4503 */
4504 static void __init find_usable_zone_for_movable(void) 4504 static void __init find_usable_zone_for_movable(void)
4505 { 4505 {
4506 int zone_index; 4506 int zone_index;
4507 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4507 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4508 if (zone_index == ZONE_MOVABLE) 4508 if (zone_index == ZONE_MOVABLE)
4509 continue; 4509 continue;
4510 4510
4511 if (arch_zone_highest_possible_pfn[zone_index] > 4511 if (arch_zone_highest_possible_pfn[zone_index] >
4512 arch_zone_lowest_possible_pfn[zone_index]) 4512 arch_zone_lowest_possible_pfn[zone_index])
4513 break; 4513 break;
4514 } 4514 }
4515 4515
4516 VM_BUG_ON(zone_index == -1); 4516 VM_BUG_ON(zone_index == -1);
4517 movable_zone = zone_index; 4517 movable_zone = zone_index;
4518 } 4518 }
4519 4519
4520 /* 4520 /*
4521 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4521 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4522 * because it is sized independently of the architecture. Unlike the other zones, 4522 * because it is sized independently of the architecture. Unlike the other zones,
4523 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4523 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4524 * in each node depending on the size of each node and how evenly kernelcore 4524 * in each node depending on the size of each node and how evenly kernelcore
4525 * is distributed. This helper function adjusts the zone ranges 4525 * is distributed. This helper function adjusts the zone ranges
4526 * provided by the architecture for a given node by using the end of the 4526 * provided by the architecture for a given node by using the end of the
4527 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4527 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4528 * zones within a node are in order of monotonically increasing memory addresses 4528 * zones within a node are in order of monotonically increasing memory addresses
4529 */ 4529 */
4530 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4530 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4531 unsigned long zone_type, 4531 unsigned long zone_type,
4532 unsigned long node_start_pfn, 4532 unsigned long node_start_pfn,
4533 unsigned long node_end_pfn, 4533 unsigned long node_end_pfn,
4534 unsigned long *zone_start_pfn, 4534 unsigned long *zone_start_pfn,
4535 unsigned long *zone_end_pfn) 4535 unsigned long *zone_end_pfn)
4536 { 4536 {
4537 /* Only adjust if ZONE_MOVABLE is on this node */ 4537 /* Only adjust if ZONE_MOVABLE is on this node */
4538 if (zone_movable_pfn[nid]) { 4538 if (zone_movable_pfn[nid]) {
4539 /* Size ZONE_MOVABLE */ 4539 /* Size ZONE_MOVABLE */
4540 if (zone_type == ZONE_MOVABLE) { 4540 if (zone_type == ZONE_MOVABLE) {
4541 *zone_start_pfn = zone_movable_pfn[nid]; 4541 *zone_start_pfn = zone_movable_pfn[nid];
4542 *zone_end_pfn = min(node_end_pfn, 4542 *zone_end_pfn = min(node_end_pfn,
4543 arch_zone_highest_possible_pfn[movable_zone]); 4543 arch_zone_highest_possible_pfn[movable_zone]);
4544 4544
4545 /* Adjust for ZONE_MOVABLE starting within this range */ 4545 /* Adjust for ZONE_MOVABLE starting within this range */
4546 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4546 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4547 *zone_end_pfn > zone_movable_pfn[nid]) { 4547 *zone_end_pfn > zone_movable_pfn[nid]) {
4548 *zone_end_pfn = zone_movable_pfn[nid]; 4548 *zone_end_pfn = zone_movable_pfn[nid];
4549 4549
4550 /* Check if this whole range is within ZONE_MOVABLE */ 4550 /* Check if this whole range is within ZONE_MOVABLE */
4551 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4551 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4552 *zone_start_pfn = *zone_end_pfn; 4552 *zone_start_pfn = *zone_end_pfn;
4553 } 4553 }
4554 } 4554 }
4555 4555
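/*
 * Illustrative walk-through of the three cases above, using hypothetical
 * numbers (not taken from any particular machine): suppose a node spans
 * PFNs 0x100000-0x200000 and zone_movable_pfn[nid] == 0x180000.
 *  - ZONE_MOVABLE itself gets [0x180000, min(0x200000, highest usable PFN)).
 *  - A zone whose adjusted range was [0x100000, 0x1c0000) straddles the
 *    boundary, so its end is clipped to 0x180000.
 *  - A zone starting at or above 0x180000 ends up empty because its start
 *    is set equal to its end.
 */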
4556 /* 4556 /*
4557 * Return the number of pages a zone spans in a node, including holes 4557 * Return the number of pages a zone spans in a node, including holes
4558 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4558 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4559 */ 4559 */
4560 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4560 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4561 unsigned long zone_type, 4561 unsigned long zone_type,
4562 unsigned long node_start_pfn, 4562 unsigned long node_start_pfn,
4563 unsigned long node_end_pfn, 4563 unsigned long node_end_pfn,
4564 unsigned long *ignored) 4564 unsigned long *ignored)
4565 { 4565 {
4566 unsigned long zone_start_pfn, zone_end_pfn; 4566 unsigned long zone_start_pfn, zone_end_pfn;
4567 4567
4568 /* Get the start and end of the zone */ 4568 /* Get the start and end of the zone */
4569 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4569 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4570 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4570 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4571 adjust_zone_range_for_zone_movable(nid, zone_type, 4571 adjust_zone_range_for_zone_movable(nid, zone_type,
4572 node_start_pfn, node_end_pfn, 4572 node_start_pfn, node_end_pfn,
4573 &zone_start_pfn, &zone_end_pfn); 4573 &zone_start_pfn, &zone_end_pfn);
4574 4574
4575 /* Check that this node has pages within the zone's required range */ 4575 /* Check that this node has pages within the zone's required range */
4576 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4576 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4577 return 0; 4577 return 0;
4578 4578
4579 /* Move the zone boundaries inside the node if necessary */ 4579 /* Move the zone boundaries inside the node if necessary */
4580 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4580 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4581 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4581 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4582 4582
4583 /* Return the spanned pages */ 4583 /* Return the spanned pages */
4584 return zone_end_pfn - zone_start_pfn; 4584 return zone_end_pfn - zone_start_pfn;
4585 } 4585 }
4586 4586
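/*
 * Rough example with made-up numbers: if the architecture says a zone may
 * cover PFNs [0x100000, 0x1000000) and this node spans [0x180000, 0x200000),
 * the zone boundaries are clipped to the node, giving
 * 0x200000 - 0x180000 = 0x80000 spanned pages (holes included).  A zone
 * whose architectural range lies entirely outside the node returns 0.
 */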
4587 /* 4587 /*
4588 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4588 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4589 * then all holes in the requested range will be accounted for. 4589 * then all holes in the requested range will be accounted for.
4590 */ 4590 */
4591 unsigned long __meminit __absent_pages_in_range(int nid, 4591 unsigned long __meminit __absent_pages_in_range(int nid,
4592 unsigned long range_start_pfn, 4592 unsigned long range_start_pfn,
4593 unsigned long range_end_pfn) 4593 unsigned long range_end_pfn)
4594 { 4594 {
4595 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4595 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4596 unsigned long start_pfn, end_pfn; 4596 unsigned long start_pfn, end_pfn;
4597 int i; 4597 int i;
4598 4598
4599 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4599 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4600 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4600 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4601 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4601 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4602 nr_absent -= end_pfn - start_pfn; 4602 nr_absent -= end_pfn - start_pfn;
4603 } 4603 }
4604 return nr_absent; 4604 return nr_absent;
4605 } 4605 }
4606 4606
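/*
 * Example of the accounting above (hypothetical memblock layout): for the
 * range [0x1000, 0x5000) with registered memory at [0x1000, 0x2000) and
 * [0x3000, 0x5000), nr_absent starts at 0x4000, loses 0x1000 and then
 * 0x2000 for the present regions, and 0x1000 pages of hole are reported.
 */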
4607 /** 4607 /**
4608 * absent_pages_in_range - Return number of page frames in holes within a range 4608 * absent_pages_in_range - Return number of page frames in holes within a range
4609 * @start_pfn: The start PFN to start searching for holes 4609 * @start_pfn: The start PFN to start searching for holes
4610 * @end_pfn: The end PFN to stop searching for holes 4610 * @end_pfn: The end PFN to stop searching for holes
4611 * 4611 *
4612 * It returns the number of page frames in memory holes within a range. 4612 * It returns the number of page frames in memory holes within a range.
4613 */ 4613 */
4614 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4614 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4615 unsigned long end_pfn) 4615 unsigned long end_pfn)
4616 { 4616 {
4617 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4617 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4618 } 4618 }
4619 4619
4620 /* Return the number of page frames in holes in a zone on a node */ 4620 /* Return the number of page frames in holes in a zone on a node */
4621 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4621 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4622 unsigned long zone_type, 4622 unsigned long zone_type,
4623 unsigned long node_start_pfn, 4623 unsigned long node_start_pfn,
4624 unsigned long node_end_pfn, 4624 unsigned long node_end_pfn,
4625 unsigned long *ignored) 4625 unsigned long *ignored)
4626 { 4626 {
4627 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4627 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4628 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4628 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4629 unsigned long zone_start_pfn, zone_end_pfn; 4629 unsigned long zone_start_pfn, zone_end_pfn;
4630 4630
4631 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4631 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4632 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4632 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4633 4633
4634 adjust_zone_range_for_zone_movable(nid, zone_type, 4634 adjust_zone_range_for_zone_movable(nid, zone_type,
4635 node_start_pfn, node_end_pfn, 4635 node_start_pfn, node_end_pfn,
4636 &zone_start_pfn, &zone_end_pfn); 4636 &zone_start_pfn, &zone_end_pfn);
4637 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4637 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4638 } 4638 }
4639 4639
4640 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4640 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4641 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4641 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4642 unsigned long zone_type, 4642 unsigned long zone_type,
4643 unsigned long node_start_pfn, 4643 unsigned long node_start_pfn,
4644 unsigned long node_end_pfn, 4644 unsigned long node_end_pfn,
4645 unsigned long *zones_size) 4645 unsigned long *zones_size)
4646 { 4646 {
4647 return zones_size[zone_type]; 4647 return zones_size[zone_type];
4648 } 4648 }
4649 4649
4650 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4650 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4651 unsigned long zone_type, 4651 unsigned long zone_type,
4652 unsigned long node_start_pfn, 4652 unsigned long node_start_pfn,
4653 unsigned long node_end_pfn, 4653 unsigned long node_end_pfn,
4654 unsigned long *zholes_size) 4654 unsigned long *zholes_size)
4655 { 4655 {
4656 if (!zholes_size) 4656 if (!zholes_size)
4657 return 0; 4657 return 0;
4658 4658
4659 return zholes_size[zone_type]; 4659 return zholes_size[zone_type];
4660 } 4660 }
4661 4661
4662 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4662 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4663 4663
4664 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4664 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4665 unsigned long node_start_pfn, 4665 unsigned long node_start_pfn,
4666 unsigned long node_end_pfn, 4666 unsigned long node_end_pfn,
4667 unsigned long *zones_size, 4667 unsigned long *zones_size,
4668 unsigned long *zholes_size) 4668 unsigned long *zholes_size)
4669 { 4669 {
4670 unsigned long realtotalpages, totalpages = 0; 4670 unsigned long realtotalpages, totalpages = 0;
4671 enum zone_type i; 4671 enum zone_type i;
4672 4672
4673 for (i = 0; i < MAX_NR_ZONES; i++) 4673 for (i = 0; i < MAX_NR_ZONES; i++)
4674 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4674 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4675 node_start_pfn, 4675 node_start_pfn,
4676 node_end_pfn, 4676 node_end_pfn,
4677 zones_size); 4677 zones_size);
4678 pgdat->node_spanned_pages = totalpages; 4678 pgdat->node_spanned_pages = totalpages;
4679 4679
4680 realtotalpages = totalpages; 4680 realtotalpages = totalpages;
4681 for (i = 0; i < MAX_NR_ZONES; i++) 4681 for (i = 0; i < MAX_NR_ZONES; i++)
4682 realtotalpages -= 4682 realtotalpages -=
4683 zone_absent_pages_in_node(pgdat->node_id, i, 4683 zone_absent_pages_in_node(pgdat->node_id, i,
4684 node_start_pfn, node_end_pfn, 4684 node_start_pfn, node_end_pfn,
4685 zholes_size); 4685 zholes_size);
4686 pgdat->node_present_pages = realtotalpages; 4686 pgdat->node_present_pages = realtotalpages;
4687 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4687 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4688 realtotalpages); 4688 realtotalpages);
4689 } 4689 }
4690 4690
4691 #ifndef CONFIG_SPARSEMEM 4691 #ifndef CONFIG_SPARSEMEM
4692 /* 4692 /*
4693 * Calculate the size of the zone->blockflags rounded to an unsigned long 4693 * Calculate the size of the zone->blockflags rounded to an unsigned long
4694 * Start by rounding zonesize up so it is a multiple of pageblock_nr_pages. 4694 * Start by rounding zonesize up so it is a multiple of pageblock_nr_pages.
4695 * Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, round what is now 4695 * Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, round what is now
4696 * in bits up to the nearest long, and finally return the result in 4696 * in bits up to the nearest long, and finally return the result in
4697 * bytes. 4697 * bytes.
4698 */ 4698 */
4699 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4699 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4700 { 4700 {
4701 unsigned long usemapsize; 4701 unsigned long usemapsize;
4702 4702
4703 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4703 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4704 usemapsize = roundup(zonesize, pageblock_nr_pages); 4704 usemapsize = roundup(zonesize, pageblock_nr_pages);
4705 usemapsize = usemapsize >> pageblock_order; 4705 usemapsize = usemapsize >> pageblock_order;
4706 usemapsize *= NR_PAGEBLOCK_BITS; 4706 usemapsize *= NR_PAGEBLOCK_BITS;
4707 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4707 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4708 4708
4709 return usemapsize / 8; 4709 return usemapsize / 8;
4710 } 4710 }
4711 4711
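/*
 * Worked example, assuming the common values pageblock_order == 9
 * (pageblock_nr_pages == 512) and NR_PAGEBLOCK_BITS == 4: a 4GiB zone of
 * 1048576 4KiB pages that starts on a pageblock boundary has
 * 1048576 / 512 = 2048 pageblocks, needing 2048 * 4 = 8192 bits.  8192 is
 * already a multiple of BITS_PER_LONG, so 8192 / 8 = 1024 bytes are
 * allocated for zone->pageblock_flags.
 */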
4712 static void __init setup_usemap(struct pglist_data *pgdat, 4712 static void __init setup_usemap(struct pglist_data *pgdat,
4713 struct zone *zone, 4713 struct zone *zone,
4714 unsigned long zone_start_pfn, 4714 unsigned long zone_start_pfn,
4715 unsigned long zonesize) 4715 unsigned long zonesize)
4716 { 4716 {
4717 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4717 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4718 zone->pageblock_flags = NULL; 4718 zone->pageblock_flags = NULL;
4719 if (usemapsize) 4719 if (usemapsize)
4720 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4720 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4721 usemapsize); 4721 usemapsize);
4722 } 4722 }
4723 #else 4723 #else
4724 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4724 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4725 unsigned long zone_start_pfn, unsigned long zonesize) {} 4725 unsigned long zone_start_pfn, unsigned long zonesize) {}
4726 #endif /* CONFIG_SPARSEMEM */ 4726 #endif /* CONFIG_SPARSEMEM */
4727 4727
4728 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4728 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4729 4729
4730 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4730 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4731 void __paginginit set_pageblock_order(void) 4731 void __paginginit set_pageblock_order(void)
4732 { 4732 {
4733 unsigned int order; 4733 unsigned int order;
4734 4734
4735 /* Check that pageblock_order has not already been set up */ 4735 /* Check that pageblock_order has not already been set up */
4736 if (pageblock_order) 4736 if (pageblock_order)
4737 return; 4737 return;
4738 4738
4739 if (HPAGE_SHIFT > PAGE_SHIFT) 4739 if (HPAGE_SHIFT > PAGE_SHIFT)
4740 order = HUGETLB_PAGE_ORDER; 4740 order = HUGETLB_PAGE_ORDER;
4741 else 4741 else
4742 order = MAX_ORDER - 1; 4742 order = MAX_ORDER - 1;
4743 4743
4744 /* 4744 /*
4745 * Assume the largest contiguous order of interest is a huge page. 4745 * Assume the largest contiguous order of interest is a huge page.
4746 * This value may be variable depending on boot parameters on IA64 and 4746 * This value may be variable depending on boot parameters on IA64 and
4747 * powerpc. 4747 * powerpc.
4748 */ 4748 */
4749 pageblock_order = order; 4749 pageblock_order = order;
4750 } 4750 }
4751 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4751 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4752 4752
4753 /* 4753 /*
4754 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4754 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4755 * is unused as pageblock_order is set at compile-time. See 4755 * is unused as pageblock_order is set at compile-time. See
4756 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4756 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4757 * the kernel config 4757 * the kernel config
4758 */ 4758 */
4759 void __paginginit set_pageblock_order(void) 4759 void __paginginit set_pageblock_order(void)
4760 { 4760 {
4761 } 4761 }
4762 4762
4763 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4763 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4764 4764
4765 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4765 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4766 unsigned long present_pages) 4766 unsigned long present_pages)
4767 { 4767 {
4768 unsigned long pages = spanned_pages; 4768 unsigned long pages = spanned_pages;
4769 4769
4770 /* 4770 /*
4771 * Provide a more accurate estimation if there are holes within 4771 * Provide a more accurate estimation if there are holes within
4772 * the zone and SPARSEMEM is in use. If there are holes within the 4772 * the zone and SPARSEMEM is in use. If there are holes within the
4773 * zone, each populated memory region may cost us one or two extra 4773 * zone, each populated memory region may cost us one or two extra
4774 * memmap pages due to alignment because the memmap pages for each 4774 * memmap pages due to alignment because the memmap pages for each
4775 * populated region may not be naturally aligned on a page boundary. 4775 * populated region may not be naturally aligned on a page boundary.
4776 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4776 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4777 */ 4777 */
4778 if (spanned_pages > present_pages + (present_pages >> 4) && 4778 if (spanned_pages > present_pages + (present_pages >> 4) &&
4779 IS_ENABLED(CONFIG_SPARSEMEM)) 4779 IS_ENABLED(CONFIG_SPARSEMEM))
4780 pages = present_pages; 4780 pages = present_pages;
4781 4781
4782 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4782 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4783 } 4783 }
4784 4784
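/*
 * Worked example with round numbers (assuming 4KiB pages and a 64-byte
 * struct page, which are typical but configuration dependent): a sparse
 * zone spanning 1,000,000 pages with only 900,000 present exceeds the
 * present + present/16 threshold, so the estimate is based on the present
 * pages: PAGE_ALIGN(900,000 * 64) >> PAGE_SHIFT == 14,063 memmap pages.
 */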
4785 /* 4785 /*
4786 * Set up the zone data structures: 4786 * Set up the zone data structures:
4787 * - mark all pages reserved 4787 * - mark all pages reserved
4788 * - mark all memory queues empty 4788 * - mark all memory queues empty
4789 * - clear the memory bitmaps 4789 * - clear the memory bitmaps
4790 * 4790 *
4791 * NOTE: pgdat should get zeroed by caller. 4791 * NOTE: pgdat should get zeroed by caller.
4792 */ 4792 */
4793 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4793 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4794 unsigned long node_start_pfn, unsigned long node_end_pfn, 4794 unsigned long node_start_pfn, unsigned long node_end_pfn,
4795 unsigned long *zones_size, unsigned long *zholes_size) 4795 unsigned long *zones_size, unsigned long *zholes_size)
4796 { 4796 {
4797 enum zone_type j; 4797 enum zone_type j;
4798 int nid = pgdat->node_id; 4798 int nid = pgdat->node_id;
4799 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4799 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4800 int ret; 4800 int ret;
4801 4801
4802 pgdat_resize_init(pgdat); 4802 pgdat_resize_init(pgdat);
4803 #ifdef CONFIG_NUMA_BALANCING 4803 #ifdef CONFIG_NUMA_BALANCING
4804 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4804 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4805 pgdat->numabalancing_migrate_nr_pages = 0; 4805 pgdat->numabalancing_migrate_nr_pages = 0;
4806 pgdat->numabalancing_migrate_next_window = jiffies; 4806 pgdat->numabalancing_migrate_next_window = jiffies;
4807 #endif 4807 #endif
4808 init_waitqueue_head(&pgdat->kswapd_wait); 4808 init_waitqueue_head(&pgdat->kswapd_wait);
4809 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4809 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4810 pgdat_page_cgroup_init(pgdat); 4810 pgdat_page_cgroup_init(pgdat);
4811 4811
4812 for (j = 0; j < MAX_NR_ZONES; j++) { 4812 for (j = 0; j < MAX_NR_ZONES; j++) {
4813 struct zone *zone = pgdat->node_zones + j; 4813 struct zone *zone = pgdat->node_zones + j;
4814 unsigned long size, realsize, freesize, memmap_pages; 4814 unsigned long size, realsize, freesize, memmap_pages;
4815 4815
4816 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4816 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4817 node_end_pfn, zones_size); 4817 node_end_pfn, zones_size);
4818 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4818 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4819 node_start_pfn, 4819 node_start_pfn,
4820 node_end_pfn, 4820 node_end_pfn,
4821 zholes_size); 4821 zholes_size);
4822 4822
4823 /* 4823 /*
4824 * Adjust freesize so that it accounts for how much memory 4824 * Adjust freesize so that it accounts for how much memory
4825 * is used by this zone for memmap. This affects the watermark 4825 * is used by this zone for memmap. This affects the watermark
4826 * and per-cpu initialisations 4826 * and per-cpu initialisations
4827 */ 4827 */
4828 memmap_pages = calc_memmap_size(size, realsize); 4828 memmap_pages = calc_memmap_size(size, realsize);
4829 if (freesize >= memmap_pages) { 4829 if (freesize >= memmap_pages) {
4830 freesize -= memmap_pages; 4830 freesize -= memmap_pages;
4831 if (memmap_pages) 4831 if (memmap_pages)
4832 printk(KERN_DEBUG 4832 printk(KERN_DEBUG
4833 " %s zone: %lu pages used for memmap\n", 4833 " %s zone: %lu pages used for memmap\n",
4834 zone_names[j], memmap_pages); 4834 zone_names[j], memmap_pages);
4835 } else 4835 } else
4836 printk(KERN_WARNING 4836 printk(KERN_WARNING
4837 " %s zone: %lu pages exceeds freesize %lu\n", 4837 " %s zone: %lu pages exceeds freesize %lu\n",
4838 zone_names[j], memmap_pages, freesize); 4838 zone_names[j], memmap_pages, freesize);
4839 4839
4840 /* Account for reserved pages */ 4840 /* Account for reserved pages */
4841 if (j == 0 && freesize > dma_reserve) { 4841 if (j == 0 && freesize > dma_reserve) {
4842 freesize -= dma_reserve; 4842 freesize -= dma_reserve;
4843 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4843 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4844 zone_names[0], dma_reserve); 4844 zone_names[0], dma_reserve);
4845 } 4845 }
4846 4846
4847 if (!is_highmem_idx(j)) 4847 if (!is_highmem_idx(j))
4848 nr_kernel_pages += freesize; 4848 nr_kernel_pages += freesize;
4849 /* Charge for highmem memmap if there are enough kernel pages */ 4849 /* Charge for highmem memmap if there are enough kernel pages */
4850 else if (nr_kernel_pages > memmap_pages * 2) 4850 else if (nr_kernel_pages > memmap_pages * 2)
4851 nr_kernel_pages -= memmap_pages; 4851 nr_kernel_pages -= memmap_pages;
4852 nr_all_pages += freesize; 4852 nr_all_pages += freesize;
4853 4853
4854 zone->spanned_pages = size; 4854 zone->spanned_pages = size;
4855 zone->present_pages = realsize; 4855 zone->present_pages = realsize;
4856 /* 4856 /*
4857 * Set an approximate value for lowmem here; it will be adjusted 4857 * Set an approximate value for lowmem here; it will be adjusted
4858 * when the bootmem allocator frees pages into the buddy system. 4858 * when the bootmem allocator frees pages into the buddy system.
4859 * And all highmem pages will be managed by the buddy system. 4859 * And all highmem pages will be managed by the buddy system.
4860 */ 4860 */
4861 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4861 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4862 #ifdef CONFIG_NUMA 4862 #ifdef CONFIG_NUMA
4863 zone->node = nid; 4863 zone->node = nid;
4864 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4864 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4865 / 100; 4865 / 100;
4866 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4866 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4867 #endif 4867 #endif
4868 zone->name = zone_names[j]; 4868 zone->name = zone_names[j];
4869 spin_lock_init(&zone->lock); 4869 spin_lock_init(&zone->lock);
4870 spin_lock_init(&zone->lru_lock); 4870 spin_lock_init(&zone->lru_lock);
4871 zone_seqlock_init(zone); 4871 zone_seqlock_init(zone);
4872 zone->zone_pgdat = pgdat; 4872 zone->zone_pgdat = pgdat;
4873 zone_pcp_init(zone); 4873 zone_pcp_init(zone);
4874 4874
4875 /* For bootup, initialized properly in watermark setup */ 4875 /* For bootup, initialized properly in watermark setup */
4876 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4876 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4877 4877
4878 lruvec_init(&zone->lruvec); 4878 lruvec_init(&zone->lruvec);
4879 if (!size) 4879 if (!size)
4880 continue; 4880 continue;
4881 4881
4882 set_pageblock_order(); 4882 set_pageblock_order();
4883 setup_usemap(pgdat, zone, zone_start_pfn, size); 4883 setup_usemap(pgdat, zone, zone_start_pfn, size);
4884 ret = init_currently_empty_zone(zone, zone_start_pfn, 4884 ret = init_currently_empty_zone(zone, zone_start_pfn,
4885 size, MEMMAP_EARLY); 4885 size, MEMMAP_EARLY);
4886 BUG_ON(ret); 4886 BUG_ON(ret);
4887 memmap_init(size, nid, j, zone_start_pfn); 4887 memmap_init(size, nid, j, zone_start_pfn);
4888 zone_start_pfn += size; 4888 zone_start_pfn += size;
4889 } 4889 }
4890 } 4890 }
4891 4891
4892 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4892 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4893 { 4893 {
4894 /* Skip empty nodes */ 4894 /* Skip empty nodes */
4895 if (!pgdat->node_spanned_pages) 4895 if (!pgdat->node_spanned_pages)
4896 return; 4896 return;
4897 4897
4898 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4898 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4899 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4899 /* ia64 gets its own node_mem_map, before this, without bootmem */
4900 if (!pgdat->node_mem_map) { 4900 if (!pgdat->node_mem_map) {
4901 unsigned long size, start, end; 4901 unsigned long size, start, end;
4902 struct page *map; 4902 struct page *map;
4903 4903
4904 /* 4904 /*
4905 * The zone's endpoints aren't required to be MAX_ORDER 4905 * The zone's endpoints aren't required to be MAX_ORDER
4906 * aligned, but the node_mem_map endpoints must be MAX_ORDER 4906 * aligned, but the node_mem_map endpoints must be MAX_ORDER
4907 * aligned for the buddy allocator to function correctly. 4907 * aligned for the buddy allocator to function correctly.
4908 */ 4908 */
4909 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4909 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4910 end = pgdat_end_pfn(pgdat); 4910 end = pgdat_end_pfn(pgdat);
4911 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4911 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4912 size = (end - start) * sizeof(struct page); 4912 size = (end - start) * sizeof(struct page);
4913 map = alloc_remap(pgdat->node_id, size); 4913 map = alloc_remap(pgdat->node_id, size);
4914 if (!map) 4914 if (!map)
4915 map = alloc_bootmem_node_nopanic(pgdat, size); 4915 map = alloc_bootmem_node_nopanic(pgdat, size);
4916 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4916 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4917 } 4917 }
4918 #ifndef CONFIG_NEED_MULTIPLE_NODES 4918 #ifndef CONFIG_NEED_MULTIPLE_NODES
4919 /* 4919 /*
4920 * With no DISCONTIG, the global mem_map is just set as node 0's 4920 * With no DISCONTIG, the global mem_map is just set as node 0's
4921 */ 4921 */
4922 if (pgdat == NODE_DATA(0)) { 4922 if (pgdat == NODE_DATA(0)) {
4923 mem_map = NODE_DATA(0)->node_mem_map; 4923 mem_map = NODE_DATA(0)->node_mem_map;
4924 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4924 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4925 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4925 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4926 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4926 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4927 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4927 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4928 } 4928 }
4929 #endif 4929 #endif
4930 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4930 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4931 } 4931 }
4932 4932
4933 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4933 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4934 unsigned long node_start_pfn, unsigned long *zholes_size) 4934 unsigned long node_start_pfn, unsigned long *zholes_size)
4935 { 4935 {
4936 pg_data_t *pgdat = NODE_DATA(nid); 4936 pg_data_t *pgdat = NODE_DATA(nid);
4937 unsigned long start_pfn = 0; 4937 unsigned long start_pfn = 0;
4938 unsigned long end_pfn = 0; 4938 unsigned long end_pfn = 0;
4939 4939
4940 /* pg_data_t should be reset to zero when it's allocated */ 4940 /* pg_data_t should be reset to zero when it's allocated */
4941 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4941 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4942 4942
4943 pgdat->node_id = nid; 4943 pgdat->node_id = nid;
4944 pgdat->node_start_pfn = node_start_pfn; 4944 pgdat->node_start_pfn = node_start_pfn;
4945 if (node_state(nid, N_MEMORY)) 4945 if (node_state(nid, N_MEMORY))
4946 init_zone_allows_reclaim(nid); 4946 init_zone_allows_reclaim(nid);
4947 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4947 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4948 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4948 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4949 #endif 4949 #endif
4950 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4950 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4951 zones_size, zholes_size); 4951 zones_size, zholes_size);
4952 4952
4953 alloc_node_mem_map(pgdat); 4953 alloc_node_mem_map(pgdat);
4954 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4954 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4955 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4955 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4956 nid, (unsigned long)pgdat, 4956 nid, (unsigned long)pgdat,
4957 (unsigned long)pgdat->node_mem_map); 4957 (unsigned long)pgdat->node_mem_map);
4958 #endif 4958 #endif
4959 4959
4960 free_area_init_core(pgdat, start_pfn, end_pfn, 4960 free_area_init_core(pgdat, start_pfn, end_pfn,
4961 zones_size, zholes_size); 4961 zones_size, zholes_size);
4962 } 4962 }
4963 4963
4964 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4964 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4965 4965
4966 #if MAX_NUMNODES > 1 4966 #if MAX_NUMNODES > 1
4967 /* 4967 /*
4968 * Figure out the number of possible node ids. 4968 * Figure out the number of possible node ids.
4969 */ 4969 */
4970 void __init setup_nr_node_ids(void) 4970 void __init setup_nr_node_ids(void)
4971 { 4971 {
4972 unsigned int node; 4972 unsigned int node;
4973 unsigned int highest = 0; 4973 unsigned int highest = 0;
4974 4974
4975 for_each_node_mask(node, node_possible_map) 4975 for_each_node_mask(node, node_possible_map)
4976 highest = node; 4976 highest = node;
4977 nr_node_ids = highest + 1; 4977 nr_node_ids = highest + 1;
4978 } 4978 }
4979 #endif 4979 #endif
4980 4980
4981 /** 4981 /**
4982 * node_map_pfn_alignment - determine the maximum internode alignment 4982 * node_map_pfn_alignment - determine the maximum internode alignment
4983 * 4983 *
4984 * This function should be called after node map is populated and sorted. 4984 * This function should be called after node map is populated and sorted.
4985 * It calculates the maximum power of two alignment which can distinguish 4985 * It calculates the maximum power of two alignment which can distinguish
4986 * all the nodes. 4986 * all the nodes.
4987 * 4987 *
4988 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4988 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4989 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4989 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4990 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4990 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4991 * shifted, 1GiB is enough and this function will indicate so. 4991 * shifted, 1GiB is enough and this function will indicate so.
4992 * 4992 *
4993 * This is used to test whether pfn -> nid mapping of the chosen memory 4993 * This is used to test whether pfn -> nid mapping of the chosen memory
4994 * model has fine enough granularity to avoid incorrect mapping for the 4994 * model has fine enough granularity to avoid incorrect mapping for the
4995 * populated node map. 4995 * populated node map.
4996 * 4996 *
4997 * Returns the determined alignment in pfn's. 0 if there is no alignment 4997 * Returns the determined alignment in pfn's. 0 if there is no alignment
4998 * requirement (single node). 4998 * requirement (single node).
4999 */ 4999 */
5000 unsigned long __init node_map_pfn_alignment(void) 5000 unsigned long __init node_map_pfn_alignment(void)
5001 { 5001 {
5002 unsigned long accl_mask = 0, last_end = 0; 5002 unsigned long accl_mask = 0, last_end = 0;
5003 unsigned long start, end, mask; 5003 unsigned long start, end, mask;
5004 int last_nid = -1; 5004 int last_nid = -1;
5005 int i, nid; 5005 int i, nid;
5006 5006
5007 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 5007 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
5008 if (!start || last_nid < 0 || last_nid == nid) { 5008 if (!start || last_nid < 0 || last_nid == nid) {
5009 last_nid = nid; 5009 last_nid = nid;
5010 last_end = end; 5010 last_end = end;
5011 continue; 5011 continue;
5012 } 5012 }
5013 5013
5014 /* 5014 /*
5015 * Start with a mask granular enough to pin-point to the 5015 * Start with a mask granular enough to pin-point to the
5016 * start pfn and tick off bits one-by-one until it becomes 5016 * start pfn and tick off bits one-by-one until it becomes
5017 * too coarse to separate the current node from the last. 5017 * too coarse to separate the current node from the last.
5018 */ 5018 */
5019 mask = ~((1 << __ffs(start)) - 1); 5019 mask = ~((1 << __ffs(start)) - 1);
5020 while (mask && last_end <= (start & (mask << 1))) 5020 while (mask && last_end <= (start & (mask << 1)))
5021 mask <<= 1; 5021 mask <<= 1;
5022 5022
5023 /* accumulate all internode masks */ 5023 /* accumulate all internode masks */
5024 accl_mask |= mask; 5024 accl_mask |= mask;
5025 } 5025 }
5026 5026
5027 /* convert mask to number of pages */ 5027 /* convert mask to number of pages */
5028 return ~accl_mask + 1; 5028 return ~accl_mask + 1;
5029 } 5029 }
5030 5030
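/*
 * Concrete instance of the example in the comment above (hypothetical
 * layout, 4KiB pages): node 0 covering PFNs [0x0, 0x50000) followed by
 * node 1 at [0x50000, 0x90000) puts the boundary at 1.25GiB.  The lowest
 * set bit of 0x50000 is 0x10000, the while loop cannot widen the mask
 * without swallowing the node 0 range, and the function returns 0x10000
 * PFNs, i.e. 256MiB of required alignment.
 */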
5031 /* Find the lowest pfn for a node */ 5031 /* Find the lowest pfn for a node */
5032 static unsigned long __init find_min_pfn_for_node(int nid) 5032 static unsigned long __init find_min_pfn_for_node(int nid)
5033 { 5033 {
5034 unsigned long min_pfn = ULONG_MAX; 5034 unsigned long min_pfn = ULONG_MAX;
5035 unsigned long start_pfn; 5035 unsigned long start_pfn;
5036 int i; 5036 int i;
5037 5037
5038 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 5038 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
5039 min_pfn = min(min_pfn, start_pfn); 5039 min_pfn = min(min_pfn, start_pfn);
5040 5040
5041 if (min_pfn == ULONG_MAX) { 5041 if (min_pfn == ULONG_MAX) {
5042 printk(KERN_WARNING 5042 printk(KERN_WARNING
5043 "Could not find start_pfn for node %d\n", nid); 5043 "Could not find start_pfn for node %d\n", nid);
5044 return 0; 5044 return 0;
5045 } 5045 }
5046 5046
5047 return min_pfn; 5047 return min_pfn;
5048 } 5048 }
5049 5049
5050 /** 5050 /**
5051 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5051 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5052 * 5052 *
5053 * It returns the minimum PFN based on information provided via 5053 * It returns the minimum PFN based on information provided via
5054 * add_active_range(). 5054 * add_active_range().
5055 */ 5055 */
5056 unsigned long __init find_min_pfn_with_active_regions(void) 5056 unsigned long __init find_min_pfn_with_active_regions(void)
5057 { 5057 {
5058 return find_min_pfn_for_node(MAX_NUMNODES); 5058 return find_min_pfn_for_node(MAX_NUMNODES);
5059 } 5059 }
5060 5060
5061 /* 5061 /*
5062 * early_calculate_totalpages() 5062 * early_calculate_totalpages()
5063 * Sum pages in active regions for movable zone. 5063 * Sum pages in active regions for movable zone.
5064 * Populate N_MEMORY for calculating usable_nodes. 5064 * Populate N_MEMORY for calculating usable_nodes.
5065 */ 5065 */
5066 static unsigned long __init early_calculate_totalpages(void) 5066 static unsigned long __init early_calculate_totalpages(void)
5067 { 5067 {
5068 unsigned long totalpages = 0; 5068 unsigned long totalpages = 0;
5069 unsigned long start_pfn, end_pfn; 5069 unsigned long start_pfn, end_pfn;
5070 int i, nid; 5070 int i, nid;
5071 5071
5072 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5072 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5073 unsigned long pages = end_pfn - start_pfn; 5073 unsigned long pages = end_pfn - start_pfn;
5074 5074
5075 totalpages += pages; 5075 totalpages += pages;
5076 if (pages) 5076 if (pages)
5077 node_set_state(nid, N_MEMORY); 5077 node_set_state(nid, N_MEMORY);
5078 } 5078 }
5079 return totalpages; 5079 return totalpages;
5080 } 5080 }
5081 5081
5082 /* 5082 /*
5083 * Find the PFN the Movable zone begins in each node. Kernel memory 5083 * Find the PFN the Movable zone begins in each node. Kernel memory
5084 * is spread evenly between nodes as long as the nodes have enough 5084 * is spread evenly between nodes as long as the nodes have enough
5085 * memory. When they don't, some nodes will have more kernelcore than 5085 * memory. When they don't, some nodes will have more kernelcore than
5086 * others 5086 * others
5087 */ 5087 */
5088 static void __init find_zone_movable_pfns_for_nodes(void) 5088 static void __init find_zone_movable_pfns_for_nodes(void)
5089 { 5089 {
5090 int i, nid; 5090 int i, nid;
5091 unsigned long usable_startpfn; 5091 unsigned long usable_startpfn;
5092 unsigned long kernelcore_node, kernelcore_remaining; 5092 unsigned long kernelcore_node, kernelcore_remaining;
5093 /* save the state before borrowing the nodemask */ 5093 /* save the state before borrowing the nodemask */
5094 nodemask_t saved_node_state = node_states[N_MEMORY]; 5094 nodemask_t saved_node_state = node_states[N_MEMORY];
5095 unsigned long totalpages = early_calculate_totalpages(); 5095 unsigned long totalpages = early_calculate_totalpages();
5096 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5096 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5097 5097
5098 /* 5098 /*
5099 * If movablecore was specified, calculate the corresponding 5099 * If movablecore was specified, calculate the corresponding
5100 * amount of kernelcore so that memory usable for 5100 * amount of kernelcore so that memory usable for
5101 * any allocation type is evenly spread. If both kernelcore 5101 * any allocation type is evenly spread. If both kernelcore
5102 * and movablecore are specified, then the value of kernelcore 5102 * and movablecore are specified, then the value of kernelcore
5103 * will be used for required_kernelcore if it's greater than 5103 * will be used for required_kernelcore if it's greater than
5104 * what movablecore would have allowed. 5104 * what movablecore would have allowed.
5105 */ 5105 */
5106 if (required_movablecore) { 5106 if (required_movablecore) {
5107 unsigned long corepages; 5107 unsigned long corepages;
5108 5108
5109 /* 5109 /*
5110 * Round-up so that ZONE_MOVABLE is at least as large as what 5110 * Round-up so that ZONE_MOVABLE is at least as large as what
5111 * was requested by the user 5111 * was requested by the user
5112 */ 5112 */
5113 required_movablecore = 5113 required_movablecore =
5114 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5114 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5115 corepages = totalpages - required_movablecore; 5115 corepages = totalpages - required_movablecore;
5116 5116
5117 required_kernelcore = max(required_kernelcore, corepages); 5117 required_kernelcore = max(required_kernelcore, corepages);
5118 } 5118 }
5119 5119
5120 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5120 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5121 if (!required_kernelcore) 5121 if (!required_kernelcore)
5122 goto out; 5122 goto out;
5123 5123
5124 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5124 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5125 find_usable_zone_for_movable(); 5125 find_usable_zone_for_movable();
5126 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5126 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5127 5127
5128 restart: 5128 restart:
5129 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5129 /* Spread kernelcore memory as evenly as possible throughout nodes */
5130 kernelcore_node = required_kernelcore / usable_nodes; 5130 kernelcore_node = required_kernelcore / usable_nodes;
5131 for_each_node_state(nid, N_MEMORY) { 5131 for_each_node_state(nid, N_MEMORY) {
5132 unsigned long start_pfn, end_pfn; 5132 unsigned long start_pfn, end_pfn;
5133 5133
5134 /* 5134 /*
5135 * Recalculate kernelcore_node if the division per node 5135 * Recalculate kernelcore_node if the division per node
5136 * now exceeds what is necessary to satisfy the requested 5136 * now exceeds what is necessary to satisfy the requested
5137 * amount of memory for the kernel 5137 * amount of memory for the kernel
5138 */ 5138 */
5139 if (required_kernelcore < kernelcore_node) 5139 if (required_kernelcore < kernelcore_node)
5140 kernelcore_node = required_kernelcore / usable_nodes; 5140 kernelcore_node = required_kernelcore / usable_nodes;
5141 5141
5142 /* 5142 /*
5143 * As the map is walked, we track how much memory is usable 5143 * As the map is walked, we track how much memory is usable
5144 * by the kernel using kernelcore_remaining. When it is 5144 * by the kernel using kernelcore_remaining. When it is
5145 * 0, the rest of the node is usable by ZONE_MOVABLE 5145 * 0, the rest of the node is usable by ZONE_MOVABLE
5146 */ 5146 */
5147 kernelcore_remaining = kernelcore_node; 5147 kernelcore_remaining = kernelcore_node;
5148 5148
5149 /* Go through each range of PFNs within this node */ 5149 /* Go through each range of PFNs within this node */
5150 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5150 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5151 unsigned long size_pages; 5151 unsigned long size_pages;
5152 5152
5153 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5153 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5154 if (start_pfn >= end_pfn) 5154 if (start_pfn >= end_pfn)
5155 continue; 5155 continue;
5156 5156
5157 /* Account for what is only usable for kernelcore */ 5157 /* Account for what is only usable for kernelcore */
5158 if (start_pfn < usable_startpfn) { 5158 if (start_pfn < usable_startpfn) {
5159 unsigned long kernel_pages; 5159 unsigned long kernel_pages;
5160 kernel_pages = min(end_pfn, usable_startpfn) 5160 kernel_pages = min(end_pfn, usable_startpfn)
5161 - start_pfn; 5161 - start_pfn;
5162 5162
5163 kernelcore_remaining -= min(kernel_pages, 5163 kernelcore_remaining -= min(kernel_pages,
5164 kernelcore_remaining); 5164 kernelcore_remaining);
5165 required_kernelcore -= min(kernel_pages, 5165 required_kernelcore -= min(kernel_pages,
5166 required_kernelcore); 5166 required_kernelcore);
5167 5167
5168 /* Continue if range is now fully accounted */ 5168 /* Continue if range is now fully accounted */
5169 if (end_pfn <= usable_startpfn) { 5169 if (end_pfn <= usable_startpfn) {
5170 5170
5171 /* 5171 /*
5172 * Push zone_movable_pfn to the end so 5172 * Push zone_movable_pfn to the end so
5173 * that if we have to rebalance 5173 * that if we have to rebalance
5174 * kernelcore across nodes, we will 5174 * kernelcore across nodes, we will
5175 * not double account here 5175 * not double account here
5176 */ 5176 */
5177 zone_movable_pfn[nid] = end_pfn; 5177 zone_movable_pfn[nid] = end_pfn;
5178 continue; 5178 continue;
5179 } 5179 }
5180 start_pfn = usable_startpfn; 5180 start_pfn = usable_startpfn;
5181 } 5181 }
5182 5182
5183 /* 5183 /*
5184 * The usable PFN range for ZONE_MOVABLE is from 5184 * The usable PFN range for ZONE_MOVABLE is from
5185 * start_pfn->end_pfn. Calculate size_pages as the 5185 * start_pfn->end_pfn. Calculate size_pages as the
5186 * number of pages used as kernelcore 5186 * number of pages used as kernelcore
5187 */ 5187 */
5188 size_pages = end_pfn - start_pfn; 5188 size_pages = end_pfn - start_pfn;
5189 if (size_pages > kernelcore_remaining) 5189 if (size_pages > kernelcore_remaining)
5190 size_pages = kernelcore_remaining; 5190 size_pages = kernelcore_remaining;
5191 zone_movable_pfn[nid] = start_pfn + size_pages; 5191 zone_movable_pfn[nid] = start_pfn + size_pages;
5192 5192
5193 /* 5193 /*
5194 * Some kernelcore has been met, update counts and 5194 * Some kernelcore has been met, update counts and
5195 * break if the kernelcore for this node has been 5195 * break if the kernelcore for this node has been
5196 * satisfied 5196 * satisfied
5197 */ 5197 */
5198 required_kernelcore -= min(required_kernelcore, 5198 required_kernelcore -= min(required_kernelcore,
5199 size_pages); 5199 size_pages);
5200 kernelcore_remaining -= size_pages; 5200 kernelcore_remaining -= size_pages;
5201 if (!kernelcore_remaining) 5201 if (!kernelcore_remaining)
5202 break; 5202 break;
5203 } 5203 }
5204 } 5204 }
5205 5205
5206 /* 5206 /*
5207 * If there is still required_kernelcore, we do another pass with one 5207 * If there is still required_kernelcore, we do another pass with one
5208 * less node in the count. This will push zone_movable_pfn[nid] further 5208 * less node in the count. This will push zone_movable_pfn[nid] further
5209 * along on the nodes that still have memory until kernelcore is 5209 * along on the nodes that still have memory until kernelcore is
5210 * satisfied 5210 * satisfied
5211 */ 5211 */
5212 usable_nodes--; 5212 usable_nodes--;
5213 if (usable_nodes && required_kernelcore > usable_nodes) 5213 if (usable_nodes && required_kernelcore > usable_nodes)
5214 goto restart; 5214 goto restart;
5215 5215
5216 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5216 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5217 for (nid = 0; nid < MAX_NUMNODES; nid++) 5217 for (nid = 0; nid < MAX_NUMNODES; nid++)
5218 zone_movable_pfn[nid] = 5218 zone_movable_pfn[nid] =
5219 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5219 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5220 5220
5221 out: 5221 out:
5222 /* restore the node_state */ 5222 /* restore the node_state */
5223 node_states[N_MEMORY] = saved_node_state; 5223 node_states[N_MEMORY] = saved_node_state;
5224 } 5224 }
5225 5225
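/*
 * Sketch of the outcome with made-up numbers: booting with kernelcore=2G
 * on two nodes of 4GiB each (4KiB pages) asks for 524288 kernel pages,
 * i.e. 262144 per node.  Each node's zone_movable_pfn[] therefore lands
 * 262144 PFNs past its start (rounded up to MAX_ORDER_NR_PAGES), and the
 * remaining ~3GiB per node becomes ZONE_MOVABLE.
 */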
5226 /* Any regular or high memory on that node? */ 5226 /* Any regular or high memory on that node? */
5227 static void check_for_memory(pg_data_t *pgdat, int nid) 5227 static void check_for_memory(pg_data_t *pgdat, int nid)
5228 { 5228 {
5229 enum zone_type zone_type; 5229 enum zone_type zone_type;
5230 5230
5231 if (N_MEMORY == N_NORMAL_MEMORY) 5231 if (N_MEMORY == N_NORMAL_MEMORY)
5232 return; 5232 return;
5233 5233
5234 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5234 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5235 struct zone *zone = &pgdat->node_zones[zone_type]; 5235 struct zone *zone = &pgdat->node_zones[zone_type];
5236 if (zone->present_pages) { 5236 if (zone->present_pages) {
5237 node_set_state(nid, N_HIGH_MEMORY); 5237 node_set_state(nid, N_HIGH_MEMORY);
5238 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5238 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5239 zone_type <= ZONE_NORMAL) 5239 zone_type <= ZONE_NORMAL)
5240 node_set_state(nid, N_NORMAL_MEMORY); 5240 node_set_state(nid, N_NORMAL_MEMORY);
5241 break; 5241 break;
5242 } 5242 }
5243 } 5243 }
5244 } 5244 }
5245 5245
5246 /** 5246 /**
5247 * free_area_init_nodes - Initialise all pg_data_t and zone data 5247 * free_area_init_nodes - Initialise all pg_data_t and zone data
5248 * @max_zone_pfn: an array of max PFNs for each zone 5248 * @max_zone_pfn: an array of max PFNs for each zone
5249 * 5249 *
5250 * This will call free_area_init_node() for each active node in the system. 5250 * This will call free_area_init_node() for each active node in the system.
5251 * Using the page ranges provided by add_active_range(), the size of each 5251 * Using the page ranges provided by add_active_range(), the size of each
5252 * zone in each node and their holes are calculated. If the maximum PFNs 5252 * zone in each node and their holes are calculated. If the maximum PFNs
5253 * of two adjacent zones match, the zone is assumed to be empty. 5253 * of two adjacent zones match, the zone is assumed to be empty.
5254 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5254 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5255 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5255 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5256 * starts where the previous one ended. For example, ZONE_DMA32 starts 5256 * starts where the previous one ended. For example, ZONE_DMA32 starts
5257 * at arch_max_dma_pfn. 5257 * at arch_max_dma_pfn.
5258 */ 5258 */
5259 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5259 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5260 { 5260 {
5261 unsigned long start_pfn, end_pfn; 5261 unsigned long start_pfn, end_pfn;
5262 int i, nid; 5262 int i, nid;
5263 5263
5264 /* Record where the zone boundaries are */ 5264 /* Record where the zone boundaries are */
5265 memset(arch_zone_lowest_possible_pfn, 0, 5265 memset(arch_zone_lowest_possible_pfn, 0,
5266 sizeof(arch_zone_lowest_possible_pfn)); 5266 sizeof(arch_zone_lowest_possible_pfn));
5267 memset(arch_zone_highest_possible_pfn, 0, 5267 memset(arch_zone_highest_possible_pfn, 0,
5268 sizeof(arch_zone_highest_possible_pfn)); 5268 sizeof(arch_zone_highest_possible_pfn));
5269 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5269 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5270 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5270 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5271 for (i = 1; i < MAX_NR_ZONES; i++) { 5271 for (i = 1; i < MAX_NR_ZONES; i++) {
5272 if (i == ZONE_MOVABLE) 5272 if (i == ZONE_MOVABLE)
5273 continue; 5273 continue;
5274 arch_zone_lowest_possible_pfn[i] = 5274 arch_zone_lowest_possible_pfn[i] =
5275 arch_zone_highest_possible_pfn[i-1]; 5275 arch_zone_highest_possible_pfn[i-1];
5276 arch_zone_highest_possible_pfn[i] = 5276 arch_zone_highest_possible_pfn[i] =
5277 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5277 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5278 } 5278 }
5279 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5279 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5280 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5280 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5281 5281
5282 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5282 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5283 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5283 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5284 find_zone_movable_pfns_for_nodes(); 5284 find_zone_movable_pfns_for_nodes();
5285 5285
5286 /* Print out the zone ranges */ 5286 /* Print out the zone ranges */
5287 printk("Zone ranges:\n"); 5287 printk("Zone ranges:\n");
5288 for (i = 0; i < MAX_NR_ZONES; i++) { 5288 for (i = 0; i < MAX_NR_ZONES; i++) {
5289 if (i == ZONE_MOVABLE) 5289 if (i == ZONE_MOVABLE)
5290 continue; 5290 continue;
5291 printk(KERN_CONT " %-8s ", zone_names[i]); 5291 printk(KERN_CONT " %-8s ", zone_names[i]);
5292 if (arch_zone_lowest_possible_pfn[i] == 5292 if (arch_zone_lowest_possible_pfn[i] ==
5293 arch_zone_highest_possible_pfn[i]) 5293 arch_zone_highest_possible_pfn[i])
5294 printk(KERN_CONT "empty\n"); 5294 printk(KERN_CONT "empty\n");
5295 else 5295 else
5296 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5296 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5297 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5297 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5298 (arch_zone_highest_possible_pfn[i] 5298 (arch_zone_highest_possible_pfn[i]
5299 << PAGE_SHIFT) - 1); 5299 << PAGE_SHIFT) - 1);
5300 } 5300 }
5301 5301
5302 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5302 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5303 printk("Movable zone start for each node\n"); 5303 printk("Movable zone start for each node\n");
5304 for (i = 0; i < MAX_NUMNODES; i++) { 5304 for (i = 0; i < MAX_NUMNODES; i++) {
5305 if (zone_movable_pfn[i]) 5305 if (zone_movable_pfn[i])
5306 printk(" Node %d: %#010lx\n", i, 5306 printk(" Node %d: %#010lx\n", i,
5307 zone_movable_pfn[i] << PAGE_SHIFT); 5307 zone_movable_pfn[i] << PAGE_SHIFT);
5308 } 5308 }
5309 5309
5310 /* Print out the early node map */ 5310 /* Print out the early node map */
5311 printk("Early memory node ranges\n"); 5311 printk("Early memory node ranges\n");
5312 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5312 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5313 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5313 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5314 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5314 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5315 5315
5316 /* Initialise every node */ 5316 /* Initialise every node */
5317 mminit_verify_pageflags_layout(); 5317 mminit_verify_pageflags_layout();
5318 setup_nr_node_ids(); 5318 setup_nr_node_ids();
5319 for_each_online_node(nid) { 5319 for_each_online_node(nid) {
5320 pg_data_t *pgdat = NODE_DATA(nid); 5320 pg_data_t *pgdat = NODE_DATA(nid);
5321 free_area_init_node(nid, NULL, 5321 free_area_init_node(nid, NULL,
5322 find_min_pfn_for_node(nid), NULL); 5322 find_min_pfn_for_node(nid), NULL);
5323 5323
5324 /* Any memory on that node */ 5324 /* Any memory on that node */
5325 if (pgdat->node_present_pages) 5325 if (pgdat->node_present_pages)
5326 node_set_state(nid, N_MEMORY); 5326 node_set_state(nid, N_MEMORY);
5327 check_for_memory(pgdat, nid); 5327 check_for_memory(pgdat, nid);
5328 } 5328 }
5329 } 5329 }
5330 5330
5331 static int __init cmdline_parse_core(char *p, unsigned long *core) 5331 static int __init cmdline_parse_core(char *p, unsigned long *core)
5332 { 5332 {
5333 unsigned long long coremem; 5333 unsigned long long coremem;
5334 if (!p) 5334 if (!p)
5335 return -EINVAL; 5335 return -EINVAL;
5336 5336
5337 coremem = memparse(p, &p); 5337 coremem = memparse(p, &p);
5338 *core = coremem >> PAGE_SHIFT; 5338 *core = coremem >> PAGE_SHIFT;
5339 5339
5340 /* Paranoid check that UL is enough for the coremem value */ 5340 /* Paranoid check that UL is enough for the coremem value */
5341 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5341 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5342 5342
5343 return 0; 5343 return 0;
5344 } 5344 }
5345 5345
5346 /* 5346 /*
5347 * kernelcore=size sets the amount of memory to use for allocations that 5347 * kernelcore=size sets the amount of memory to use for allocations that
5348 * cannot be reclaimed or migrated. 5348 * cannot be reclaimed or migrated.
5349 */ 5349 */
5350 static int __init cmdline_parse_kernelcore(char *p) 5350 static int __init cmdline_parse_kernelcore(char *p)
5351 { 5351 {
5352 return cmdline_parse_core(p, &required_kernelcore); 5352 return cmdline_parse_core(p, &required_kernelcore);
5353 } 5353 }
5354 5354
5355 /* 5355 /*
5356 * movablecore=size sets the amount of memory to use for allocations that 5356 * movablecore=size sets the amount of memory to use for allocations that
5357 * can be reclaimed or migrated. 5357 * can be reclaimed or migrated.
5358 */ 5358 */
5359 static int __init cmdline_parse_movablecore(char *p) 5359 static int __init cmdline_parse_movablecore(char *p)
5360 { 5360 {
5361 return cmdline_parse_core(p, &required_movablecore); 5361 return cmdline_parse_core(p, &required_movablecore);
5362 } 5362 }
5363 5363
5364 early_param("kernelcore", cmdline_parse_kernelcore); 5364 early_param("kernelcore", cmdline_parse_kernelcore);
5365 early_param("movablecore", cmdline_parse_movablecore); 5365 early_param("movablecore", cmdline_parse_movablecore);
5366 5366
5367 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5367 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5368 5368
5369 void adjust_managed_page_count(struct page *page, long count) 5369 void adjust_managed_page_count(struct page *page, long count)
5370 { 5370 {
5371 spin_lock(&managed_page_count_lock); 5371 spin_lock(&managed_page_count_lock);
5372 page_zone(page)->managed_pages += count; 5372 page_zone(page)->managed_pages += count;
5373 totalram_pages += count; 5373 totalram_pages += count;
5374 #ifdef CONFIG_HIGHMEM 5374 #ifdef CONFIG_HIGHMEM
5375 if (PageHighMem(page)) 5375 if (PageHighMem(page))
5376 totalhigh_pages += count; 5376 totalhigh_pages += count;
5377 #endif 5377 #endif
5378 spin_unlock(&managed_page_count_lock); 5378 spin_unlock(&managed_page_count_lock);
5379 } 5379 }
5380 EXPORT_SYMBOL(adjust_managed_page_count); 5380 EXPORT_SYMBOL(adjust_managed_page_count);
5381 5381
5382 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5382 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5383 { 5383 {
5384 void *pos; 5384 void *pos;
5385 unsigned long pages = 0; 5385 unsigned long pages = 0;
5386 5386
5387 start = (void *)PAGE_ALIGN((unsigned long)start); 5387 start = (void *)PAGE_ALIGN((unsigned long)start);
5388 end = (void *)((unsigned long)end & PAGE_MASK); 5388 end = (void *)((unsigned long)end & PAGE_MASK);
5389 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5389 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5390 if ((unsigned int)poison <= 0xFF) 5390 if ((unsigned int)poison <= 0xFF)
5391 memset(pos, poison, PAGE_SIZE); 5391 memset(pos, poison, PAGE_SIZE);
5392 free_reserved_page(virt_to_page(pos)); 5392 free_reserved_page(virt_to_page(pos));
5393 } 5393 }
5394 5394
5395 if (pages && s) 5395 if (pages && s)
5396 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5396 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5397 s, pages << (PAGE_SHIFT - 10), start, end); 5397 s, pages << (PAGE_SHIFT - 10), start, end);
5398 5398
5399 return pages; 5399 return pages;
5400 } 5400 }
5401 EXPORT_SYMBOL(free_reserved_area); 5401 EXPORT_SYMBOL(free_reserved_area);
5402 5402
5403 #ifdef CONFIG_HIGHMEM 5403 #ifdef CONFIG_HIGHMEM
5404 void free_highmem_page(struct page *page) 5404 void free_highmem_page(struct page *page)
5405 { 5405 {
5406 __free_reserved_page(page); 5406 __free_reserved_page(page);
5407 totalram_pages++; 5407 totalram_pages++;
5408 page_zone(page)->managed_pages++; 5408 page_zone(page)->managed_pages++;
5409 totalhigh_pages++; 5409 totalhigh_pages++;
5410 } 5410 }
5411 #endif 5411 #endif
5412 5412
5413 5413
5414 void __init mem_init_print_info(const char *str) 5414 void __init mem_init_print_info(const char *str)
5415 { 5415 {
5416 unsigned long physpages, codesize, datasize, rosize, bss_size; 5416 unsigned long physpages, codesize, datasize, rosize, bss_size;
5417 unsigned long init_code_size, init_data_size; 5417 unsigned long init_code_size, init_data_size;
5418 5418
5419 physpages = get_num_physpages(); 5419 physpages = get_num_physpages();
5420 codesize = _etext - _stext; 5420 codesize = _etext - _stext;
5421 datasize = _edata - _sdata; 5421 datasize = _edata - _sdata;
5422 rosize = __end_rodata - __start_rodata; 5422 rosize = __end_rodata - __start_rodata;
5423 bss_size = __bss_stop - __bss_start; 5423 bss_size = __bss_stop - __bss_start;
5424 init_data_size = __init_end - __init_begin; 5424 init_data_size = __init_end - __init_begin;
5425 init_code_size = _einittext - _sinittext; 5425 init_code_size = _einittext - _sinittext;
5426 5426
5427 /* 5427 /*
5428 * Detect special cases and adjust section sizes accordingly: 5428 * Detect special cases and adjust section sizes accordingly:
5429 * 1) .init.* may be embedded into .data sections 5429 * 1) .init.* may be embedded into .data sections
5430 * 2) .init.text.* may be out of [__init_begin, __init_end], 5430 * 2) .init.text.* may be out of [__init_begin, __init_end],
5431 * please refer to arch/tile/kernel/vmlinux.lds.S. 5431 * please refer to arch/tile/kernel/vmlinux.lds.S.
5432 * 3) .rodata.* may be embedded into .text or .data sections. 5432 * 3) .rodata.* may be embedded into .text or .data sections.
5433 */ 5433 */
5434 #define adj_init_size(start, end, size, pos, adj) \ 5434 #define adj_init_size(start, end, size, pos, adj) \
5435 do { \ 5435 do { \
5436 if (start <= pos && pos < end && size > adj) \ 5436 if (start <= pos && pos < end && size > adj) \
5437 size -= adj; \ 5437 size -= adj; \
5438 } while (0) 5438 } while (0)
5439 5439
5440 adj_init_size(__init_begin, __init_end, init_data_size, 5440 adj_init_size(__init_begin, __init_end, init_data_size,
5441 _sinittext, init_code_size); 5441 _sinittext, init_code_size);
5442 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5442 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5443 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5443 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5444 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5444 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5445 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5445 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5446 5446
5447 #undef adj_init_size 5447 #undef adj_init_size
5448 5448
5449 printk("Memory: %luK/%luK available " 5449 printk("Memory: %luK/%luK available "
5450 "(%luK kernel code, %luK rwdata, %luK rodata, " 5450 "(%luK kernel code, %luK rwdata, %luK rodata, "
5451 "%luK init, %luK bss, %luK reserved" 5451 "%luK init, %luK bss, %luK reserved"
5452 #ifdef CONFIG_HIGHMEM 5452 #ifdef CONFIG_HIGHMEM
5453 ", %luK highmem" 5453 ", %luK highmem"
5454 #endif 5454 #endif
5455 "%s%s)\n", 5455 "%s%s)\n",
5456 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5456 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5457 codesize >> 10, datasize >> 10, rosize >> 10, 5457 codesize >> 10, datasize >> 10, rosize >> 10,
5458 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5458 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5459 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5459 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5460 #ifdef CONFIG_HIGHMEM 5460 #ifdef CONFIG_HIGHMEM
5461 totalhigh_pages << (PAGE_SHIFT-10), 5461 totalhigh_pages << (PAGE_SHIFT-10),
5462 #endif 5462 #endif
5463 str ? ", " : "", str ? str : ""); 5463 str ? ", " : "", str ? str : "");
5464 } 5464 }
5465 5465
5466 /** 5466 /**
5467 * set_dma_reserve - set the specified number of pages reserved in the first zone 5467 * set_dma_reserve - set the specified number of pages reserved in the first zone
5468 * @new_dma_reserve: The number of pages to mark reserved 5468 * @new_dma_reserve: The number of pages to mark reserved
5469 * 5469 *
5470 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5470 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5471 * In the DMA zone, a significant percentage may be consumed by kernel image 5471 * In the DMA zone, a significant percentage may be consumed by kernel image
5472 * and other unfreeable allocations which can skew the watermarks badly. This 5472 * and other unfreeable allocations which can skew the watermarks badly. This
5473 * function may optionally be used to account for unfreeable pages in the 5473 * function may optionally be used to account for unfreeable pages in the
5474 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5474 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5475 * smaller per-cpu batchsize. 5475 * smaller per-cpu batchsize.
5476 */ 5476 */
5477 void __init set_dma_reserve(unsigned long new_dma_reserve) 5477 void __init set_dma_reserve(unsigned long new_dma_reserve)
5478 { 5478 {
5479 dma_reserve = new_dma_reserve; 5479 dma_reserve = new_dma_reserve;
5480 } 5480 }
5481 5481
5482 void __init free_area_init(unsigned long *zones_size) 5482 void __init free_area_init(unsigned long *zones_size)
5483 { 5483 {
5484 free_area_init_node(0, zones_size, 5484 free_area_init_node(0, zones_size,
5485 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5485 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5486 } 5486 }
5487 5487
5488 static int page_alloc_cpu_notify(struct notifier_block *self, 5488 static int page_alloc_cpu_notify(struct notifier_block *self,
5489 unsigned long action, void *hcpu) 5489 unsigned long action, void *hcpu)
5490 { 5490 {
5491 int cpu = (unsigned long)hcpu; 5491 int cpu = (unsigned long)hcpu;
5492 5492
5493 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5493 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5494 lru_add_drain_cpu(cpu); 5494 lru_add_drain_cpu(cpu);
5495 drain_pages(cpu); 5495 drain_pages(cpu);
5496 5496
5497 /* 5497 /*
5498 * Spill the event counters of the dead processor 5498 * Spill the event counters of the dead processor
5499 * into the current processor's event counters. 5499 * into the current processor's event counters.
5500 * This artificially elevates the count of the current 5500 * This artificially elevates the count of the current
5501 * processor. 5501 * processor.
5502 */ 5502 */
5503 vm_events_fold_cpu(cpu); 5503 vm_events_fold_cpu(cpu);
5504 5504
5505 /* 5505 /*
5506 * Zero the differential counters of the dead processor 5506 * Zero the differential counters of the dead processor
5507 * so that the vm statistics are consistent. 5507 * so that the vm statistics are consistent.
5508 * 5508 *
5509 * This is only okay since the processor is dead and cannot 5509 * This is only okay since the processor is dead and cannot
5510 * race with what we are doing. 5510 * race with what we are doing.
5511 */ 5511 */
5512 cpu_vm_stats_fold(cpu); 5512 cpu_vm_stats_fold(cpu);
5513 } 5513 }
5514 return NOTIFY_OK; 5514 return NOTIFY_OK;
5515 } 5515 }
5516 5516
5517 void __init page_alloc_init(void) 5517 void __init page_alloc_init(void)
5518 { 5518 {
5519 hotcpu_notifier(page_alloc_cpu_notify, 0); 5519 hotcpu_notifier(page_alloc_cpu_notify, 0);
5520 } 5520 }
5521 5521
5522 /* 5522 /*
5523 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 5523 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
5524 * or min_free_kbytes changes. 5524 * or min_free_kbytes changes.
5525 */ 5525 */
5526 static void calculate_totalreserve_pages(void) 5526 static void calculate_totalreserve_pages(void)
5527 { 5527 {
5528 struct pglist_data *pgdat; 5528 struct pglist_data *pgdat;
5529 unsigned long reserve_pages = 0; 5529 unsigned long reserve_pages = 0;
5530 enum zone_type i, j; 5530 enum zone_type i, j;
5531 5531
5532 for_each_online_pgdat(pgdat) { 5532 for_each_online_pgdat(pgdat) {
5533 for (i = 0; i < MAX_NR_ZONES; i++) { 5533 for (i = 0; i < MAX_NR_ZONES; i++) {
5534 struct zone *zone = pgdat->node_zones + i; 5534 struct zone *zone = pgdat->node_zones + i;
5535 long max = 0; 5535 long max = 0;
5536 5536
5537 /* Find valid and maximum lowmem_reserve in the zone */ 5537 /* Find valid and maximum lowmem_reserve in the zone */
5538 for (j = i; j < MAX_NR_ZONES; j++) { 5538 for (j = i; j < MAX_NR_ZONES; j++) {
5539 if (zone->lowmem_reserve[j] > max) 5539 if (zone->lowmem_reserve[j] > max)
5540 max = zone->lowmem_reserve[j]; 5540 max = zone->lowmem_reserve[j];
5541 } 5541 }
5542 5542
5543 /* we treat the high watermark as reserved pages. */ 5543 /* we treat the high watermark as reserved pages. */
5544 max += high_wmark_pages(zone); 5544 max += high_wmark_pages(zone);
5545 5545
5546 if (max > zone->managed_pages) 5546 if (max > zone->managed_pages)
5547 max = zone->managed_pages; 5547 max = zone->managed_pages;
5548 reserve_pages += max; 5548 reserve_pages += max;
5549 /* 5549 /*
5550 * Lowmem reserves are not available to 5550 * Lowmem reserves are not available to
5551 * GFP_HIGHUSER page cache allocations and 5551 * GFP_HIGHUSER page cache allocations and
5552 * kswapd tries to balance zones to their high 5552 * kswapd tries to balance zones to their high
5553 * watermark. As a result, neither should be 5553 * watermark. As a result, neither should be
5554 * regarded as dirtyable memory, to prevent a 5554 * regarded as dirtyable memory, to prevent a
5555 * situation where reclaim has to clean pages 5555 * situation where reclaim has to clean pages
5556 * in order to balance the zones. 5556 * in order to balance the zones.
5557 */ 5557 */
5558 zone->dirty_balance_reserve = max; 5558 zone->dirty_balance_reserve = max;
5559 } 5559 }
5560 } 5560 }
5561 dirty_balance_reserve = reserve_pages; 5561 dirty_balance_reserve = reserve_pages;
5562 totalreserve_pages = reserve_pages; 5562 totalreserve_pages = reserve_pages;
5563 } 5563 }
5564 5564
5565 /* 5565 /*
5566 * setup_per_zone_lowmem_reserve - called whenever 5566 * setup_per_zone_lowmem_reserve - called whenever
5567 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 5567 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
5568 * has a correct lowmem_reserve value, so an adequate number of 5568 * has a correct lowmem_reserve value, so an adequate number of
5569 * pages are left in the zone after a successful __alloc_pages(). 5569 * pages are left in the zone after a successful __alloc_pages().
5570 */ 5570 */
5571 static void setup_per_zone_lowmem_reserve(void) 5571 static void setup_per_zone_lowmem_reserve(void)
5572 { 5572 {
5573 struct pglist_data *pgdat; 5573 struct pglist_data *pgdat;
5574 enum zone_type j, idx; 5574 enum zone_type j, idx;
5575 5575
5576 for_each_online_pgdat(pgdat) { 5576 for_each_online_pgdat(pgdat) {
5577 for (j = 0; j < MAX_NR_ZONES; j++) { 5577 for (j = 0; j < MAX_NR_ZONES; j++) {
5578 struct zone *zone = pgdat->node_zones + j; 5578 struct zone *zone = pgdat->node_zones + j;
5579 unsigned long managed_pages = zone->managed_pages; 5579 unsigned long managed_pages = zone->managed_pages;
5580 5580
5581 zone->lowmem_reserve[j] = 0; 5581 zone->lowmem_reserve[j] = 0;
5582 5582
5583 idx = j; 5583 idx = j;
5584 while (idx) { 5584 while (idx) {
5585 struct zone *lower_zone; 5585 struct zone *lower_zone;
5586 5586
5587 idx--; 5587 idx--;
5588 5588
5589 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5589 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5590 sysctl_lowmem_reserve_ratio[idx] = 1; 5590 sysctl_lowmem_reserve_ratio[idx] = 1;
5591 5591
5592 lower_zone = pgdat->node_zones + idx; 5592 lower_zone = pgdat->node_zones + idx;
5593 lower_zone->lowmem_reserve[j] = managed_pages / 5593 lower_zone->lowmem_reserve[j] = managed_pages /
5594 sysctl_lowmem_reserve_ratio[idx]; 5594 sysctl_lowmem_reserve_ratio[idx];
5595 managed_pages += lower_zone->managed_pages; 5595 managed_pages += lower_zone->managed_pages;
5596 } 5596 }
5597 } 5597 }
5598 } 5598 }
5599 5599
5600 /* update totalreserve_pages */ 5600 /* update totalreserve_pages */
5601 calculate_totalreserve_pages(); 5601 calculate_totalreserve_pages();
5602 } 5602 }
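The loop above gives every lower zone a reserve against each higher zone: the cumulative managed pages above it, divided by sysctl_lowmem_reserve_ratio. Below is a minimal userspace sketch of that arithmetic, assuming a hypothetical two-zone layout (4096 DMA pages, 1M Normal pages) and a ratio of 256; the names and values are illustrative, not taken from a running kernel.

/* Illustrative only: the lowmem_reserve arithmetic with made-up zone sizes. */
#include <stdio.h>

#define NR_ZONES 2      /* [0] = DMA, [1] = Normal (hypothetical layout) */

int main(void)
{
	unsigned long managed[NR_ZONES] = { 4096, 1048576 };  /* pages per zone */
	unsigned long ratio[NR_ZONES]   = { 256, 256 };       /* assumed ratios */
	unsigned long reserve[NR_ZONES][NR_ZONES] = { { 0 } };
	int j, idx;

	for (j = 0; j < NR_ZONES; j++) {
		unsigned long pages = managed[j];

		/* walk the zones below j, accumulating managed pages as we go */
		for (idx = j - 1; idx >= 0; idx--) {
			reserve[idx][j] = pages / ratio[idx];
			pages += managed[idx];
		}
	}

	/* With these numbers DMA keeps 1048576 / 256 = 4096 pages away from Normal-capable requests. */
	printf("DMA reserve against Normal allocations: %lu pages\n", reserve[0][1]);
	return 0;
}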
5603 5603
5604 static void __setup_per_zone_wmarks(void) 5604 static void __setup_per_zone_wmarks(void)
5605 { 5605 {
5606 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5606 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5607 unsigned long lowmem_pages = 0; 5607 unsigned long lowmem_pages = 0;
5608 struct zone *zone; 5608 struct zone *zone;
5609 unsigned long flags; 5609 unsigned long flags;
5610 5610
5611 /* Calculate total number of !ZONE_HIGHMEM pages */ 5611 /* Calculate total number of !ZONE_HIGHMEM pages */
5612 for_each_zone(zone) { 5612 for_each_zone(zone) {
5613 if (!is_highmem(zone)) 5613 if (!is_highmem(zone))
5614 lowmem_pages += zone->managed_pages; 5614 lowmem_pages += zone->managed_pages;
5615 } 5615 }
5616 5616
5617 for_each_zone(zone) { 5617 for_each_zone(zone) {
5618 u64 tmp; 5618 u64 tmp;
5619 5619
5620 spin_lock_irqsave(&zone->lock, flags); 5620 spin_lock_irqsave(&zone->lock, flags);
5621 tmp = (u64)pages_min * zone->managed_pages; 5621 tmp = (u64)pages_min * zone->managed_pages;
5622 do_div(tmp, lowmem_pages); 5622 do_div(tmp, lowmem_pages);
5623 if (is_highmem(zone)) { 5623 if (is_highmem(zone)) {
5624 /* 5624 /*
5625 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5625 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5626 * need highmem pages, so cap pages_min to a small 5626 * need highmem pages, so cap pages_min to a small
5627 * value here. 5627 * value here.
5628 * 5628 *
5629 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5629 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5630 * deltas control async page reclaim, and so should 5630 * deltas control async page reclaim, and so should
5631 * not be capped for highmem. 5631 * not be capped for highmem.
5632 */ 5632 */
5633 unsigned long min_pages; 5633 unsigned long min_pages;
5634 5634
5635 min_pages = zone->managed_pages / 1024; 5635 min_pages = zone->managed_pages / 1024;
5636 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5636 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5637 zone->watermark[WMARK_MIN] = min_pages; 5637 zone->watermark[WMARK_MIN] = min_pages;
5638 } else { 5638 } else {
5639 /* 5639 /*
5640 * If it's a lowmem zone, reserve a number of pages 5640 * If it's a lowmem zone, reserve a number of pages
5641 * proportionate to the zone's size. 5641 * proportionate to the zone's size.
5642 */ 5642 */
5643 zone->watermark[WMARK_MIN] = tmp; 5643 zone->watermark[WMARK_MIN] = tmp;
5644 } 5644 }
5645 5645
5646 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5646 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5647 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5647 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5648 5648
5649 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5649 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5650 high_wmark_pages(zone) - 5650 high_wmark_pages(zone) -
5651 low_wmark_pages(zone) - 5651 low_wmark_pages(zone) -
5652 zone_page_state(zone, NR_ALLOC_BATCH)); 5652 zone_page_state(zone, NR_ALLOC_BATCH));
5653 5653
5654 setup_zone_migrate_reserve(zone); 5654 setup_zone_migrate_reserve(zone);
5655 spin_unlock_irqrestore(&zone->lock, flags); 5655 spin_unlock_irqrestore(&zone->lock, flags);
5656 } 5656 }
5657 5657
5658 /* update totalreserve_pages */ 5658 /* update totalreserve_pages */
5659 calculate_totalreserve_pages(); 5659 calculate_totalreserve_pages();
5660 } 5660 }
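For a lowmem zone the code above splits min_free_kbytes across zones in proportion to managed pages, then derives the low and high marks by adding a quarter and a half of that share to the min mark. The stand-alone sketch below reproduces the arithmetic, assuming 4 KiB pages, a single 1 GiB lowmem zone and min_free_kbytes = 4096; all values are examples.

/* Illustrative only: per-zone watermark arithmetic reproduced in userspace. */
#include <stdio.h>

#define PAGE_SHIFT 12   /* assumption: 4 KiB pages */

int main(void)
{
	unsigned long min_free_kbytes = 4096;                   /* example value */
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 262144;                    /* 1 GiB of lowmem */
	unsigned long zone_managed = 262144;                    /* single-zone example */

	unsigned long long tmp = (unsigned long long)pages_min * zone_managed;
	tmp /= lowmem_pages;                                    /* this zone's share of pages_min */

	unsigned long wmark_min  = tmp;                         /* lowmem zone: no highmem clamp */
	unsigned long wmark_low  = wmark_min + (tmp >> 2);
	unsigned long wmark_high = wmark_min + (tmp >> 1);

	printf("min=%lu low=%lu high=%lu pages\n", wmark_min, wmark_low, wmark_high);
	return 0;
}

With these inputs the zone ends up with min=1024, low=1280 and high=1536 pages.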
5661 5661
5662 /** 5662 /**
5663 * setup_per_zone_wmarks - called when min_free_kbytes changes 5663 * setup_per_zone_wmarks - called when min_free_kbytes changes
5664 * or when memory is hot-{added|removed} 5664 * or when memory is hot-{added|removed}
5665 * 5665 *
5666 * Ensures that the watermark[min,low,high] values for each zone are set 5666 * Ensures that the watermark[min,low,high] values for each zone are set
5667 * correctly with respect to min_free_kbytes. 5667 * correctly with respect to min_free_kbytes.
5668 */ 5668 */
5669 void setup_per_zone_wmarks(void) 5669 void setup_per_zone_wmarks(void)
5670 { 5670 {
5671 mutex_lock(&zonelists_mutex); 5671 mutex_lock(&zonelists_mutex);
5672 __setup_per_zone_wmarks(); 5672 __setup_per_zone_wmarks();
5673 mutex_unlock(&zonelists_mutex); 5673 mutex_unlock(&zonelists_mutex);
5674 } 5674 }
5675 5675
5676 /* 5676 /*
5677 * The inactive anon list should be small enough that the VM never has to 5677 * The inactive anon list should be small enough that the VM never has to
5678 * do too much work, but large enough that each inactive page has a chance 5678 * do too much work, but large enough that each inactive page has a chance
5679 * to be referenced again before it is swapped out. 5679 * to be referenced again before it is swapped out.
5680 * 5680 *
5681 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5681 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5682 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5682 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5683 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5683 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5684 * the anonymous pages are kept on the inactive list. 5684 * the anonymous pages are kept on the inactive list.
5685 * 5685 *
5686 * total target max 5686 * total target max
5687 * memory ratio inactive anon 5687 * memory ratio inactive anon
5688 * ------------------------------------- 5688 * -------------------------------------
5689 * 10MB 1 5MB 5689 * 10MB 1 5MB
5690 * 100MB 1 50MB 5690 * 100MB 1 50MB
5691 * 1GB 3 250MB 5691 * 1GB 3 250MB
5692 * 10GB 10 0.9GB 5692 * 10GB 10 0.9GB
5693 * 100GB 31 3GB 5693 * 100GB 31 3GB
5694 * 1TB 101 10GB 5694 * 1TB 101 10GB
5695 * 10TB 320 32GB 5695 * 10TB 320 32GB
5696 */ 5696 */
5697 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5697 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5698 { 5698 {
5699 unsigned int gb, ratio; 5699 unsigned int gb, ratio;
5700 5700
5701 /* Zone size in gigabytes */ 5701 /* Zone size in gigabytes */
5702 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5702 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5703 if (gb) 5703 if (gb)
5704 ratio = int_sqrt(10 * gb); 5704 ratio = int_sqrt(10 * gb);
5705 else 5705 else
5706 ratio = 1; 5706 ratio = 1;
5707 5707
5708 zone->inactive_ratio = ratio; 5708 zone->inactive_ratio = ratio;
5709 } 5709 }
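The ratio is int_sqrt(10 * zone size in GiB), which is where the table above comes from. A quick check of a few sizes, using a naive integer square root as a stand-in for the kernel's int_sqrt():

/* Illustrative only: inactive_ratio = int_sqrt(10 * zone size in GiB). */
#include <stdio.h>

static unsigned long int_sqrt(unsigned long x)   /* naive stand-in for the kernel helper */
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long sizes_gb[] = { 1, 4, 10, 100, 1024 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
		unsigned long gb = sizes_gb[i];
		unsigned long ratio = gb ? int_sqrt(10 * gb) : 1;

		/* 1 GiB -> 3, 10 GiB -> 10, 100 GiB -> 31, 1 TiB -> 101, as in the table */
		printf("%5lu GiB zone -> inactive_ratio %lu\n", gb, ratio);
	}
	return 0;
}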
5710 5710
5711 static void __meminit setup_per_zone_inactive_ratio(void) 5711 static void __meminit setup_per_zone_inactive_ratio(void)
5712 { 5712 {
5713 struct zone *zone; 5713 struct zone *zone;
5714 5714
5715 for_each_zone(zone) 5715 for_each_zone(zone)
5716 calculate_zone_inactive_ratio(zone); 5716 calculate_zone_inactive_ratio(zone);
5717 } 5717 }
5718 5718
5719 /* 5719 /*
5720 * Initialise min_free_kbytes. 5720 * Initialise min_free_kbytes.
5721 * 5721 *
5722 * For small machines we want it small (128k min). For large machines 5722 * For small machines we want it small (128k min). For large machines
5723 * we want it large (64MB max). But it is not linear, because network 5723 * we want it large (64MB max). But it is not linear, because network
5724 * bandwidth does not increase linearly with machine size. We use 5724 * bandwidth does not increase linearly with machine size. We use
5725 * 5725 *
5726 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5726 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5727 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5727 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5728 * 5728 *
5729 * which yields 5729 * which yields
5730 * 5730 *
5731 * 16MB: 512k 5731 * 16MB: 512k
5732 * 32MB: 724k 5732 * 32MB: 724k
5733 * 64MB: 1024k 5733 * 64MB: 1024k
5734 * 128MB: 1448k 5734 * 128MB: 1448k
5735 * 256MB: 2048k 5735 * 256MB: 2048k
5736 * 512MB: 2896k 5736 * 512MB: 2896k
5737 * 1024MB: 4096k 5737 * 1024MB: 4096k
5738 * 2048MB: 5792k 5738 * 2048MB: 5792k
5739 * 4096MB: 8192k 5739 * 4096MB: 8192k
5740 * 8192MB: 11584k 5740 * 8192MB: 11584k
5741 * 16384MB: 16384k 5741 * 16384MB: 16384k
5742 */ 5742 */
5743 int __meminit init_per_zone_wmark_min(void) 5743 int __meminit init_per_zone_wmark_min(void)
5744 { 5744 {
5745 unsigned long lowmem_kbytes; 5745 unsigned long lowmem_kbytes;
5746 int new_min_free_kbytes; 5746 int new_min_free_kbytes;
5747 5747
5748 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5748 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5749 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5749 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5750 5750
5751 if (new_min_free_kbytes > user_min_free_kbytes) { 5751 if (new_min_free_kbytes > user_min_free_kbytes) {
5752 min_free_kbytes = new_min_free_kbytes; 5752 min_free_kbytes = new_min_free_kbytes;
5753 if (min_free_kbytes < 128) 5753 if (min_free_kbytes < 128)
5754 min_free_kbytes = 128; 5754 min_free_kbytes = 128;
5755 if (min_free_kbytes > 65536) 5755 if (min_free_kbytes > 65536)
5756 min_free_kbytes = 65536; 5756 min_free_kbytes = 65536;
5757 } else { 5757 } else {
5758 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5758 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5759 new_min_free_kbytes, user_min_free_kbytes); 5759 new_min_free_kbytes, user_min_free_kbytes);
5760 } 5760 }
5761 setup_per_zone_wmarks(); 5761 setup_per_zone_wmarks();
5762 refresh_zone_stat_thresholds(); 5762 refresh_zone_stat_thresholds();
5763 setup_per_zone_lowmem_reserve(); 5763 setup_per_zone_lowmem_reserve();
5764 setup_per_zone_inactive_ratio(); 5764 setup_per_zone_inactive_ratio();
5765 return 0; 5765 return 0;
5766 } 5766 }
5767 module_init(init_per_zone_wmark_min) 5767 module_init(init_per_zone_wmark_min)
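The sizing formula above, min_free_kbytes = sqrt(lowmem_kbytes * 16) clamped to [128k, 64M], can be reproduced outside the kernel; with 1 GiB of lowmem it lands on the 4096k row of the table. Example values only:

/* Illustrative only: the min_free_kbytes sizing formula with its clamp. */
#include <stdio.h>

static unsigned long int_sqrt(unsigned long x)   /* naive stand-in for the kernel helper */
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long lowmem_kbytes = 1024UL * 1024;   /* assume 1 GiB of lowmem */
	long min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

	if (min_free_kbytes < 128)
		min_free_kbytes = 128;
	if (min_free_kbytes > 65536)
		min_free_kbytes = 65536;

	printf("lowmem %lu kB -> min_free_kbytes %ld kB\n", lowmem_kbytes, min_free_kbytes);
	return 0;
}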
5768 5768
5769 /* 5769 /*
5770 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5770 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5771 * that we can call two helper functions whenever min_free_kbytes 5771 * that we can call two helper functions whenever min_free_kbytes
5772 * changes. 5772 * changes.
5773 */ 5773 */
5774 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5774 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5775 void __user *buffer, size_t *length, loff_t *ppos) 5775 void __user *buffer, size_t *length, loff_t *ppos)
5776 { 5776 {
5777 int rc; 5777 int rc;
5778 5778
5779 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5779 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5780 if (rc) 5780 if (rc)
5781 return rc; 5781 return rc;
5782 5782
5783 if (write) { 5783 if (write) {
5784 user_min_free_kbytes = min_free_kbytes; 5784 user_min_free_kbytes = min_free_kbytes;
5785 setup_per_zone_wmarks(); 5785 setup_per_zone_wmarks();
5786 } 5786 }
5787 return 0; 5787 return 0;
5788 } 5788 }
5789 5789
5790 #ifdef CONFIG_NUMA 5790 #ifdef CONFIG_NUMA
5791 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5791 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5792 void __user *buffer, size_t *length, loff_t *ppos) 5792 void __user *buffer, size_t *length, loff_t *ppos)
5793 { 5793 {
5794 struct zone *zone; 5794 struct zone *zone;
5795 int rc; 5795 int rc;
5796 5796
5797 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5797 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5798 if (rc) 5798 if (rc)
5799 return rc; 5799 return rc;
5800 5800
5801 for_each_zone(zone) 5801 for_each_zone(zone)
5802 zone->min_unmapped_pages = (zone->managed_pages * 5802 zone->min_unmapped_pages = (zone->managed_pages *
5803 sysctl_min_unmapped_ratio) / 100; 5803 sysctl_min_unmapped_ratio) / 100;
5804 return 0; 5804 return 0;
5805 } 5805 }
5806 5806
5807 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5807 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5808 void __user *buffer, size_t *length, loff_t *ppos) 5808 void __user *buffer, size_t *length, loff_t *ppos)
5809 { 5809 {
5810 struct zone *zone; 5810 struct zone *zone;
5811 int rc; 5811 int rc;
5812 5812
5813 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5813 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5814 if (rc) 5814 if (rc)
5815 return rc; 5815 return rc;
5816 5816
5817 for_each_zone(zone) 5817 for_each_zone(zone)
5818 zone->min_slab_pages = (zone->managed_pages * 5818 zone->min_slab_pages = (zone->managed_pages *
5819 sysctl_min_slab_ratio) / 100; 5819 sysctl_min_slab_ratio) / 100;
5820 return 0; 5820 return 0;
5821 } 5821 }
5822 #endif 5822 #endif
5823 5823
5824 /* 5824 /*
5825 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5825 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5826 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5826 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5827 * whenever sysctl_lowmem_reserve_ratio changes. 5827 * whenever sysctl_lowmem_reserve_ratio changes.
5828 * 5828 *
5829 * The reserve ratio has no relation to the minimum watermarks. The 5829 * The reserve ratio has no relation to the minimum watermarks. The
5830 * lowmem reserve ratio is only meaningful as a function of the 5830 * lowmem reserve ratio is only meaningful as a function of the
5831 * boot-time zone sizes. 5831 * boot-time zone sizes.
5832 */ 5832 */
5833 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5833 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5834 void __user *buffer, size_t *length, loff_t *ppos) 5834 void __user *buffer, size_t *length, loff_t *ppos)
5835 { 5835 {
5836 proc_dointvec_minmax(table, write, buffer, length, ppos); 5836 proc_dointvec_minmax(table, write, buffer, length, ppos);
5837 setup_per_zone_lowmem_reserve(); 5837 setup_per_zone_lowmem_reserve();
5838 return 0; 5838 return 0;
5839 } 5839 }
5840 5840
5841 /* 5841 /*
5842 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5842 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5843 * cpu. It is the fraction of total pages in each zone that a hot per-cpu 5843 * cpu. It is the fraction of total pages in each zone that a hot per-cpu
5844 * pagelist can hold before it gets flushed back to the buddy allocator. 5844 * pagelist can hold before it gets flushed back to the buddy allocator.
5845 */ 5845 */
5846 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5846 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5847 void __user *buffer, size_t *length, loff_t *ppos) 5847 void __user *buffer, size_t *length, loff_t *ppos)
5848 { 5848 {
5849 struct zone *zone; 5849 struct zone *zone;
5850 int old_percpu_pagelist_fraction; 5850 int old_percpu_pagelist_fraction;
5851 int ret; 5851 int ret;
5852 5852
5853 mutex_lock(&pcp_batch_high_lock); 5853 mutex_lock(&pcp_batch_high_lock);
5854 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 5854 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5855 5855
5856 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5856 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5857 if (!write || ret < 0) 5857 if (!write || ret < 0)
5858 goto out; 5858 goto out;
5859 5859
5860 /* Sanity checking to avoid pcp imbalance */ 5860 /* Sanity checking to avoid pcp imbalance */
5861 if (percpu_pagelist_fraction && 5861 if (percpu_pagelist_fraction &&
5862 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 5862 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5863 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 5863 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5864 ret = -EINVAL; 5864 ret = -EINVAL;
5865 goto out; 5865 goto out;
5866 } 5866 }
5867 5867
5868 /* No change? */ 5868 /* No change? */
5869 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 5869 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5870 goto out; 5870 goto out;
5871 5871
5872 for_each_populated_zone(zone) { 5872 for_each_populated_zone(zone) {
5873 unsigned int cpu; 5873 unsigned int cpu;
5874 5874
5875 for_each_possible_cpu(cpu) 5875 for_each_possible_cpu(cpu)
5876 pageset_set_high_and_batch(zone, 5876 pageset_set_high_and_batch(zone,
5877 per_cpu_ptr(zone->pageset, cpu)); 5877 per_cpu_ptr(zone->pageset, cpu));
5878 } 5878 }
5879 out: 5879 out:
5880 mutex_unlock(&pcp_batch_high_lock); 5880 mutex_unlock(&pcp_batch_high_lock);
5881 return ret; 5881 return ret;
5882 } 5882 }
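With a non-zero fraction, each populated zone's per-CPU list is allowed to hold roughly managed_pages / percpu_pagelist_fraction pages before it is drained (pageset_set_high_and_batch(), called above, derives the batch value from that high mark). A sketch of what a few fraction values imply for a hypothetical 4 GiB zone; the zone size and fractions are made up:

/* Illustrative only: what a given percpu_pagelist_fraction implies for pcp->high. */
#include <stdio.h>

int main(void)
{
	unsigned long managed_pages = 1048576;   /* example zone: 4 GiB with 4 KiB pages */
	unsigned int fractions[] = { 8, 64, 512 };
	unsigned int i;

	for (i = 0; i < sizeof(fractions) / sizeof(fractions[0]); i++) {
		unsigned long high = managed_pages / fractions[i];

		printf("fraction %3u -> pcp->high = %lu pages per CPU\n",
		       fractions[i], high);
	}
	return 0;
}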
5883 5883
5884 int hashdist = HASHDIST_DEFAULT; 5884 int hashdist = HASHDIST_DEFAULT;
5885 5885
5886 #ifdef CONFIG_NUMA 5886 #ifdef CONFIG_NUMA
5887 static int __init set_hashdist(char *str) 5887 static int __init set_hashdist(char *str)
5888 { 5888 {
5889 if (!str) 5889 if (!str)
5890 return 0; 5890 return 0;
5891 hashdist = simple_strtoul(str, &str, 0); 5891 hashdist = simple_strtoul(str, &str, 0);
5892 return 1; 5892 return 1;
5893 } 5893 }
5894 __setup("hashdist=", set_hashdist); 5894 __setup("hashdist=", set_hashdist);
5895 #endif 5895 #endif
5896 5896
5897 /* 5897 /*
5898 * allocate a large system hash table from bootmem 5898 * allocate a large system hash table from bootmem
5899 * - it is assumed that the hash table must contain an exact power-of-2 5899 * - it is assumed that the hash table must contain an exact power-of-2
5900 * quantity of entries 5900 * quantity of entries
5901 * - limit is the number of hash buckets, not the total allocation size 5901 * - limit is the number of hash buckets, not the total allocation size
5902 */ 5902 */
5903 void *__init alloc_large_system_hash(const char *tablename, 5903 void *__init alloc_large_system_hash(const char *tablename,
5904 unsigned long bucketsize, 5904 unsigned long bucketsize,
5905 unsigned long numentries, 5905 unsigned long numentries,
5906 int scale, 5906 int scale,
5907 int flags, 5907 int flags,
5908 unsigned int *_hash_shift, 5908 unsigned int *_hash_shift,
5909 unsigned int *_hash_mask, 5909 unsigned int *_hash_mask,
5910 unsigned long low_limit, 5910 unsigned long low_limit,
5911 unsigned long high_limit) 5911 unsigned long high_limit)
5912 { 5912 {
5913 unsigned long long max = high_limit; 5913 unsigned long long max = high_limit;
5914 unsigned long log2qty, size; 5914 unsigned long log2qty, size;
5915 void *table = NULL; 5915 void *table = NULL;
5916 5916
5917 /* allow the kernel cmdline to have a say */ 5917 /* allow the kernel cmdline to have a say */
5918 if (!numentries) { 5918 if (!numentries) {
5919 /* round applicable memory size up to nearest megabyte */ 5919 /* round applicable memory size up to nearest megabyte */
5920 numentries = nr_kernel_pages; 5920 numentries = nr_kernel_pages;
5921 5921
5922 /* Rounding up is not necessary when PAGE_SIZE >= 1MB */ 5922 /* Rounding up is not necessary when PAGE_SIZE >= 1MB */
5923 if (PAGE_SHIFT < 20) 5923 if (PAGE_SHIFT < 20)
5924 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5924 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5925 5925
5926 /* limit to 1 bucket per 2^scale bytes of low memory */ 5926 /* limit to 1 bucket per 2^scale bytes of low memory */
5927 if (scale > PAGE_SHIFT) 5927 if (scale > PAGE_SHIFT)
5928 numentries >>= (scale - PAGE_SHIFT); 5928 numentries >>= (scale - PAGE_SHIFT);
5929 else 5929 else
5930 numentries <<= (PAGE_SHIFT - scale); 5930 numentries <<= (PAGE_SHIFT - scale);
5931 5931
5932 /* Make sure we've got at least a 0-order allocation.. */ 5932 /* Make sure we've got at least a 0-order allocation.. */
5933 if (unlikely(flags & HASH_SMALL)) { 5933 if (unlikely(flags & HASH_SMALL)) {
5934 /* Makes no sense without HASH_EARLY */ 5934 /* Makes no sense without HASH_EARLY */
5935 WARN_ON(!(flags & HASH_EARLY)); 5935 WARN_ON(!(flags & HASH_EARLY));
5936 if (!(numentries >> *_hash_shift)) { 5936 if (!(numentries >> *_hash_shift)) {
5937 numentries = 1UL << *_hash_shift; 5937 numentries = 1UL << *_hash_shift;
5938 BUG_ON(!numentries); 5938 BUG_ON(!numentries);
5939 } 5939 }
5940 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5940 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5941 numentries = PAGE_SIZE / bucketsize; 5941 numentries = PAGE_SIZE / bucketsize;
5942 } 5942 }
5943 numentries = roundup_pow_of_two(numentries); 5943 numentries = roundup_pow_of_two(numentries);
5944 5944
5945 /* limit allocation size to 1/16 total memory by default */ 5945 /* limit allocation size to 1/16 total memory by default */
5946 if (max == 0) { 5946 if (max == 0) {
5947 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5947 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5948 do_div(max, bucketsize); 5948 do_div(max, bucketsize);
5949 } 5949 }
5950 max = min(max, 0x80000000ULL); 5950 max = min(max, 0x80000000ULL);
5951 5951
5952 if (numentries < low_limit) 5952 if (numentries < low_limit)
5953 numentries = low_limit; 5953 numentries = low_limit;
5954 if (numentries > max) 5954 if (numentries > max)
5955 numentries = max; 5955 numentries = max;
5956 5956
5957 log2qty = ilog2(numentries); 5957 log2qty = ilog2(numentries);
5958 5958
5959 do { 5959 do {
5960 size = bucketsize << log2qty; 5960 size = bucketsize << log2qty;
5961 if (flags & HASH_EARLY) 5961 if (flags & HASH_EARLY)
5962 table = alloc_bootmem_nopanic(size); 5962 table = alloc_bootmem_nopanic(size);
5963 else if (hashdist) 5963 else if (hashdist)
5964 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5964 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5965 else { 5965 else {
5966 /* 5966 /*
5967 * If bucketsize is not a power of two, we may free 5967 * If bucketsize is not a power of two, we may free
5968 * some pages at the end of the hash table, which 5968 * some pages at the end of the hash table, which
5969 * alloc_pages_exact() does automatically 5969 * alloc_pages_exact() does automatically
5970 */ 5970 */
5971 if (get_order(size) < MAX_ORDER) { 5971 if (get_order(size) < MAX_ORDER) {
5972 table = alloc_pages_exact(size, GFP_ATOMIC); 5972 table = alloc_pages_exact(size, GFP_ATOMIC);
5973 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5973 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5974 } 5974 }
5975 } 5975 }
5976 } while (!table && size > PAGE_SIZE && --log2qty); 5976 } while (!table && size > PAGE_SIZE && --log2qty);
5977 5977
5978 if (!table) 5978 if (!table)
5979 panic("Failed to allocate %s hash table\n", tablename); 5979 panic("Failed to allocate %s hash table\n", tablename);
5980 5980
5981 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5981 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5982 tablename, 5982 tablename,
5983 (1UL << log2qty), 5983 (1UL << log2qty),
5984 ilog2(size) - PAGE_SHIFT, 5984 ilog2(size) - PAGE_SHIFT,
5985 size); 5985 size);
5986 5986
5987 if (_hash_shift) 5987 if (_hash_shift)
5988 *_hash_shift = log2qty; 5988 *_hash_shift = log2qty;
5989 if (_hash_mask) 5989 if (_hash_mask)
5990 *_hash_mask = (1 << log2qty) - 1; 5990 *_hash_mask = (1 << log2qty) - 1;
5991 5991
5992 return table; 5992 return table;
5993 } 5993 }
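Ignoring the command-line override, the megabyte rounding and the 1/16-of-memory cap, the sizing boils down to: scale nr_kernel_pages by 2^(PAGE_SHIFT - scale), round up to a power of two, and take the log2 of that as the hash shift. A stand-alone sketch of that arithmetic with made-up numbers (8 GiB of lowmem, one bucket per 16 KiB, two-pointer buckets):

/* Illustrative only: the bucket-count sizing arithmetic, outside the kernel. */
#include <stdio.h>

#define PAGE_SHIFT 12                    /* assumption: 4 KiB pages */

static unsigned long roundup_pow_of_two(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned long nr_kernel_pages = 2097152;   /* example: 8 GiB of lowmem */
	int scale = 14;                            /* one bucket per 2^14 = 16 KiB */
	unsigned long bucketsize = sizeof(void *) * 2;
	unsigned long numentries = nr_kernel_pages;
	unsigned long log2qty = 0;

	if (scale > PAGE_SHIFT)
		numentries >>= (scale - PAGE_SHIFT);
	else
		numentries <<= (PAGE_SHIFT - scale);
	numentries = roundup_pow_of_two(numentries);

	while ((1UL << (log2qty + 1)) <= numentries)
		log2qty++;

	printf("%lu buckets (2^%lu), table size %lu bytes\n",
	       numentries, log2qty, bucketsize << log2qty);
	return 0;
}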
5994 5994
5995 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5995 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5996 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5996 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5997 unsigned long pfn) 5997 unsigned long pfn)
5998 { 5998 {
5999 #ifdef CONFIG_SPARSEMEM 5999 #ifdef CONFIG_SPARSEMEM
6000 return __pfn_to_section(pfn)->pageblock_flags; 6000 return __pfn_to_section(pfn)->pageblock_flags;
6001 #else 6001 #else
6002 return zone->pageblock_flags; 6002 return zone->pageblock_flags;
6003 #endif /* CONFIG_SPARSEMEM */ 6003 #endif /* CONFIG_SPARSEMEM */
6004 } 6004 }
6005 6005
6006 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 6006 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
6007 { 6007 {
6008 #ifdef CONFIG_SPARSEMEM 6008 #ifdef CONFIG_SPARSEMEM
6009 pfn &= (PAGES_PER_SECTION-1); 6009 pfn &= (PAGES_PER_SECTION-1);
6010 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6010 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
6011 #else 6011 #else
6012 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 6012 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
6013 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6013 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
6014 #endif /* CONFIG_SPARSEMEM */ 6014 #endif /* CONFIG_SPARSEMEM */
6015 } 6015 }
6016 6016
6017 /** 6017 /**
6018 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 6018 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
6019 * @page: The page within the block of interest 6019 * @page: The page within the block of interest
6020 * @pfn: The target page frame number 6020 * @pfn: The target page frame number
6021 * @end_bitidx: The last bit of interest 6021 * @end_bitidx: The last bit of interest
6022 * returns pageblock_bits flags 6022 * returns pageblock_bits flags
6023 */ 6023 */
6024 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, 6024 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
6025 unsigned long end_bitidx, 6025 unsigned long end_bitidx,
6026 unsigned long mask) 6026 unsigned long mask)
6027 { 6027 {
6028 struct zone *zone; 6028 struct zone *zone;
6029 unsigned long *bitmap; 6029 unsigned long *bitmap;
6030 unsigned long bitidx, word_bitidx; 6030 unsigned long bitidx, word_bitidx;
6031 unsigned long word; 6031 unsigned long word;
6032 6032
6033 zone = page_zone(page); 6033 zone = page_zone(page);
6034 bitmap = get_pageblock_bitmap(zone, pfn); 6034 bitmap = get_pageblock_bitmap(zone, pfn);
6035 bitidx = pfn_to_bitidx(zone, pfn); 6035 bitidx = pfn_to_bitidx(zone, pfn);
6036 word_bitidx = bitidx / BITS_PER_LONG; 6036 word_bitidx = bitidx / BITS_PER_LONG;
6037 bitidx &= (BITS_PER_LONG-1); 6037 bitidx &= (BITS_PER_LONG-1);
6038 6038
6039 word = bitmap[word_bitidx]; 6039 word = bitmap[word_bitidx];
6040 bitidx += end_bitidx; 6040 bitidx += end_bitidx;
6041 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 6041 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6042 } 6042 }
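The lookup reduces to reading one word of the pageblock bitmap and masking out a small group of bits, so one unsigned long describes BITS_PER_LONG / NR_PAGEBLOCK_BITS pageblocks. The toy program below shows the packing idea with 4-bit groups; note that the kernel indexes groups from the top of the word (the BITS_PER_LONG - bitidx - 1 shift), which this sketch intentionally does not reproduce.

/* Illustrative only: packing 4-bit flag groups into one word and reading one back. */
#include <stdio.h>

#define BITS_PER_LONG   (8 * sizeof(unsigned long))
#define GROUP_BITS      4UL                 /* NR_PAGEBLOCK_BITS in the kernel */
#define GROUP_MASK      ((1UL << GROUP_BITS) - 1)

static unsigned long get_group(unsigned long word, unsigned int idx)
{
	return (word >> (idx * GROUP_BITS)) & GROUP_MASK;
}

static unsigned long set_group(unsigned long word, unsigned int idx, unsigned long val)
{
	unsigned long shift = idx * GROUP_BITS;

	return (word & ~(GROUP_MASK << shift)) | ((val & GROUP_MASK) << shift);
}

int main(void)
{
	unsigned long word = 0;

	word = set_group(word, 0, 2);   /* e.g. a migratetype-like value for block 0 */
	word = set_group(word, 3, 5);   /* another group further into the word */

	printf("one word holds %lu groups; group 3 = %lu\n",
	       (unsigned long)(BITS_PER_LONG / GROUP_BITS), get_group(word, 3));
	return 0;
}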
6043 6043
6044 /** 6044 /**
6045 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 6045 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6046 * @page: The page within the block of interest 6046 * @page: The page within the block of interest
6047 * @start_bitidx: The first bit of interest 6047 * @start_bitidx: The first bit of interest
6048 * @end_bitidx: The last bit of interest 6048 * @end_bitidx: The last bit of interest
6049 * @flags: The flags to set 6049 * @flags: The flags to set
6050 */ 6050 */
6051 void set_pfnblock_flags_mask(struct page *page, unsigned long flags, 6051 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6052 unsigned long pfn, 6052 unsigned long pfn,
6053 unsigned long end_bitidx, 6053 unsigned long end_bitidx,
6054 unsigned long mask) 6054 unsigned long mask)
6055 { 6055 {
6056 struct zone *zone; 6056 struct zone *zone;
6057 unsigned long *bitmap; 6057 unsigned long *bitmap;
6058 unsigned long bitidx, word_bitidx; 6058 unsigned long bitidx, word_bitidx;
6059 unsigned long old_word, word; 6059 unsigned long old_word, word;
6060 6060
6061 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6061 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6062 6062
6063 zone = page_zone(page); 6063 zone = page_zone(page);
6064 bitmap = get_pageblock_bitmap(zone, pfn); 6064 bitmap = get_pageblock_bitmap(zone, pfn);
6065 bitidx = pfn_to_bitidx(zone, pfn); 6065 bitidx = pfn_to_bitidx(zone, pfn);
6066 word_bitidx = bitidx / BITS_PER_LONG; 6066 word_bitidx = bitidx / BITS_PER_LONG;
6067 bitidx &= (BITS_PER_LONG-1); 6067 bitidx &= (BITS_PER_LONG-1);
6068 6068
6069 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6069 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
6070 6070
6071 bitidx += end_bitidx; 6071 bitidx += end_bitidx;
6072 mask <<= (BITS_PER_LONG - bitidx - 1); 6072 mask <<= (BITS_PER_LONG - bitidx - 1);
6073 flags <<= (BITS_PER_LONG - bitidx - 1); 6073 flags <<= (BITS_PER_LONG - bitidx - 1);
6074 6074
6075 word = ACCESS_ONCE(bitmap[word_bitidx]); 6075 word = ACCESS_ONCE(bitmap[word_bitidx]);
6076 for (;;) { 6076 for (;;) {
6077 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6077 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6078 if (word == old_word) 6078 if (word == old_word)
6079 break; 6079 break;
6080 word = old_word; 6080 word = old_word;
6081 } 6081 }
6082 } 6082 }
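The update loop above is the usual lock-free read-modify-write: read the word, build the new value, cmpxchg it in, and retry if another CPU changed the word in the meantime. The same pattern in stand-alone C11, operating on a made-up flags word rather than the pageblock bitmap:

/* Illustrative only: the cmpxchg retry pattern from the function above, in C11. */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long flags_word;

static void set_field(unsigned long mask, unsigned long val)
{
	unsigned long old = atomic_load(&flags_word);

	/* Retry until no other thread has modified the word under us. */
	while (!atomic_compare_exchange_weak(&flags_word, &old,
					     (old & ~mask) | (val & mask)))
		;       /* 'old' is reloaded by the failed exchange */
}

int main(void)
{
	set_field(0xfUL, 0x5);          /* set the low 4-bit group to 5 */
	set_field(0xf0UL, 0x30);        /* set the next group to 3 */
	printf("flags word = %#lx\n", atomic_load(&flags_word));
	return 0;
}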
6083 6083
6084 /* 6084 /*
6085 * This function checks whether the pageblock includes unmovable pages or not. 6085 * This function checks whether the pageblock includes unmovable pages or not.
6086 * If @count is not zero, up to @count unmovable pages are tolerated. 6086 * If @count is not zero, up to @count unmovable pages are tolerated.
6087 * 6087 *
6088 * The PageLRU check without isolation or lru_lock could race, so a 6088 * The PageLRU check without isolation or lru_lock could race, so a
6089 * MIGRATE_MOVABLE block might include unmovable pages. This means the 6089 * MIGRATE_MOVABLE block might include unmovable pages. This means the
6090 * function cannot be expected to be exact. 6090 * function cannot be expected to be exact.
6091 */ 6091 */
6092 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 6092 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6093 bool skip_hwpoisoned_pages) 6093 bool skip_hwpoisoned_pages)
6094 { 6094 {
6095 unsigned long pfn, iter, found; 6095 unsigned long pfn, iter, found;
6096 int mt; 6096 int mt;
6097 6097
6098 /* 6098 /*
6099 * To avoid counting pages still sitting on per-cpu LRU caches as unmovable, lru_add_drain_all() should be called first. 6099 * To avoid counting pages still sitting on per-cpu LRU caches as unmovable, lru_add_drain_all() should be called first.
6100 * A ZONE_MOVABLE zone never contains unmovable pages. 6100 * A ZONE_MOVABLE zone never contains unmovable pages.
6101 */ 6101 */
6102 if (zone_idx(zone) == ZONE_MOVABLE) 6102 if (zone_idx(zone) == ZONE_MOVABLE)
6103 return false; 6103 return false;
6104 mt = get_pageblock_migratetype(page); 6104 mt = get_pageblock_migratetype(page);
6105 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 6105 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
6106 return false; 6106 return false;
6107 6107
6108 pfn = page_to_pfn(page); 6108 pfn = page_to_pfn(page);
6109 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 6109 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6110 unsigned long check = pfn + iter; 6110 unsigned long check = pfn + iter;
6111 6111
6112 if (!pfn_valid_within(check)) 6112 if (!pfn_valid_within(check))
6113 continue; 6113 continue;
6114 6114
6115 page = pfn_to_page(check); 6115 page = pfn_to_page(check);
6116 6116
6117 /* 6117 /*
6118 * Hugepages are not in LRU lists, but they're movable. 6118 * Hugepages are not in LRU lists, but they're movable.
6119 * We need not scan over tail pages because we don't 6119 * We need not scan over tail pages because we don't
6120 * handle each tail page individually in migration. 6120 * handle each tail page individually in migration.
6121 */ 6121 */
6122 if (PageHuge(page)) { 6122 if (PageHuge(page)) {
6123 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6123 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6124 continue; 6124 continue;
6125 } 6125 }
6126 6126
6127 /* 6127 /*
6128 * We can't use page_count without pinning the page 6128 * We can't use page_count without pinning the page
6129 * because another CPU can free the compound page. 6129 * because another CPU can free the compound page.
6130 * This check already skips compound tails of THP 6130 * This check already skips compound tails of THP
6131 * because their page->_count is zero at all times. 6131 * because their page->_count is zero at all times.
6132 */ 6132 */
6133 if (!atomic_read(&page->_count)) { 6133 if (!atomic_read(&page->_count)) {
6134 if (PageBuddy(page)) 6134 if (PageBuddy(page))
6135 iter += (1 << page_order(page)) - 1; 6135 iter += (1 << page_order(page)) - 1;
6136 continue; 6136 continue;
6137 } 6137 }
6138 6138
6139 /* 6139 /*
6140 * The HWPoisoned page may not be in the buddy system, and 6140 * The HWPoisoned page may not be in the buddy system, and
6141 * its page_count() is not 0. 6141 * its page_count() is not 0.
6142 */ 6142 */
6143 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6143 if (skip_hwpoisoned_pages && PageHWPoison(page))
6144 continue; 6144 continue;
6145 6145
6146 if (!PageLRU(page)) 6146 if (!PageLRU(page))
6147 found++; 6147 found++;
6148 /* 6148 /*
6149 * If there are RECLAIMABLE pages, we need to check them too. 6149 * If there are RECLAIMABLE pages, we need to check them too.
6150 * But for now, memory offline itself doesn't call shrink_slab() 6150 * But for now, memory offline itself doesn't call shrink_slab()
6151 * and that still needs to be fixed. 6151 * and that still needs to be fixed.
6152 */ 6152 */
6153 /* 6153 /*
6154 * If the page is not RAM, page_count() should be 0. We don't 6154 * If the page is not RAM, page_count() should be 0. We don't
6155 * need any further checks. This is a _used_, not-movable page. 6155 * need any further checks. This is a _used_, not-movable page.
6156 * 6156 *
6157 * The problematic thing here is PG_reserved pages. PG_reserved 6157 * The problematic thing here is PG_reserved pages. PG_reserved
6158 * is set to both of a memory hole page and a _used_ kernel 6158 * is set to both of a memory hole page and a _used_ kernel
6159 * page at boot. 6159 * page at boot.
6160 */ 6160 */
6161 if (found > count) 6161 if (found > count)
6162 return true; 6162 return true;
6163 } 6163 }
6164 return false; 6164 return false;
6165 } 6165 }
6166 6166
6167 bool is_pageblock_removable_nolock(struct page *page) 6167 bool is_pageblock_removable_nolock(struct page *page)
6168 { 6168 {
6169 struct zone *zone; 6169 struct zone *zone;
6170 unsigned long pfn; 6170 unsigned long pfn;
6171 6171
6172 /* 6172 /*
6173 * We have to be careful here because we are iterating over memory 6173 * We have to be careful here because we are iterating over memory
6174 * sections which are not zone aware so we might end up outside of 6174 * sections which are not zone aware so we might end up outside of
6175 * the zone but still within the section. 6175 * the zone but still within the section.
6176 * We have to take care about the node as well. If the node is offline 6176 * We have to take care about the node as well. If the node is offline
6177 * its NODE_DATA will be NULL - see page_zone. 6177 * its NODE_DATA will be NULL - see page_zone.
6178 */ 6178 */
6179 if (!node_online(page_to_nid(page))) 6179 if (!node_online(page_to_nid(page)))
6180 return false; 6180 return false;
6181 6181
6182 zone = page_zone(page); 6182 zone = page_zone(page);
6183 pfn = page_to_pfn(page); 6183 pfn = page_to_pfn(page);
6184 if (!zone_spans_pfn(zone, pfn)) 6184 if (!zone_spans_pfn(zone, pfn))
6185 return false; 6185 return false;
6186 6186
6187 return !has_unmovable_pages(zone, page, 0, true); 6187 return !has_unmovable_pages(zone, page, 0, true);
6188 } 6188 }
6189 6189
6190 #ifdef CONFIG_CMA 6190 #ifdef CONFIG_CMA
6191 6191
6192 static unsigned long pfn_max_align_down(unsigned long pfn) 6192 static unsigned long pfn_max_align_down(unsigned long pfn)
6193 { 6193 {
6194 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6194 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6195 pageblock_nr_pages) - 1); 6195 pageblock_nr_pages) - 1);
6196 } 6196 }
6197 6197
6198 static unsigned long pfn_max_align_up(unsigned long pfn) 6198 static unsigned long pfn_max_align_up(unsigned long pfn)
6199 { 6199 {
6200 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6200 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6201 pageblock_nr_pages)); 6201 pageblock_nr_pages));
6202 } 6202 }
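Both helpers round the requested PFN range out to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages, so isolation never cuts through a maximal buddy or a pageblock. The rough numbers below assume the common x86-64 defaults (MAX_ORDER 11, pageblock_order 9); the PFNs themselves are arbitrary examples.

/* Illustrative only: rounding a PFN range out to the larger alignment unit. */
#include <stdio.h>

#define MAX_ORDER_NR_PAGES   1024UL   /* assumption: MAX_ORDER = 11 */
#define PAGEBLOCK_NR_PAGES    512UL   /* assumption: pageblock_order = 9 */

static unsigned long align_unit(void)
{
	return MAX_ORDER_NR_PAGES > PAGEBLOCK_NR_PAGES ?
	       MAX_ORDER_NR_PAGES : PAGEBLOCK_NR_PAGES;
}

static unsigned long pfn_align_down(unsigned long pfn)
{
	return pfn & ~(align_unit() - 1);
}

static unsigned long pfn_align_up(unsigned long pfn)
{
	return (pfn + align_unit() - 1) & ~(align_unit() - 1);
}

int main(void)
{
	unsigned long start = 262500, end = 270000;    /* arbitrary example PFNs */

	/* The wider, aligned range is what actually gets isolated. */
	printf("isolate [%lu, %lu) instead of [%lu, %lu)\n",
	       pfn_align_down(start), pfn_align_up(end), start, end);
	return 0;
}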
6203 6203
6204 /* [start, end) must belong to a single zone. */ 6204 /* [start, end) must belong to a single zone. */
6205 static int __alloc_contig_migrate_range(struct compact_control *cc, 6205 static int __alloc_contig_migrate_range(struct compact_control *cc,
6206 unsigned long start, unsigned long end) 6206 unsigned long start, unsigned long end)
6207 { 6207 {
6208 /* This function is based on compact_zone() from compaction.c. */ 6208 /* This function is based on compact_zone() from compaction.c. */
6209 unsigned long nr_reclaimed; 6209 unsigned long nr_reclaimed;
6210 unsigned long pfn = start; 6210 unsigned long pfn = start;
6211 unsigned int tries = 0; 6211 unsigned int tries = 0;
6212 int ret = 0; 6212 int ret = 0;
6213 6213
6214 migrate_prep(); 6214 migrate_prep();
6215 6215
6216 while (pfn < end || !list_empty(&cc->migratepages)) { 6216 while (pfn < end || !list_empty(&cc->migratepages)) {
6217 if (fatal_signal_pending(current)) { 6217 if (fatal_signal_pending(current)) {
6218 ret = -EINTR; 6218 ret = -EINTR;
6219 break; 6219 break;
6220 } 6220 }
6221 6221
6222 if (list_empty(&cc->migratepages)) { 6222 if (list_empty(&cc->migratepages)) {
6223 cc->nr_migratepages = 0; 6223 cc->nr_migratepages = 0;
6224 pfn = isolate_migratepages_range(cc->zone, cc, 6224 pfn = isolate_migratepages_range(cc->zone, cc,
6225 pfn, end, true); 6225 pfn, end, true);
6226 if (!pfn) { 6226 if (!pfn) {
6227 ret = -EINTR; 6227 ret = -EINTR;
6228 break; 6228 break;
6229 } 6229 }
6230 tries = 0; 6230 tries = 0;
6231 } else if (++tries == 5) { 6231 } else if (++tries == 5) {
6232 ret = ret < 0 ? ret : -EBUSY; 6232 ret = ret < 0 ? ret : -EBUSY;
6233 break; 6233 break;
6234 } 6234 }
6235 6235
6236 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6236 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6237 &cc->migratepages); 6237 &cc->migratepages);
6238 cc->nr_migratepages -= nr_reclaimed; 6238 cc->nr_migratepages -= nr_reclaimed;
6239 6239
6240 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6240 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6241 NULL, 0, cc->mode, MR_CMA); 6241 NULL, 0, cc->mode, MR_CMA);
6242 } 6242 }
6243 if (ret < 0) { 6243 if (ret < 0) {
6244 putback_movable_pages(&cc->migratepages); 6244 putback_movable_pages(&cc->migratepages);
6245 return ret; 6245 return ret;
6246 } 6246 }
6247 return 0; 6247 return 0;
6248 } 6248 }
6249 6249
6250 /** 6250 /**
6251 * alloc_contig_range() -- tries to allocate given range of pages 6251 * alloc_contig_range() -- tries to allocate given range of pages
6252 * @start: start PFN to allocate 6252 * @start: start PFN to allocate
6253 * @end: one-past-the-last PFN to allocate 6253 * @end: one-past-the-last PFN to allocate
6254 * @migratetype: migratetype of the underlying pageblocks (either 6254 * @migratetype: migratetype of the underlying pageblocks (either
6255 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6255 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6256 * in range must have the same migratetype and it must 6256 * in range must have the same migratetype and it must
6257 * be either of the two. 6257 * be either of the two.
6258 * 6258 *
6259 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6259 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6260 * aligned; however, it's the caller's responsibility to guarantee that 6260 * aligned; however, it's the caller's responsibility to guarantee that
6261 * we are the only thread that changes the migrate type of the pageblocks the 6261 * we are the only thread that changes the migrate type of the pageblocks the
6262 * pages fall in. 6262 * pages fall in.
6263 * 6263 *
6264 * The PFN range must belong to a single zone. 6264 * The PFN range must belong to a single zone.
6265 * 6265 *
6266 * Returns zero on success or a negative error code. On success, all 6266 * Returns zero on success or a negative error code. On success, all
6267 * pages whose PFN is in [start, end) are allocated for the caller and 6267 * pages whose PFN is in [start, end) are allocated for the caller and
6268 * need to be freed with free_contig_range(). 6268 * need to be freed with free_contig_range().
6269 */ 6269 */
6270 int alloc_contig_range(unsigned long start, unsigned long end, 6270 int alloc_contig_range(unsigned long start, unsigned long end,
6271 unsigned migratetype) 6271 unsigned migratetype)
6272 { 6272 {
6273 unsigned long outer_start, outer_end; 6273 unsigned long outer_start, outer_end;
6274 int ret = 0, order; 6274 int ret = 0, order;
6275 6275
6276 struct compact_control cc = { 6276 struct compact_control cc = {
6277 .nr_migratepages = 0, 6277 .nr_migratepages = 0,
6278 .order = -1, 6278 .order = -1,
6279 .zone = page_zone(pfn_to_page(start)), 6279 .zone = page_zone(pfn_to_page(start)),
6280 .mode = MIGRATE_SYNC, 6280 .mode = MIGRATE_SYNC,
6281 .ignore_skip_hint = true, 6281 .ignore_skip_hint = true,
6282 }; 6282 };
6283 INIT_LIST_HEAD(&cc.migratepages); 6283 INIT_LIST_HEAD(&cc.migratepages);
6284 6284
6285 /* 6285 /*
6286 * What we do here is we mark all pageblocks in range as 6286 * What we do here is we mark all pageblocks in range as
6287 * MIGRATE_ISOLATE. Because pageblock and max order pages may 6287 * MIGRATE_ISOLATE. Because pageblock and max order pages may
6288 * have different sizes, and due to the way the page allocator 6288 * have different sizes, and due to the way the page allocator
6289 * works, we align the range to the bigger of the two sizes so 6289 * works, we align the range to the bigger of the two sizes so
6290 * that the page allocator won't try to merge buddies from 6290 * that the page allocator won't try to merge buddies from
6291 * different pageblocks and change MIGRATE_ISOLATE to some 6291 * different pageblocks and change MIGRATE_ISOLATE to some
6292 * other migration type. 6292 * other migration type.
6293 * 6293 *
6294 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6294 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6295 * migrate the pages from an unaligned range (ie. pages that 6295 * migrate the pages from an unaligned range (ie. pages that
6296 * we are interested in). This will put all the pages in 6296 * we are interested in). This will put all the pages in
6297 * range back to page allocator as MIGRATE_ISOLATE. 6297 * range back to page allocator as MIGRATE_ISOLATE.
6298 * 6298 *
6299 * When this is done, we take the pages in range from page 6299 * When this is done, we take the pages in range from page
6300 * allocator removing them from the buddy system. This way 6300 * allocator removing them from the buddy system. This way
6301 * page allocator will never consider using them. 6301 * page allocator will never consider using them.
6302 * 6302 *
6303 * This lets us mark the pageblocks back as 6303 * This lets us mark the pageblocks back as
6304 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6304 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6305 * aligned range but not in the unaligned, original range are 6305 * aligned range but not in the unaligned, original range are
6306 * put back to page allocator so that buddy can use them. 6306 * put back to page allocator so that buddy can use them.
6307 */ 6307 */
6308 6308
6309 ret = start_isolate_page_range(pfn_max_align_down(start), 6309 ret = start_isolate_page_range(pfn_max_align_down(start),
6310 pfn_max_align_up(end), migratetype, 6310 pfn_max_align_up(end), migratetype,
6311 false); 6311 false);
6312 if (ret) 6312 if (ret)
6313 return ret; 6313 return ret;
6314 6314
6315 ret = __alloc_contig_migrate_range(&cc, start, end); 6315 ret = __alloc_contig_migrate_range(&cc, start, end);
6316 if (ret) 6316 if (ret)
6317 goto done; 6317 goto done;
6318 6318
6319 /* 6319 /*
6320 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 6320 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
6321 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6321 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6322 * more, all pages in [start, end) are free in page allocator. 6322 * more, all pages in [start, end) are free in page allocator.
6323 * What we are going to do is to allocate all pages from 6323 * What we are going to do is to allocate all pages from
6324 * [start, end) (that is remove them from page allocator). 6324 * [start, end) (that is remove them from page allocator).
6325 * 6325 *
6326 * The only problem is that pages at the beginning and at the 6326 * The only problem is that pages at the beginning and at the
6327 * end of the interesting range may not be aligned with pages that 6327 * end of the interesting range may not be aligned with pages that
6328 * page allocator holds, ie. they can be part of higher order 6328 * page allocator holds, ie. they can be part of higher order
6329 * pages. Because of this, we reserve the bigger range and 6329 * pages. Because of this, we reserve the bigger range and
6330 * once this is done free the pages we are not interested in. 6330 * once this is done free the pages we are not interested in.
6331 * 6331 *
6332 * We don't have to hold zone->lock here because the pages are 6332 * We don't have to hold zone->lock here because the pages are
6333 * isolated thus they won't get removed from buddy. 6333 * isolated thus they won't get removed from buddy.
6334 */ 6334 */
6335 6335
6336 lru_add_drain_all(); 6336 lru_add_drain_all();
6337 drain_all_pages(); 6337 drain_all_pages();
6338 6338
6339 order = 0; 6339 order = 0;
6340 outer_start = start; 6340 outer_start = start;
6341 while (!PageBuddy(pfn_to_page(outer_start))) { 6341 while (!PageBuddy(pfn_to_page(outer_start))) {
6342 if (++order >= MAX_ORDER) { 6342 if (++order >= MAX_ORDER) {
6343 ret = -EBUSY; 6343 ret = -EBUSY;
6344 goto done; 6344 goto done;
6345 } 6345 }
6346 outer_start &= ~0UL << order; 6346 outer_start &= ~0UL << order;
6347 } 6347 }
6348 6348
6349 /* Make sure the range is really isolated. */ 6349 /* Make sure the range is really isolated. */
6350 if (test_pages_isolated(outer_start, end, false)) { 6350 if (test_pages_isolated(outer_start, end, false)) {
6351 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6351 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
6352 outer_start, end); 6352 outer_start, end);
6353 ret = -EBUSY; 6353 ret = -EBUSY;
6354 goto done; 6354 goto done;
6355 } 6355 }
6356 6356
6357 6357
6358 /* Grab isolated pages from freelists. */ 6358 /* Grab isolated pages from freelists. */
6359 outer_end = isolate_freepages_range(&cc, outer_start, end); 6359 outer_end = isolate_freepages_range(&cc, outer_start, end);
6360 if (!outer_end) { 6360 if (!outer_end) {
6361 ret = -EBUSY; 6361 ret = -EBUSY;
6362 goto done; 6362 goto done;
6363 } 6363 }
6364 6364
6365 /* Free head and tail (if any) */ 6365 /* Free head and tail (if any) */
6366 if (start != outer_start) 6366 if (start != outer_start)
6367 free_contig_range(outer_start, start - outer_start); 6367 free_contig_range(outer_start, start - outer_start);
6368 if (end != outer_end) 6368 if (end != outer_end)
6369 free_contig_range(end, outer_end - end); 6369 free_contig_range(end, outer_end - end);
6370 6370
6371 done: 6371 done:
6372 undo_isolate_page_range(pfn_max_align_down(start), 6372 undo_isolate_page_range(pfn_max_align_down(start),
6373 pfn_max_align_up(end), migratetype); 6373 pfn_max_align_up(end), migratetype);
6374 return ret; 6374 return ret;
6375 } 6375 }
6376 6376
6377 void free_contig_range(unsigned long pfn, unsigned nr_pages) 6377 void free_contig_range(unsigned long pfn, unsigned nr_pages)
6378 { 6378 {
6379 unsigned int count = 0; 6379 unsigned int count = 0;
6380 6380
6381 for (; nr_pages--; pfn++) { 6381 for (; nr_pages--; pfn++) {
6382 struct page *page = pfn_to_page(pfn); 6382 struct page *page = pfn_to_page(pfn);
6383 6383
6384 count += page_count(page) != 1; 6384 count += page_count(page) != 1;
6385 __free_page(page); 6385 __free_page(page);
6386 } 6386 }
6387 WARN(count != 0, "%d pages are still in use!\n", count); 6387 WARN(count != 0, "%d pages are still in use!\n", count);
6388 } 6388 }
6389 #endif 6389 #endif
6390 6390
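Taken together, the kernel-doc and block comments above describe the expected calling pattern: pick a PFN range that lies in a single zone and whose pageblocks were reserved as MIGRATE_CMA or MIGRATE_MOVABLE, call alloc_contig_range(), use the pages, then hand every page back (still with refcount 1) via free_contig_range(). A minimal caller sketch follows; the helper names and the assumption that the range was reserved elsewhere (for example by a CMA-style area) are illustrative, not part of the code above.

	/*
	 * Illustrative sketch only: allocate nr_pages physically contiguous
	 * pages starting at base_pfn and return the first struct page.
	 * Assumes [base_pfn, base_pfn + nr_pages) sits in one zone and its
	 * pageblocks were set to MIGRATE_CMA beforehand.
	 */
	static struct page *grab_contig_pages_sketch(unsigned long base_pfn,
						     unsigned long nr_pages)
	{
		int ret;

		ret = alloc_contig_range(base_pfn, base_pfn + nr_pages,
					 MIGRATE_CMA);
		if (ret)	/* -EINTR, -EBUSY, ... from the paths above */
			return NULL;

		return pfn_to_page(base_pfn);
	}

	static void put_contig_pages_sketch(struct page *page,
					    unsigned long nr_pages)
	{
		/* each page must still have refcount 1, or the WARN above fires */
		free_contig_range(page_to_pfn(page), nr_pages);
	}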
6391 #ifdef CONFIG_MEMORY_HOTPLUG 6391 #ifdef CONFIG_MEMORY_HOTPLUG
6392 /* 6392 /*
6393 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6393 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6394 * page high values need to be recalculated. 6394 * page high values need to be recalculated.
6395 */ 6395 */
6396 void __meminit zone_pcp_update(struct zone *zone) 6396 void __meminit zone_pcp_update(struct zone *zone)
6397 { 6397 {
6398 unsigned cpu; 6398 unsigned cpu;
6399 mutex_lock(&pcp_batch_high_lock); 6399 mutex_lock(&pcp_batch_high_lock);
6400 for_each_possible_cpu(cpu) 6400 for_each_possible_cpu(cpu)
6401 pageset_set_high_and_batch(zone, 6401 pageset_set_high_and_batch(zone,
6402 per_cpu_ptr(zone->pageset, cpu)); 6402 per_cpu_ptr(zone->pageset, cpu));
6403 mutex_unlock(&pcp_batch_high_lock); 6403 mutex_unlock(&pcp_batch_high_lock);
6404 } 6404 }
6405 #endif 6405 #endif
6406 6406
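zone_pcp_update() is only useful after zone->managed_pages has changed, which in this tree happens from the memory hotplug paths; the per-cpu ->batch and ->high values are derived from the zone size and go stale otherwise. A hedged sketch of such a caller (the helper name and the way the delta is applied are illustrative, not the actual hotplug code):

	/*
	 * Illustrative only (CONFIG_MEMORY_HOTPLUG): after an online/offline
	 * event has changed the number of pages the zone manages, recompute
	 * the per-cpu batch/high values on every possible CPU.
	 */
	static void zone_resized_sketch(struct zone *zone, long nr_pages_delta)
	{
		zone->managed_pages += nr_pages_delta;	/* assumed done by hotplug code */
		zone_pcp_update(zone);
	}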
6407 void zone_pcp_reset(struct zone *zone) 6407 void zone_pcp_reset(struct zone *zone)
6408 { 6408 {
6409 unsigned long flags; 6409 unsigned long flags;
6410 int cpu; 6410 int cpu;
6411 struct per_cpu_pageset *pset; 6411 struct per_cpu_pageset *pset;
6412 6412
6413 /* avoid races with drain_pages() */ 6413 /* avoid races with drain_pages() */
6414 local_irq_save(flags); 6414 local_irq_save(flags);
6415 if (zone->pageset != &boot_pageset) { 6415 if (zone->pageset != &boot_pageset) {
6416 for_each_online_cpu(cpu) { 6416 for_each_online_cpu(cpu) {
6417 pset = per_cpu_ptr(zone->pageset, cpu); 6417 pset = per_cpu_ptr(zone->pageset, cpu);
6418 drain_zonestat(zone, pset); 6418 drain_zonestat(zone, pset);
6419 } 6419 }
6420 free_percpu(zone->pageset); 6420 free_percpu(zone->pageset);
6421 zone->pageset = &boot_pageset; 6421 zone->pageset = &boot_pageset;
6422 } 6422 }
6423 local_irq_restore(flags); 6423 local_irq_restore(flags);
6424 } 6424 }
6425 6425
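zone_pcp_reset() covers the opposite case: a zone that has lost all of its memory folds its per-cpu vmstat deltas back into the zone counters, frees the dynamically allocated pagesets and falls back to the static boot_pageset. A hypothetical teardown-style caller (not the kernel's actual offline path):

	/*
	 * Illustrative sketch: once a zone has been emptied, e.g. by a
	 * hotremove-style operation, drop its per-cpu pagesets.
	 */
	static void zone_emptied_sketch(struct zone *zone)
	{
		if (!populated_zone(zone))
			zone_pcp_reset(zone);
	}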
6426 #ifdef CONFIG_MEMORY_HOTREMOVE 6426 #ifdef CONFIG_MEMORY_HOTREMOVE
6427 /* 6427 /*
6428 * All pages in the range must be isolated before calling this. 6428 * All pages in the range must be isolated before calling this.
6429 */ 6429 */
6430 void 6430 void
6431 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6431 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6432 { 6432 {
6433 struct page *page; 6433 struct page *page;
6434 struct zone *zone; 6434 struct zone *zone;
6435 unsigned int order, i; 6435 unsigned int order, i;
6436 unsigned long pfn; 6436 unsigned long pfn;
6437 unsigned long flags; 6437 unsigned long flags;
6438 /* find the first valid pfn */ 6438 /* find the first valid pfn */
6439 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6439 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6440 if (pfn_valid(pfn)) 6440 if (pfn_valid(pfn))
6441 break; 6441 break;
6442 if (pfn == end_pfn) 6442 if (pfn == end_pfn)
6443 return; 6443 return;
6444 zone = page_zone(pfn_to_page(pfn)); 6444 zone = page_zone(pfn_to_page(pfn));
6445 spin_lock_irqsave(&zone->lock, flags); 6445 spin_lock_irqsave(&zone->lock, flags);
6446 pfn = start_pfn; 6446 pfn = start_pfn;
6447 while (pfn < end_pfn) { 6447 while (pfn < end_pfn) {
6448 if (!pfn_valid(pfn)) { 6448 if (!pfn_valid(pfn)) {
6449 pfn++; 6449 pfn++;
6450 continue; 6450 continue;
6451 } 6451 }
6452 page = pfn_to_page(pfn); 6452 page = pfn_to_page(pfn);
6453 /* 6453 /*
6454 * The HWPoisoned page may not be in the buddy system, and 6454 * The HWPoisoned page may not be in the buddy system, and
6455 * page_count() is not 0. 6455 * page_count() is not 0.
6456 */ 6456 */
6457 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6457 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6458 pfn++; 6458 pfn++;
6459 SetPageReserved(page); 6459 SetPageReserved(page);
6460 continue; 6460 continue;
6461 } 6461 }
6462 6462
6463 BUG_ON(page_count(page)); 6463 BUG_ON(page_count(page));
6464 BUG_ON(!PageBuddy(page)); 6464 BUG_ON(!PageBuddy(page));
6465 order = page_order(page); 6465 order = page_order(page);
6466 #ifdef CONFIG_DEBUG_VM 6466 #ifdef CONFIG_DEBUG_VM
6467 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6467 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6468 pfn, 1 << order, end_pfn); 6468 pfn, 1 << order, end_pfn);
6469 #endif 6469 #endif
6470 list_del(&page->lru); 6470 list_del(&page->lru);
6471 rmv_page_order(page); 6471 rmv_page_order(page);
6472 zone->free_area[order].nr_free--; 6472 zone->free_area[order].nr_free--;
6473 for (i = 0; i < (1 << order); i++) 6473 for (i = 0; i < (1 << order); i++)
6474 SetPageReserved((page+i)); 6474 SetPageReserved((page+i));
6475 pfn += (1 << order); 6475 pfn += (1 << order);
6476 } 6476 }
6477 spin_unlock_irqrestore(&zone->lock, flags); 6477 spin_unlock_irqrestore(&zone->lock, flags);
6478 } 6478 }
6479 #endif 6479 #endif
6480 6480
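The scan in __offline_isolated_pages() advances in buddy-sized steps: a free page of order N owns 1 << N contiguous PFNs, every one of which is marked Reserved before the loop skips past the whole buddy. A small worked example of that arithmetic (the helper and the numbers are illustrative):

	/*
	 * Worked example: a free buddy of order 3 starting at pfn 0x1000
	 * covers pfns 0x1000..0x1007, so its 8 struct pages are reserved
	 * and the scan resumes at pfn 0x1008.
	 */
	static unsigned long reserve_buddy_sketch(struct page *page,
						  unsigned long pfn,
						  unsigned int order)
	{
		unsigned long i;

		for (i = 0; i < (1UL << order); i++)
			SetPageReserved(page + i);

		return pfn + (1UL << order);	/* next pfn to examine */
	}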
6481 #ifdef CONFIG_MEMORY_FAILURE 6481 #ifdef CONFIG_MEMORY_FAILURE
6482 bool is_free_buddy_page(struct page *page) 6482 bool is_free_buddy_page(struct page *page)
6483 { 6483 {
6484 struct zone *zone = page_zone(page); 6484 struct zone *zone = page_zone(page);
6485 unsigned long pfn = page_to_pfn(page); 6485 unsigned long pfn = page_to_pfn(page);
6486 unsigned long flags; 6486 unsigned long flags;
6487 unsigned int order; 6487 unsigned int order;
6488 6488
6489 spin_lock_irqsave(&zone->lock, flags); 6489 spin_lock_irqsave(&zone->lock, flags);
6490 for (order = 0; order < MAX_ORDER; order++) { 6490 for (order = 0; order < MAX_ORDER; order++) {
6491 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6491 struct page *page_head = page - (pfn & ((1 << order) - 1));
6492 6492
6493 if (PageBuddy(page_head) && page_order(page_head) >= order) 6493 if (PageBuddy(page_head) && page_order(page_head) >= order)
6494 break; 6494 break;
6495 } 6495 }
6496 spin_unlock_irqrestore(&zone->lock, flags); 6496 spin_unlock_irqrestore(&zone->lock, flags);
6497 6497
6498 return order < MAX_ORDER; 6498 return order < MAX_ORDER;
6499 } 6499 }
6500 #endif 6500 #endif
6501 6501
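The loop in is_free_buddy_page() walks up the candidate buddy sizes: at each order it masks the low bits off the PFN to find the would-be buddy head, and the page is free exactly when some head below MAX_ORDER is PageBuddy() with a large enough page_order(). A worked example of the head computation (numbers are illustrative):

	/*
	 * Worked example: for pfn 0x1235 at order 4, pfn & ((1 << 4) - 1) == 5,
	 * so the candidate head sits 5 struct pages earlier, at pfn 0x1230.
	 * If that head is PageBuddy() with page_order() >= 4, then pfn 0x1235
	 * lies inside a free order-4 (or larger) buddy.
	 */
	static struct page *buddy_head_sketch(struct page *page,
					      unsigned long pfn,
					      unsigned int order)
	{
		return page - (pfn & ((1UL << order) - 1));
	}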
6502 static const struct trace_print_flags pageflag_names[] = { 6502 static const struct trace_print_flags pageflag_names[] = {
6503 {1UL << PG_locked, "locked" }, 6503 {1UL << PG_locked, "locked" },
6504 {1UL << PG_error, "error" }, 6504 {1UL << PG_error, "error" },
6505 {1UL << PG_referenced, "referenced" }, 6505 {1UL << PG_referenced, "referenced" },
6506 {1UL << PG_uptodate, "uptodate" }, 6506 {1UL << PG_uptodate, "uptodate" },
6507 {1UL << PG_dirty, "dirty" }, 6507 {1UL << PG_dirty, "dirty" },
6508 {1UL << PG_lru, "lru" }, 6508 {1UL << PG_lru, "lru" },
6509 {1UL << PG_active, "active" }, 6509 {1UL << PG_active, "active" },
6510 {1UL << PG_slab, "slab" }, 6510 {1UL << PG_slab, "slab" },
6511 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6511 {1UL << PG_owner_priv_1, "owner_priv_1" },
6512 {1UL << PG_arch_1, "arch_1" }, 6512 {1UL << PG_arch_1, "arch_1" },
6513 {1UL << PG_reserved, "reserved" }, 6513 {1UL << PG_reserved, "reserved" },
6514 {1UL << PG_private, "private" }, 6514 {1UL << PG_private, "private" },
6515 {1UL << PG_private_2, "private_2" }, 6515 {1UL << PG_private_2, "private_2" },
6516 {1UL << PG_writeback, "writeback" }, 6516 {1UL << PG_writeback, "writeback" },
6517 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6517 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6518 {1UL << PG_head, "head" }, 6518 {1UL << PG_head, "head" },
6519 {1UL << PG_tail, "tail" }, 6519 {1UL << PG_tail, "tail" },
6520 #else 6520 #else
6521 {1UL << PG_compound, "compound" }, 6521 {1UL << PG_compound, "compound" },
6522 #endif 6522 #endif
6523 {1UL << PG_swapcache, "swapcache" }, 6523 {1UL << PG_swapcache, "swapcache" },
6524 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6524 {1UL << PG_mappedtodisk, "mappedtodisk" },
6525 {1UL << PG_reclaim, "reclaim" }, 6525 {1UL << PG_reclaim, "reclaim" },
6526 {1UL << PG_swapbacked, "swapbacked" }, 6526 {1UL << PG_swapbacked, "swapbacked" },
6527 {1UL << PG_unevictable, "unevictable" }, 6527 {1UL << PG_unevictable, "unevictable" },
6528 #ifdef CONFIG_MMU 6528 #ifdef CONFIG_MMU
6529 {1UL << PG_mlocked, "mlocked" }, 6529 {1UL << PG_mlocked, "mlocked" },
6530 #endif 6530 #endif
6531 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6531 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6532 {1UL << PG_uncached, "uncached" }, 6532 {1UL << PG_uncached, "uncached" },
6533 #endif 6533 #endif
6534 #ifdef CONFIG_MEMORY_FAILURE 6534 #ifdef CONFIG_MEMORY_FAILURE
6535 {1UL << PG_hwpoison, "hwpoison" }, 6535 {1UL << PG_hwpoison, "hwpoison" },
6536 #endif 6536 #endif
6537 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6537 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6538 {1UL << PG_compound_lock, "compound_lock" }, 6538 {1UL << PG_compound_lock, "compound_lock" },
6539 #endif 6539 #endif
6540 }; 6540 };
6541 6541
6542 static void dump_page_flags(unsigned long flags) 6542 static void dump_page_flags(unsigned long flags)
6543 { 6543 {
6544 const char *delim = ""; 6544 const char *delim = "";
6545 unsigned long mask; 6545 unsigned long mask;
6546 int i; 6546 int i;
6547 6547
6548 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6548 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6549 6549
6550 printk(KERN_ALERT "page flags: %#lx(", flags); 6550 printk(KERN_ALERT "page flags: %#lx(", flags);
6551 6551
6552 /* remove zone id */ 6552 /* remove zone id */
6553 flags &= (1UL << NR_PAGEFLAGS) - 1; 6553 flags &= (1UL << NR_PAGEFLAGS) - 1;
6554 6554
6555 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6555 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6556 6556
6557 mask = pageflag_names[i].mask; 6557 mask = pageflag_names[i].mask;
6558 if ((flags & mask) != mask) 6558 if ((flags & mask) != mask)
6559 continue; 6559 continue;
6560 6560
6561 flags &= ~mask; 6561 flags &= ~mask;
6562 printk("%s%s", delim, pageflag_names[i].name); 6562 printk("%s%s", delim, pageflag_names[i].name);
6563 delim = "|"; 6563 delim = "|";
6564 } 6564 }
6565 6565
6566 /* check for left over flags */ 6566 /* check for left over flags */
6567 if (flags) 6567 if (flags)
6568 printk("%s%#lx", delim, flags); 6568 printk("%s%#lx", delim, flags);
6569 6569
6570 printk(")\n"); 6570 printk(")\n");
6571 } 6571 }
6572 6572
6573 void dump_page(struct page *page) 6573 void dump_page(struct page *page)
6574 { 6574 {
6575 printk(KERN_ALERT 6575 printk(KERN_ALERT
6576 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6576 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6577 page, atomic_read(&page->_count), page_mapcount(page), 6577 page, atomic_read(&page->_count), page_mapcount(page),
6578 page->mapping, page->index); 6578 page->mapping, page->index);
6579 dump_page_flags(page->flags); 6579 dump_page_flags(page->flags);
6580 mem_cgroup_print_bad_page(page); 6580 mem_cgroup_print_bad_page(page);
6581 } 6581 }
6582 6582
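dump_page() is the usual entry point for ad-hoc page debugging: it prints the refcount, mapcount, mapping and index, then decodes page->flags through pageflag_names, emitting the recognised bits separated by '|' and any leftover bits as a raw hex value. A hypothetical debug check using it (the helper and the condition are made up for illustration; only dump_page() itself comes from the code above):

	/*
	 * Hypothetical debugging helper: complain loudly about a page that
	 * is unexpectedly still mapped, and dump its state for inspection.
	 */
	static void check_page_unmapped_sketch(struct page *page)
	{
		if (page_mapcount(page)) {
			pr_alert("unexpectedly mapped page:\n");
			dump_page(page);
		}
	}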