Commit f161eedc71da293a9bcfcf3d7f6c1da070a61ef0

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent 3e7379c0f4

mm: page_alloc: lookup pageblock migratetype with IRQs enabled during free

commit cfc47a2803db42140167b92d991ef04018e162c7 upstream.

get_pageblock_migratetype() is called during free with IRQs disabled.
This is unnecessary and disables IRQs for longer than necessary.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 1 changed file with 1 addition and 1 deletion

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 #define MIN_PERCPU_PAGELIST_FRACTION (8) 72 #define MIN_PERCPU_PAGELIST_FRACTION (8)
73 73
74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
75 DEFINE_PER_CPU(int, numa_node); 75 DEFINE_PER_CPU(int, numa_node);
76 EXPORT_PER_CPU_SYMBOL(numa_node); 76 EXPORT_PER_CPU_SYMBOL(numa_node);
77 #endif 77 #endif
78 78
79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
80 /* 80 /*
81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
84 * defined in <linux/topology.h>. 84 * defined in <linux/topology.h>.
85 */ 85 */
86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 87 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
88 #endif 88 #endif
89 89
90 /* 90 /*
91 * Array of node states. 91 * Array of node states.
92 */ 92 */
93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
94 [N_POSSIBLE] = NODE_MASK_ALL, 94 [N_POSSIBLE] = NODE_MASK_ALL,
95 [N_ONLINE] = { { [0] = 1UL } }, 95 [N_ONLINE] = { { [0] = 1UL } },
96 #ifndef CONFIG_NUMA 96 #ifndef CONFIG_NUMA
97 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 97 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
98 #ifdef CONFIG_HIGHMEM 98 #ifdef CONFIG_HIGHMEM
99 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 99 [N_HIGH_MEMORY] = { { [0] = 1UL } },
100 #endif 100 #endif
101 #ifdef CONFIG_MOVABLE_NODE 101 #ifdef CONFIG_MOVABLE_NODE
102 [N_MEMORY] = { { [0] = 1UL } }, 102 [N_MEMORY] = { { [0] = 1UL } },
103 #endif 103 #endif
104 [N_CPU] = { { [0] = 1UL } }, 104 [N_CPU] = { { [0] = 1UL } },
105 #endif /* NUMA */ 105 #endif /* NUMA */
106 }; 106 };
107 EXPORT_SYMBOL(node_states); 107 EXPORT_SYMBOL(node_states);
108 108
109 /* Protect totalram_pages and zone->managed_pages */ 109 /* Protect totalram_pages and zone->managed_pages */
110 static DEFINE_SPINLOCK(managed_page_count_lock); 110 static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112 unsigned long totalram_pages __read_mostly; 112 unsigned long totalram_pages __read_mostly;
113 unsigned long totalreserve_pages __read_mostly; 113 unsigned long totalreserve_pages __read_mostly;
114 /* 114 /*
115 * When calculating the number of globally allowed dirty pages, there 115 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 116 * is a certain number of per-zone reserves that should not be
117 * considered dirtyable memory. This is the sum of those reserves 117 * considered dirtyable memory. This is the sum of those reserves
118 * over all existing zones that contribute dirtyable memory. 118 * over all existing zones that contribute dirtyable memory.
119 */ 119 */
120 unsigned long dirty_balance_reserve __read_mostly; 120 unsigned long dirty_balance_reserve __read_mostly;
121 121
122 int percpu_pagelist_fraction; 122 int percpu_pagelist_fraction;
123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
124 124
125 #ifdef CONFIG_PM_SLEEP 125 #ifdef CONFIG_PM_SLEEP
126 /* 126 /*
127 * The following functions are used by the suspend/hibernate code to temporarily 127 * The following functions are used by the suspend/hibernate code to temporarily
128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
129 * while devices are suspended. To avoid races with the suspend/hibernate code, 129 * while devices are suspended. To avoid races with the suspend/hibernate code,
130 * they should always be called with pm_mutex held (gfp_allowed_mask also should 130 * they should always be called with pm_mutex held (gfp_allowed_mask also should
131 * only be modified with pm_mutex held, unless the suspend/hibernate code is 131 * only be modified with pm_mutex held, unless the suspend/hibernate code is
132 * guaranteed not to run in parallel with that modification). 132 * guaranteed not to run in parallel with that modification).
133 */ 133 */
134 134
135 static gfp_t saved_gfp_mask; 135 static gfp_t saved_gfp_mask;
136 136
137 void pm_restore_gfp_mask(void) 137 void pm_restore_gfp_mask(void)
138 { 138 {
139 WARN_ON(!mutex_is_locked(&pm_mutex)); 139 WARN_ON(!mutex_is_locked(&pm_mutex));
140 if (saved_gfp_mask) { 140 if (saved_gfp_mask) {
141 gfp_allowed_mask = saved_gfp_mask; 141 gfp_allowed_mask = saved_gfp_mask;
142 saved_gfp_mask = 0; 142 saved_gfp_mask = 0;
143 } 143 }
144 } 144 }
145 145
146 void pm_restrict_gfp_mask(void) 146 void pm_restrict_gfp_mask(void)
147 { 147 {
148 WARN_ON(!mutex_is_locked(&pm_mutex)); 148 WARN_ON(!mutex_is_locked(&pm_mutex));
149 WARN_ON(saved_gfp_mask); 149 WARN_ON(saved_gfp_mask);
150 saved_gfp_mask = gfp_allowed_mask; 150 saved_gfp_mask = gfp_allowed_mask;
151 gfp_allowed_mask &= ~GFP_IOFS; 151 gfp_allowed_mask &= ~GFP_IOFS;
152 } 152 }
153 153
154 bool pm_suspended_storage(void) 154 bool pm_suspended_storage(void)
155 { 155 {
156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
157 return false; 157 return false;
158 return true; 158 return true;
159 } 159 }
160 #endif /* CONFIG_PM_SLEEP */ 160 #endif /* CONFIG_PM_SLEEP */
161 161
162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
163 int pageblock_order __read_mostly; 163 int pageblock_order __read_mostly;
164 #endif 164 #endif
165 165
166 static void __free_pages_ok(struct page *page, unsigned int order); 166 static void __free_pages_ok(struct page *page, unsigned int order);
167 167
168 /* 168 /*
169 * results with 256, 32 in the lowmem_reserve sysctl: 169 * results with 256, 32 in the lowmem_reserve sysctl:
170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
171 * 1G machine -> (16M dma, 784M normal, 224M high) 171 * 1G machine -> (16M dma, 784M normal, 224M high)
172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
175 * 175 *
176 * TBD: should special case ZONE_DMA32 machines here - in those we normally 176 * TBD: should special case ZONE_DMA32 machines here - in those we normally
177 * don't need any ZONE_NORMAL reservation 177 * don't need any ZONE_NORMAL reservation
178 */ 178 */
179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
180 #ifdef CONFIG_ZONE_DMA 180 #ifdef CONFIG_ZONE_DMA
181 256, 181 256,
182 #endif 182 #endif
183 #ifdef CONFIG_ZONE_DMA32 183 #ifdef CONFIG_ZONE_DMA32
184 256, 184 256,
185 #endif 185 #endif
186 #ifdef CONFIG_HIGHMEM 186 #ifdef CONFIG_HIGHMEM
187 32, 187 32,
188 #endif 188 #endif
189 32, 189 32,
190 }; 190 };
191 191
192 EXPORT_SYMBOL(totalram_pages); 192 EXPORT_SYMBOL(totalram_pages);
193 193
194 static char * const zone_names[MAX_NR_ZONES] = { 194 static char * const zone_names[MAX_NR_ZONES] = {
195 #ifdef CONFIG_ZONE_DMA 195 #ifdef CONFIG_ZONE_DMA
196 "DMA", 196 "DMA",
197 #endif 197 #endif
198 #ifdef CONFIG_ZONE_DMA32 198 #ifdef CONFIG_ZONE_DMA32
199 "DMA32", 199 "DMA32",
200 #endif 200 #endif
201 "Normal", 201 "Normal",
202 #ifdef CONFIG_HIGHMEM 202 #ifdef CONFIG_HIGHMEM
203 "HighMem", 203 "HighMem",
204 #endif 204 #endif
205 "Movable", 205 "Movable",
206 }; 206 };
207 207
208 int min_free_kbytes = 1024; 208 int min_free_kbytes = 1024;
209 int user_min_free_kbytes; 209 int user_min_free_kbytes;
210 210
211 static unsigned long __meminitdata nr_kernel_pages; 211 static unsigned long __meminitdata nr_kernel_pages;
212 static unsigned long __meminitdata nr_all_pages; 212 static unsigned long __meminitdata nr_all_pages;
213 static unsigned long __meminitdata dma_reserve; 213 static unsigned long __meminitdata dma_reserve;
214 214
215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
218 static unsigned long __initdata required_kernelcore; 218 static unsigned long __initdata required_kernelcore;
219 static unsigned long __initdata required_movablecore; 219 static unsigned long __initdata required_movablecore;
220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
221 221
222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
223 int movable_zone; 223 int movable_zone;
224 EXPORT_SYMBOL(movable_zone); 224 EXPORT_SYMBOL(movable_zone);
225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
226 226
227 #if MAX_NUMNODES > 1 227 #if MAX_NUMNODES > 1
228 int nr_node_ids __read_mostly = MAX_NUMNODES; 228 int nr_node_ids __read_mostly = MAX_NUMNODES;
229 int nr_online_nodes __read_mostly = 1; 229 int nr_online_nodes __read_mostly = 1;
230 EXPORT_SYMBOL(nr_node_ids); 230 EXPORT_SYMBOL(nr_node_ids);
231 EXPORT_SYMBOL(nr_online_nodes); 231 EXPORT_SYMBOL(nr_online_nodes);
232 #endif 232 #endif
233 233
234 int page_group_by_mobility_disabled __read_mostly; 234 int page_group_by_mobility_disabled __read_mostly;
235 235
236 void set_pageblock_migratetype(struct page *page, int migratetype) 236 void set_pageblock_migratetype(struct page *page, int migratetype)
237 { 237 {
238 238
239 if (unlikely(page_group_by_mobility_disabled)) 239 if (unlikely(page_group_by_mobility_disabled))
240 migratetype = MIGRATE_UNMOVABLE; 240 migratetype = MIGRATE_UNMOVABLE;
241 241
242 set_pageblock_flags_group(page, (unsigned long)migratetype, 242 set_pageblock_flags_group(page, (unsigned long)migratetype,
243 PB_migrate, PB_migrate_end); 243 PB_migrate, PB_migrate_end);
244 } 244 }
245 245
246 bool oom_killer_disabled __read_mostly; 246 bool oom_killer_disabled __read_mostly;
247 247
248 #ifdef CONFIG_DEBUG_VM 248 #ifdef CONFIG_DEBUG_VM
249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
250 { 250 {
251 int ret = 0; 251 int ret = 0;
252 unsigned seq; 252 unsigned seq;
253 unsigned long pfn = page_to_pfn(page); 253 unsigned long pfn = page_to_pfn(page);
254 unsigned long sp, start_pfn; 254 unsigned long sp, start_pfn;
255 255
256 do { 256 do {
257 seq = zone_span_seqbegin(zone); 257 seq = zone_span_seqbegin(zone);
258 start_pfn = zone->zone_start_pfn; 258 start_pfn = zone->zone_start_pfn;
259 sp = zone->spanned_pages; 259 sp = zone->spanned_pages;
260 if (!zone_spans_pfn(zone, pfn)) 260 if (!zone_spans_pfn(zone, pfn))
261 ret = 1; 261 ret = 1;
262 } while (zone_span_seqretry(zone, seq)); 262 } while (zone_span_seqretry(zone, seq));
263 263
264 if (ret) 264 if (ret)
265 pr_err("page %lu outside zone [ %lu - %lu ]\n", 265 pr_err("page %lu outside zone [ %lu - %lu ]\n",
266 pfn, start_pfn, start_pfn + sp); 266 pfn, start_pfn, start_pfn + sp);
267 267
268 return ret; 268 return ret;
269 } 269 }
270 270
271 static int page_is_consistent(struct zone *zone, struct page *page) 271 static int page_is_consistent(struct zone *zone, struct page *page)
272 { 272 {
273 if (!pfn_valid_within(page_to_pfn(page))) 273 if (!pfn_valid_within(page_to_pfn(page)))
274 return 0; 274 return 0;
275 if (zone != page_zone(page)) 275 if (zone != page_zone(page))
276 return 0; 276 return 0;
277 277
278 return 1; 278 return 1;
279 } 279 }
280 /* 280 /*
281 * Temporary debugging check for pages not lying within a given zone. 281 * Temporary debugging check for pages not lying within a given zone.
282 */ 282 */
283 static int bad_range(struct zone *zone, struct page *page) 283 static int bad_range(struct zone *zone, struct page *page)
284 { 284 {
285 if (page_outside_zone_boundaries(zone, page)) 285 if (page_outside_zone_boundaries(zone, page))
286 return 1; 286 return 1;
287 if (!page_is_consistent(zone, page)) 287 if (!page_is_consistent(zone, page))
288 return 1; 288 return 1;
289 289
290 return 0; 290 return 0;
291 } 291 }
292 #else 292 #else
293 static inline int bad_range(struct zone *zone, struct page *page) 293 static inline int bad_range(struct zone *zone, struct page *page)
294 { 294 {
295 return 0; 295 return 0;
296 } 296 }
297 #endif 297 #endif
298 298
299 static void bad_page(struct page *page) 299 static void bad_page(struct page *page)
300 { 300 {
301 static unsigned long resume; 301 static unsigned long resume;
302 static unsigned long nr_shown; 302 static unsigned long nr_shown;
303 static unsigned long nr_unshown; 303 static unsigned long nr_unshown;
304 304
305 /* Don't complain about poisoned pages */ 305 /* Don't complain about poisoned pages */
306 if (PageHWPoison(page)) { 306 if (PageHWPoison(page)) {
307 page_mapcount_reset(page); /* remove PageBuddy */ 307 page_mapcount_reset(page); /* remove PageBuddy */
308 return; 308 return;
309 } 309 }
310 310
311 /* 311 /*
312 * Allow a burst of 60 reports, then keep quiet for that minute; 312 * Allow a burst of 60 reports, then keep quiet for that minute;
313 * or allow a steady drip of one report per second. 313 * or allow a steady drip of one report per second.
314 */ 314 */
315 if (nr_shown == 60) { 315 if (nr_shown == 60) {
316 if (time_before(jiffies, resume)) { 316 if (time_before(jiffies, resume)) {
317 nr_unshown++; 317 nr_unshown++;
318 goto out; 318 goto out;
319 } 319 }
320 if (nr_unshown) { 320 if (nr_unshown) {
321 printk(KERN_ALERT 321 printk(KERN_ALERT
322 "BUG: Bad page state: %lu messages suppressed\n", 322 "BUG: Bad page state: %lu messages suppressed\n",
323 nr_unshown); 323 nr_unshown);
324 nr_unshown = 0; 324 nr_unshown = 0;
325 } 325 }
326 nr_shown = 0; 326 nr_shown = 0;
327 } 327 }
328 if (nr_shown++ == 0) 328 if (nr_shown++ == 0)
329 resume = jiffies + 60 * HZ; 329 resume = jiffies + 60 * HZ;
330 330
331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
332 current->comm, page_to_pfn(page)); 332 current->comm, page_to_pfn(page));
333 dump_page(page); 333 dump_page(page);
334 334
335 print_modules(); 335 print_modules();
336 dump_stack(); 336 dump_stack();
337 out: 337 out:
338 /* Leave bad fields for debug, except PageBuddy could make trouble */ 338 /* Leave bad fields for debug, except PageBuddy could make trouble */
339 page_mapcount_reset(page); /* remove PageBuddy */ 339 page_mapcount_reset(page); /* remove PageBuddy */
340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
341 } 341 }
342 342
343 /* 343 /*
344 * Higher-order pages are called "compound pages". They are structured thusly: 344 * Higher-order pages are called "compound pages". They are structured thusly:
345 * 345 *
346 * The first PAGE_SIZE page is called the "head page". 346 * The first PAGE_SIZE page is called the "head page".
347 * 347 *
348 * The remaining PAGE_SIZE pages are called "tail pages". 348 * The remaining PAGE_SIZE pages are called "tail pages".
349 * 349 *
350 * All pages have PG_compound set. All tail pages have their ->first_page 350 * All pages have PG_compound set. All tail pages have their ->first_page
351 * pointing at the head page. 351 * pointing at the head page.
352 * 352 *
353 * The first tail page's ->lru.next holds the address of the compound page's 353 * The first tail page's ->lru.next holds the address of the compound page's
354 * put_page() function. Its ->lru.prev holds the order of allocation. 354 * put_page() function. Its ->lru.prev holds the order of allocation.
355 * This usage means that zero-order pages may not be compound. 355 * This usage means that zero-order pages may not be compound.
356 */ 356 */
357 357
358 static void free_compound_page(struct page *page) 358 static void free_compound_page(struct page *page)
359 { 359 {
360 __free_pages_ok(page, compound_order(page)); 360 __free_pages_ok(page, compound_order(page));
361 } 361 }
362 362
363 void prep_compound_page(struct page *page, unsigned long order) 363 void prep_compound_page(struct page *page, unsigned long order)
364 { 364 {
365 int i; 365 int i;
366 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
367 367
368 set_compound_page_dtor(page, free_compound_page); 368 set_compound_page_dtor(page, free_compound_page);
369 set_compound_order(page, order); 369 set_compound_order(page, order);
370 __SetPageHead(page); 370 __SetPageHead(page);
371 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
372 struct page *p = page + i; 372 struct page *p = page + i;
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 /* Make sure p->first_page is always valid for PageTail() */ 375 /* Make sure p->first_page is always valid for PageTail() */
376 smp_wmb(); 376 smp_wmb();
377 __SetPageTail(p); 377 __SetPageTail(p);
378 } 378 }
379 } 379 }
380 380
381 /* update __split_huge_page_refcount if you change this function */ 381 /* update __split_huge_page_refcount if you change this function */
382 static int destroy_compound_page(struct page *page, unsigned long order) 382 static int destroy_compound_page(struct page *page, unsigned long order)
383 { 383 {
384 int i; 384 int i;
385 int nr_pages = 1 << order; 385 int nr_pages = 1 << order;
386 int bad = 0; 386 int bad = 0;
387 387
388 if (unlikely(compound_order(page) != order)) { 388 if (unlikely(compound_order(page) != order)) {
389 bad_page(page); 389 bad_page(page);
390 bad++; 390 bad++;
391 } 391 }
392 392
393 __ClearPageHead(page); 393 __ClearPageHead(page);
394 394
395 for (i = 1; i < nr_pages; i++) { 395 for (i = 1; i < nr_pages; i++) {
396 struct page *p = page + i; 396 struct page *p = page + i;
397 397
398 if (unlikely(!PageTail(p) || (p->first_page != page))) { 398 if (unlikely(!PageTail(p) || (p->first_page != page))) {
399 bad_page(page); 399 bad_page(page);
400 bad++; 400 bad++;
401 } 401 }
402 __ClearPageTail(p); 402 __ClearPageTail(p);
403 } 403 }
404 404
405 return bad; 405 return bad;
406 } 406 }
407 407
408 static inline void prep_zero_page(struct page *page, unsigned int order, 408 static inline void prep_zero_page(struct page *page, unsigned int order,
409 gfp_t gfp_flags) 409 gfp_t gfp_flags)
410 { 410 {
411 int i; 411 int i;
412 412
413 /* 413 /*
414 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 414 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
415 * and __GFP_HIGHMEM from hard or soft interrupt context. 415 * and __GFP_HIGHMEM from hard or soft interrupt context.
416 */ 416 */
417 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 417 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
418 for (i = 0; i < (1 << order); i++) 418 for (i = 0; i < (1 << order); i++)
419 clear_highpage(page + i); 419 clear_highpage(page + i);
420 } 420 }
421 421
422 #ifdef CONFIG_DEBUG_PAGEALLOC 422 #ifdef CONFIG_DEBUG_PAGEALLOC
423 unsigned int _debug_guardpage_minorder; 423 unsigned int _debug_guardpage_minorder;
424 424
425 static int __init debug_guardpage_minorder_setup(char *buf) 425 static int __init debug_guardpage_minorder_setup(char *buf)
426 { 426 {
427 unsigned long res; 427 unsigned long res;
428 428
429 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 429 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
430 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 430 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
431 return 0; 431 return 0;
432 } 432 }
433 _debug_guardpage_minorder = res; 433 _debug_guardpage_minorder = res;
434 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 434 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
435 return 0; 435 return 0;
436 } 436 }
437 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 437 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
438 438
439 static inline void set_page_guard_flag(struct page *page) 439 static inline void set_page_guard_flag(struct page *page)
440 { 440 {
441 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 441 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
442 } 442 }
443 443
444 static inline void clear_page_guard_flag(struct page *page) 444 static inline void clear_page_guard_flag(struct page *page)
445 { 445 {
446 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 446 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
447 } 447 }
448 #else 448 #else
449 static inline void set_page_guard_flag(struct page *page) { } 449 static inline void set_page_guard_flag(struct page *page) { }
450 static inline void clear_page_guard_flag(struct page *page) { } 450 static inline void clear_page_guard_flag(struct page *page) { }
451 #endif 451 #endif
452 452
453 static inline void set_page_order(struct page *page, unsigned int order) 453 static inline void set_page_order(struct page *page, unsigned int order)
454 { 454 {
455 set_page_private(page, order); 455 set_page_private(page, order);
456 __SetPageBuddy(page); 456 __SetPageBuddy(page);
457 } 457 }
458 458
459 static inline void rmv_page_order(struct page *page) 459 static inline void rmv_page_order(struct page *page)
460 { 460 {
461 __ClearPageBuddy(page); 461 __ClearPageBuddy(page);
462 set_page_private(page, 0); 462 set_page_private(page, 0);
463 } 463 }
464 464
465 /* 465 /*
466 * Locate the struct page for both the matching buddy in our 466 * Locate the struct page for both the matching buddy in our
467 * pair (buddy1) and the combined O(n+1) page they form (page). 467 * pair (buddy1) and the combined O(n+1) page they form (page).
468 * 468 *
469 * 1) Any buddy B1 will have an order O twin B2 which satisfies 469 * 1) Any buddy B1 will have an order O twin B2 which satisfies
470 * the following equation: 470 * the following equation:
471 * B2 = B1 ^ (1 << O) 471 * B2 = B1 ^ (1 << O)
472 * For example, if the starting buddy (buddy2) is #8 its order 472 * For example, if the starting buddy (buddy2) is #8 its order
473 * 1 buddy is #10: 473 * 1 buddy is #10:
474 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 474 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
475 * 475 *
476 * 2) Any buddy B will have an order O+1 parent P which 476 * 2) Any buddy B will have an order O+1 parent P which
477 * satisfies the following equation: 477 * satisfies the following equation:
478 * P = B & ~(1 << O) 478 * P = B & ~(1 << O)
479 * 479 *
480 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 480 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
481 */ 481 */
482 static inline unsigned long 482 static inline unsigned long
483 __find_buddy_index(unsigned long page_idx, unsigned int order) 483 __find_buddy_index(unsigned long page_idx, unsigned int order)
484 { 484 {
485 return page_idx ^ (1 << order); 485 return page_idx ^ (1 << order);
486 } 486 }
487 487
488 /* 488 /*
489 * This function checks whether a page is free && is the buddy 489 * This function checks whether a page is free && is the buddy
490 * we can do coalesce a page and its buddy if 490 * we can do coalesce a page and its buddy if
491 * (a) the buddy is not in a hole && 491 * (a) the buddy is not in a hole &&
492 * (b) the buddy is in the buddy system && 492 * (b) the buddy is in the buddy system &&
493 * (c) a page and its buddy have the same order && 493 * (c) a page and its buddy have the same order &&
494 * (d) a page and its buddy are in the same zone. 494 * (d) a page and its buddy are in the same zone.
495 * 495 *
496 * For recording whether a page is in the buddy system, we set ->_mapcount 496 * For recording whether a page is in the buddy system, we set ->_mapcount
497 * PAGE_BUDDY_MAPCOUNT_VALUE. 497 * PAGE_BUDDY_MAPCOUNT_VALUE.
498 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 498 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
499 * serialized by zone->lock. 499 * serialized by zone->lock.
500 * 500 *
501 * For recording page's order, we use page_private(page). 501 * For recording page's order, we use page_private(page).
502 */ 502 */
503 static inline int page_is_buddy(struct page *page, struct page *buddy, 503 static inline int page_is_buddy(struct page *page, struct page *buddy,
504 unsigned int order) 504 unsigned int order)
505 { 505 {
506 if (!pfn_valid_within(page_to_pfn(buddy))) 506 if (!pfn_valid_within(page_to_pfn(buddy)))
507 return 0; 507 return 0;
508 508
509 if (page_is_guard(buddy) && page_order(buddy) == order) { 509 if (page_is_guard(buddy) && page_order(buddy) == order) {
510 VM_BUG_ON(page_count(buddy) != 0); 510 VM_BUG_ON(page_count(buddy) != 0);
511 511
512 if (page_zone_id(page) != page_zone_id(buddy)) 512 if (page_zone_id(page) != page_zone_id(buddy))
513 return 0; 513 return 0;
514 514
515 return 1; 515 return 1;
516 } 516 }
517 517
518 if (PageBuddy(buddy) && page_order(buddy) == order) { 518 if (PageBuddy(buddy) && page_order(buddy) == order) {
519 VM_BUG_ON(page_count(buddy) != 0); 519 VM_BUG_ON(page_count(buddy) != 0);
520 520
521 /* 521 /*
522 * zone check is done late to avoid uselessly 522 * zone check is done late to avoid uselessly
523 * calculating zone/node ids for pages that could 523 * calculating zone/node ids for pages that could
524 * never merge. 524 * never merge.
525 */ 525 */
526 if (page_zone_id(page) != page_zone_id(buddy)) 526 if (page_zone_id(page) != page_zone_id(buddy))
527 return 0; 527 return 0;
528 528
529 return 1; 529 return 1;
530 } 530 }
531 return 0; 531 return 0;
532 } 532 }
533 533
534 /* 534 /*
535 * Freeing function for a buddy system allocator. 535 * Freeing function for a buddy system allocator.
536 * 536 *
537 * The concept of a buddy system is to maintain direct-mapped table 537 * The concept of a buddy system is to maintain direct-mapped table
538 * (containing bit values) for memory blocks of various "orders". 538 * (containing bit values) for memory blocks of various "orders".
539 * The bottom level table contains the map for the smallest allocatable 539 * The bottom level table contains the map for the smallest allocatable
540 * units of memory (here, pages), and each level above it describes 540 * units of memory (here, pages), and each level above it describes
541 * pairs of units from the levels below, hence, "buddies". 541 * pairs of units from the levels below, hence, "buddies".
542 * At a high level, all that happens here is marking the table entry 542 * At a high level, all that happens here is marking the table entry
543 * at the bottom level available, and propagating the changes upward 543 * at the bottom level available, and propagating the changes upward
544 * as necessary, plus some accounting needed to play nicely with other 544 * as necessary, plus some accounting needed to play nicely with other
545 * parts of the VM system. 545 * parts of the VM system.
546 * At each level, we keep a list of pages, which are heads of continuous 546 * At each level, we keep a list of pages, which are heads of continuous
547 * free pages of length of (1 << order) and marked with _mapcount 547 * free pages of length of (1 << order) and marked with _mapcount
548 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 548 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
549 * field. 549 * field.
550 * So when we are allocating or freeing one, we can derive the state of the 550 * So when we are allocating or freeing one, we can derive the state of the
551 * other. That is, if we allocate a small block, and both were 551 * other. That is, if we allocate a small block, and both were
552 * free, the remainder of the region must be split into blocks. 552 * free, the remainder of the region must be split into blocks.
553 * If a block is freed, and its buddy is also free, then this 553 * If a block is freed, and its buddy is also free, then this
554 * triggers coalescing into a block of larger size. 554 * triggers coalescing into a block of larger size.
555 * 555 *
556 * -- nyc 556 * -- nyc
557 */ 557 */
558 558
559 static inline void __free_one_page(struct page *page, 559 static inline void __free_one_page(struct page *page,
560 unsigned long pfn, 560 unsigned long pfn,
561 struct zone *zone, unsigned int order, 561 struct zone *zone, unsigned int order,
562 int migratetype) 562 int migratetype)
563 { 563 {
564 unsigned long page_idx; 564 unsigned long page_idx;
565 unsigned long combined_idx; 565 unsigned long combined_idx;
566 unsigned long uninitialized_var(buddy_idx); 566 unsigned long uninitialized_var(buddy_idx);
567 struct page *buddy; 567 struct page *buddy;
568 568
569 VM_BUG_ON(!zone_is_initialized(zone)); 569 VM_BUG_ON(!zone_is_initialized(zone));
570 570
571 if (unlikely(PageCompound(page))) 571 if (unlikely(PageCompound(page)))
572 if (unlikely(destroy_compound_page(page, order))) 572 if (unlikely(destroy_compound_page(page, order)))
573 return; 573 return;
574 574
575 VM_BUG_ON(migratetype == -1); 575 VM_BUG_ON(migratetype == -1);
576 576
577 page_idx = pfn & ((1 << MAX_ORDER) - 1); 577 page_idx = pfn & ((1 << MAX_ORDER) - 1);
578 578
579 VM_BUG_ON(page_idx & ((1 << order) - 1)); 579 VM_BUG_ON(page_idx & ((1 << order) - 1));
580 VM_BUG_ON(bad_range(zone, page)); 580 VM_BUG_ON(bad_range(zone, page));
581 581
582 while (order < MAX_ORDER-1) { 582 while (order < MAX_ORDER-1) {
583 buddy_idx = __find_buddy_index(page_idx, order); 583 buddy_idx = __find_buddy_index(page_idx, order);
584 buddy = page + (buddy_idx - page_idx); 584 buddy = page + (buddy_idx - page_idx);
585 if (!page_is_buddy(page, buddy, order)) 585 if (!page_is_buddy(page, buddy, order))
586 break; 586 break;
587 /* 587 /*
588 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 588 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
589 * merge with it and move up one order. 589 * merge with it and move up one order.
590 */ 590 */
591 if (page_is_guard(buddy)) { 591 if (page_is_guard(buddy)) {
592 clear_page_guard_flag(buddy); 592 clear_page_guard_flag(buddy);
593 set_page_private(page, 0); 593 set_page_private(page, 0);
594 __mod_zone_freepage_state(zone, 1 << order, 594 __mod_zone_freepage_state(zone, 1 << order,
595 migratetype); 595 migratetype);
596 } else { 596 } else {
597 list_del(&buddy->lru); 597 list_del(&buddy->lru);
598 zone->free_area[order].nr_free--; 598 zone->free_area[order].nr_free--;
599 rmv_page_order(buddy); 599 rmv_page_order(buddy);
600 } 600 }
601 combined_idx = buddy_idx & page_idx; 601 combined_idx = buddy_idx & page_idx;
602 page = page + (combined_idx - page_idx); 602 page = page + (combined_idx - page_idx);
603 page_idx = combined_idx; 603 page_idx = combined_idx;
604 order++; 604 order++;
605 } 605 }
606 set_page_order(page, order); 606 set_page_order(page, order);
607 607
608 /* 608 /*
609 * If this is not the largest possible page, check if the buddy 609 * If this is not the largest possible page, check if the buddy
610 * of the next-highest order is free. If it is, it's possible 610 * of the next-highest order is free. If it is, it's possible
611 * that pages are being freed that will coalesce soon. In case, 611 * that pages are being freed that will coalesce soon. In case,
612 * that is happening, add the free page to the tail of the list 612 * that is happening, add the free page to the tail of the list
613 * so it's less likely to be used soon and more likely to be merged 613 * so it's less likely to be used soon and more likely to be merged
614 * as a higher order page 614 * as a higher order page
615 */ 615 */
616 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 616 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
617 struct page *higher_page, *higher_buddy; 617 struct page *higher_page, *higher_buddy;
618 combined_idx = buddy_idx & page_idx; 618 combined_idx = buddy_idx & page_idx;
619 higher_page = page + (combined_idx - page_idx); 619 higher_page = page + (combined_idx - page_idx);
620 buddy_idx = __find_buddy_index(combined_idx, order + 1); 620 buddy_idx = __find_buddy_index(combined_idx, order + 1);
621 higher_buddy = higher_page + (buddy_idx - combined_idx); 621 higher_buddy = higher_page + (buddy_idx - combined_idx);
622 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 622 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
623 list_add_tail(&page->lru, 623 list_add_tail(&page->lru,
624 &zone->free_area[order].free_list[migratetype]); 624 &zone->free_area[order].free_list[migratetype]);
625 goto out; 625 goto out;
626 } 626 }
627 } 627 }
628 628
629 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 629 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
630 out: 630 out:
631 zone->free_area[order].nr_free++; 631 zone->free_area[order].nr_free++;
632 } 632 }
633 633
634 static inline int free_pages_check(struct page *page) 634 static inline int free_pages_check(struct page *page)
635 { 635 {
636 if (unlikely(page_mapcount(page) | 636 if (unlikely(page_mapcount(page) |
637 (page->mapping != NULL) | 637 (page->mapping != NULL) |
638 (atomic_read(&page->_count) != 0) | 638 (atomic_read(&page->_count) != 0) |
639 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 639 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
640 (mem_cgroup_bad_page_check(page)))) { 640 (mem_cgroup_bad_page_check(page)))) {
641 bad_page(page); 641 bad_page(page);
642 return 1; 642 return 1;
643 } 643 }
644 page_nid_reset_last(page); 644 page_nid_reset_last(page);
645 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 645 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
646 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 646 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
647 return 0; 647 return 0;
648 } 648 }
649 649
650 /* 650 /*
651 * Frees a number of pages from the PCP lists 651 * Frees a number of pages from the PCP lists
652 * Assumes all pages on list are in same zone, and of same order. 652 * Assumes all pages on list are in same zone, and of same order.
653 * count is the number of pages to free. 653 * count is the number of pages to free.
654 * 654 *
655 * If the zone was previously in an "all pages pinned" state then look to 655 * If the zone was previously in an "all pages pinned" state then look to
656 * see if this freeing clears that state. 656 * see if this freeing clears that state.
657 * 657 *
658 * And clear the zone's pages_scanned counter, to hold off the "all pages are 658 * And clear the zone's pages_scanned counter, to hold off the "all pages are
659 * pinned" detection logic. 659 * pinned" detection logic.
660 */ 660 */
661 static void free_pcppages_bulk(struct zone *zone, int count, 661 static void free_pcppages_bulk(struct zone *zone, int count,
662 struct per_cpu_pages *pcp) 662 struct per_cpu_pages *pcp)
663 { 663 {
664 int migratetype = 0; 664 int migratetype = 0;
665 int batch_free = 0; 665 int batch_free = 0;
666 int to_free = count; 666 int to_free = count;
667 667
668 spin_lock(&zone->lock); 668 spin_lock(&zone->lock);
669 zone->pages_scanned = 0; 669 zone->pages_scanned = 0;
670 670
671 while (to_free) { 671 while (to_free) {
672 struct page *page; 672 struct page *page;
673 struct list_head *list; 673 struct list_head *list;
674 674
675 /* 675 /*
676 * Remove pages from lists in a round-robin fashion. A 676 * Remove pages from lists in a round-robin fashion. A
677 * batch_free count is maintained that is incremented when an 677 * batch_free count is maintained that is incremented when an
678 * empty list is encountered. This is so more pages are freed 678 * empty list is encountered. This is so more pages are freed
679 * off fuller lists instead of spinning excessively around empty 679 * off fuller lists instead of spinning excessively around empty
680 * lists 680 * lists
681 */ 681 */
682 do { 682 do {
683 batch_free++; 683 batch_free++;
684 if (++migratetype == MIGRATE_PCPTYPES) 684 if (++migratetype == MIGRATE_PCPTYPES)
685 migratetype = 0; 685 migratetype = 0;
686 list = &pcp->lists[migratetype]; 686 list = &pcp->lists[migratetype];
687 } while (list_empty(list)); 687 } while (list_empty(list));
688 688
689 /* This is the only non-empty list. Free them all. */ 689 /* This is the only non-empty list. Free them all. */
690 if (batch_free == MIGRATE_PCPTYPES) 690 if (batch_free == MIGRATE_PCPTYPES)
691 batch_free = to_free; 691 batch_free = to_free;
692 692
693 do { 693 do {
694 int mt; /* migratetype of the to-be-freed page */ 694 int mt; /* migratetype of the to-be-freed page */
695 695
696 page = list_entry(list->prev, struct page, lru); 696 page = list_entry(list->prev, struct page, lru);
697 /* must delete as __free_one_page list manipulates */ 697 /* must delete as __free_one_page list manipulates */
698 list_del(&page->lru); 698 list_del(&page->lru);
699 mt = get_freepage_migratetype(page); 699 mt = get_freepage_migratetype(page);
700 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 700 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
701 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 701 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
702 trace_mm_page_pcpu_drain(page, 0, mt); 702 trace_mm_page_pcpu_drain(page, 0, mt);
703 if (likely(!is_migrate_isolate_page(page))) { 703 if (likely(!is_migrate_isolate_page(page))) {
704 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 704 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
705 if (is_migrate_cma(mt)) 705 if (is_migrate_cma(mt))
706 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 706 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
707 } 707 }
708 } while (--to_free && --batch_free && !list_empty(list)); 708 } while (--to_free && --batch_free && !list_empty(list));
709 } 709 }
710 spin_unlock(&zone->lock); 710 spin_unlock(&zone->lock);
711 } 711 }
712 712
713 static void free_one_page(struct zone *zone, 713 static void free_one_page(struct zone *zone,
714 struct page *page, unsigned long pfn, 714 struct page *page, unsigned long pfn,
715 unsigned int order, 715 unsigned int order,
716 int migratetype) 716 int migratetype)
717 { 717 {
718 spin_lock(&zone->lock); 718 spin_lock(&zone->lock);
719 zone->pages_scanned = 0; 719 zone->pages_scanned = 0;
720 720
721 __free_one_page(page, pfn, zone, order, migratetype); 721 __free_one_page(page, pfn, zone, order, migratetype);
722 if (unlikely(!is_migrate_isolate(migratetype))) 722 if (unlikely(!is_migrate_isolate(migratetype)))
723 __mod_zone_freepage_state(zone, 1 << order, migratetype); 723 __mod_zone_freepage_state(zone, 1 << order, migratetype);
724 spin_unlock(&zone->lock); 724 spin_unlock(&zone->lock);
725 } 725 }
726 726
727 static bool free_pages_prepare(struct page *page, unsigned int order) 727 static bool free_pages_prepare(struct page *page, unsigned int order)
728 { 728 {
729 int i; 729 int i;
730 int bad = 0; 730 int bad = 0;
731 731
732 trace_mm_page_free(page, order); 732 trace_mm_page_free(page, order);
733 kmemcheck_free_shadow(page, order); 733 kmemcheck_free_shadow(page, order);
734 734
735 if (PageAnon(page)) 735 if (PageAnon(page))
736 page->mapping = NULL; 736 page->mapping = NULL;
737 for (i = 0; i < (1 << order); i++) 737 for (i = 0; i < (1 << order); i++)
738 bad += free_pages_check(page + i); 738 bad += free_pages_check(page + i);
739 if (bad) 739 if (bad)
740 return false; 740 return false;
741 741
742 if (!PageHighMem(page)) { 742 if (!PageHighMem(page)) {
743 debug_check_no_locks_freed(page_address(page), 743 debug_check_no_locks_freed(page_address(page),
744 PAGE_SIZE << order); 744 PAGE_SIZE << order);
745 debug_check_no_obj_freed(page_address(page), 745 debug_check_no_obj_freed(page_address(page),
746 PAGE_SIZE << order); 746 PAGE_SIZE << order);
747 } 747 }
748 arch_free_page(page, order); 748 arch_free_page(page, order);
749 kernel_map_pages(page, 1 << order, 0); 749 kernel_map_pages(page, 1 << order, 0);
750 750
751 return true; 751 return true;
752 } 752 }
753 753
754 static void __free_pages_ok(struct page *page, unsigned int order) 754 static void __free_pages_ok(struct page *page, unsigned int order)
755 { 755 {
756 unsigned long flags; 756 unsigned long flags;
757 int migratetype; 757 int migratetype;
758 unsigned long pfn = page_to_pfn(page); 758 unsigned long pfn = page_to_pfn(page);
759 759
760 if (!free_pages_prepare(page, order)) 760 if (!free_pages_prepare(page, order))
761 return; 761 return;
762 762
763 migratetype = get_pfnblock_migratetype(page, pfn);
763 local_irq_save(flags); 764 local_irq_save(flags);
764 __count_vm_events(PGFREE, 1 << order); 765 __count_vm_events(PGFREE, 1 << order);
765 migratetype = get_pfnblock_migratetype(page, pfn);
766 set_freepage_migratetype(page, migratetype); 766 set_freepage_migratetype(page, migratetype);
767 free_one_page(page_zone(page), page, pfn, order, migratetype); 767 free_one_page(page_zone(page), page, pfn, order, migratetype);
768 local_irq_restore(flags); 768 local_irq_restore(flags);
769 } 769 }
770 770
771 void __init __free_pages_bootmem(struct page *page, unsigned int order) 771 void __init __free_pages_bootmem(struct page *page, unsigned int order)
772 { 772 {
773 unsigned int nr_pages = 1 << order; 773 unsigned int nr_pages = 1 << order;
774 struct page *p = page; 774 struct page *p = page;
775 unsigned int loop; 775 unsigned int loop;
776 776
777 prefetchw(p); 777 prefetchw(p);
778 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 778 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
779 prefetchw(p + 1); 779 prefetchw(p + 1);
780 __ClearPageReserved(p); 780 __ClearPageReserved(p);
781 set_page_count(p, 0); 781 set_page_count(p, 0);
782 } 782 }
783 __ClearPageReserved(p); 783 __ClearPageReserved(p);
784 set_page_count(p, 0); 784 set_page_count(p, 0);
785 785
786 page_zone(page)->managed_pages += nr_pages; 786 page_zone(page)->managed_pages += nr_pages;
787 set_page_refcounted(page); 787 set_page_refcounted(page);
788 __free_pages(page, order); 788 __free_pages(page, order);
789 } 789 }
790 790
791 #ifdef CONFIG_CMA 791 #ifdef CONFIG_CMA
792 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 792 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
793 void __init init_cma_reserved_pageblock(struct page *page) 793 void __init init_cma_reserved_pageblock(struct page *page)
794 { 794 {
795 unsigned i = pageblock_nr_pages; 795 unsigned i = pageblock_nr_pages;
796 struct page *p = page; 796 struct page *p = page;
797 797
798 do { 798 do {
799 __ClearPageReserved(p); 799 __ClearPageReserved(p);
800 set_page_count(p, 0); 800 set_page_count(p, 0);
801 } while (++p, --i); 801 } while (++p, --i);
802 802
803 set_pageblock_migratetype(page, MIGRATE_CMA); 803 set_pageblock_migratetype(page, MIGRATE_CMA);
804 804
805 if (pageblock_order >= MAX_ORDER) { 805 if (pageblock_order >= MAX_ORDER) {
806 i = pageblock_nr_pages; 806 i = pageblock_nr_pages;
807 p = page; 807 p = page;
808 do { 808 do {
809 set_page_refcounted(p); 809 set_page_refcounted(p);
810 __free_pages(p, MAX_ORDER - 1); 810 __free_pages(p, MAX_ORDER - 1);
811 p += MAX_ORDER_NR_PAGES; 811 p += MAX_ORDER_NR_PAGES;
812 } while (i -= MAX_ORDER_NR_PAGES); 812 } while (i -= MAX_ORDER_NR_PAGES);
813 } else { 813 } else {
814 set_page_refcounted(page); 814 set_page_refcounted(page);
815 __free_pages(page, pageblock_order); 815 __free_pages(page, pageblock_order);
816 } 816 }
817 817
818 adjust_managed_page_count(page, pageblock_nr_pages); 818 adjust_managed_page_count(page, pageblock_nr_pages);
819 } 819 }
820 #endif 820 #endif
821 821
822 /* 822 /*
823 * The order of subdivision here is critical for the IO subsystem. 823 * The order of subdivision here is critical for the IO subsystem.
824 * Please do not alter this order without good reasons and regression 824 * Please do not alter this order without good reasons and regression
825 * testing. Specifically, as large blocks of memory are subdivided, 825 * testing. Specifically, as large blocks of memory are subdivided,
826 * the order in which smaller blocks are delivered depends on the order 826 * the order in which smaller blocks are delivered depends on the order
827 * they're subdivided in this function. This is the primary factor 827 * they're subdivided in this function. This is the primary factor
828 * influencing the order in which pages are delivered to the IO 828 * influencing the order in which pages are delivered to the IO
829 * subsystem according to empirical testing, and this is also justified 829 * subsystem according to empirical testing, and this is also justified
830 * by considering the behavior of a buddy system containing a single 830 * by considering the behavior of a buddy system containing a single
831 * large block of memory acted on by a series of small allocations. 831 * large block of memory acted on by a series of small allocations.
832 * This behavior is a critical factor in sglist merging's success. 832 * This behavior is a critical factor in sglist merging's success.
833 * 833 *
834 * -- nyc 834 * -- nyc
835 */ 835 */
836 static inline void expand(struct zone *zone, struct page *page, 836 static inline void expand(struct zone *zone, struct page *page,
837 int low, int high, struct free_area *area, 837 int low, int high, struct free_area *area,
838 int migratetype) 838 int migratetype)
839 { 839 {
840 unsigned long size = 1 << high; 840 unsigned long size = 1 << high;
841 841
842 while (high > low) { 842 while (high > low) {
843 area--; 843 area--;
844 high--; 844 high--;
845 size >>= 1; 845 size >>= 1;
846 VM_BUG_ON(bad_range(zone, &page[size])); 846 VM_BUG_ON(bad_range(zone, &page[size]));
847 847
848 #ifdef CONFIG_DEBUG_PAGEALLOC 848 #ifdef CONFIG_DEBUG_PAGEALLOC
849 if (high < debug_guardpage_minorder()) { 849 if (high < debug_guardpage_minorder()) {
850 /* 850 /*
851 * Mark as guard pages (or page), that will allow to 851 * Mark as guard pages (or page), that will allow to
852 * merge back to allocator when buddy will be freed. 852 * merge back to allocator when buddy will be freed.
853 * Corresponding page table entries will not be touched, 853 * Corresponding page table entries will not be touched,
854 * pages will stay not present in virtual address space 854 * pages will stay not present in virtual address space
855 */ 855 */
856 INIT_LIST_HEAD(&page[size].lru); 856 INIT_LIST_HEAD(&page[size].lru);
857 set_page_guard_flag(&page[size]); 857 set_page_guard_flag(&page[size]);
858 set_page_private(&page[size], high); 858 set_page_private(&page[size], high);
859 /* Guard pages are not available for any usage */ 859 /* Guard pages are not available for any usage */
860 __mod_zone_freepage_state(zone, -(1 << high), 860 __mod_zone_freepage_state(zone, -(1 << high),
861 migratetype); 861 migratetype);
862 continue; 862 continue;
863 } 863 }
864 #endif 864 #endif
865 list_add(&page[size].lru, &area->free_list[migratetype]); 865 list_add(&page[size].lru, &area->free_list[migratetype]);
866 area->nr_free++; 866 area->nr_free++;
867 set_page_order(&page[size], high); 867 set_page_order(&page[size], high);
868 } 868 }
869 } 869 }
870 870
871 /* 871 /*
872 * This page is about to be returned from the page allocator 872 * This page is about to be returned from the page allocator
873 */ 873 */
874 static inline int check_new_page(struct page *page) 874 static inline int check_new_page(struct page *page)
875 { 875 {
876 if (unlikely(page_mapcount(page) | 876 if (unlikely(page_mapcount(page) |
877 (page->mapping != NULL) | 877 (page->mapping != NULL) |
878 (atomic_read(&page->_count) != 0) | 878 (atomic_read(&page->_count) != 0) |
879 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 879 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
880 (mem_cgroup_bad_page_check(page)))) { 880 (mem_cgroup_bad_page_check(page)))) {
881 bad_page(page); 881 bad_page(page);
882 return 1; 882 return 1;
883 } 883 }
884 return 0; 884 return 0;
885 } 885 }
886 886
887 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) 887 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
888 { 888 {
889 int i; 889 int i;
890 890
891 for (i = 0; i < (1 << order); i++) { 891 for (i = 0; i < (1 << order); i++) {
892 struct page *p = page + i; 892 struct page *p = page + i;
893 if (unlikely(check_new_page(p))) 893 if (unlikely(check_new_page(p)))
894 return 1; 894 return 1;
895 } 895 }
896 896
897 set_page_private(page, 0); 897 set_page_private(page, 0);
898 set_page_refcounted(page); 898 set_page_refcounted(page);
899 899
900 arch_alloc_page(page, order); 900 arch_alloc_page(page, order);
901 kernel_map_pages(page, 1 << order, 1); 901 kernel_map_pages(page, 1 << order, 1);
902 902
903 if (gfp_flags & __GFP_ZERO) 903 if (gfp_flags & __GFP_ZERO)
904 prep_zero_page(page, order, gfp_flags); 904 prep_zero_page(page, order, gfp_flags);
905 905
906 if (order && (gfp_flags & __GFP_COMP)) 906 if (order && (gfp_flags & __GFP_COMP))
907 prep_compound_page(page, order); 907 prep_compound_page(page, order);
908 908
909 return 0; 909 return 0;
910 } 910 }
911 911
912 /* 912 /*
913 * Go through the free lists for the given migratetype and remove 913 * Go through the free lists for the given migratetype and remove
914 * the smallest available page from the freelists 914 * the smallest available page from the freelists
915 */ 915 */
916 static inline 916 static inline
917 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 917 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
918 int migratetype) 918 int migratetype)
919 { 919 {
920 unsigned int current_order; 920 unsigned int current_order;
921 struct free_area *area; 921 struct free_area *area;
922 struct page *page; 922 struct page *page;
923 923
924 /* Find a page of the appropriate size in the preferred list */ 924 /* Find a page of the appropriate size in the preferred list */
925 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 925 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
926 area = &(zone->free_area[current_order]); 926 area = &(zone->free_area[current_order]);
927 if (list_empty(&area->free_list[migratetype])) 927 if (list_empty(&area->free_list[migratetype]))
928 continue; 928 continue;
929 929
930 page = list_entry(area->free_list[migratetype].next, 930 page = list_entry(area->free_list[migratetype].next,
931 struct page, lru); 931 struct page, lru);
932 list_del(&page->lru); 932 list_del(&page->lru);
933 rmv_page_order(page); 933 rmv_page_order(page);
934 area->nr_free--; 934 area->nr_free--;
935 expand(zone, page, order, current_order, area, migratetype); 935 expand(zone, page, order, current_order, area, migratetype);
936 set_freepage_migratetype(page, migratetype); 936 set_freepage_migratetype(page, migratetype);
937 return page; 937 return page;
938 } 938 }
939 939
940 return NULL; 940 return NULL;
941 } 941 }
942 942
943 943
944 /* 944 /*
945 * This array describes the order in which lists are fallen back to when 945 * This array describes the order in which lists are fallen back to when
946 * the free lists for the desired migrate type are depleted 946 * the free lists for the desired migrate type are depleted
947 */ 947 */
948 static int fallbacks[MIGRATE_TYPES][4] = { 948 static int fallbacks[MIGRATE_TYPES][4] = {
949 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 949 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
950 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 950 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
951 #ifdef CONFIG_CMA 951 #ifdef CONFIG_CMA
952 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 952 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
953 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 953 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
954 #else 954 #else
955 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 955 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
956 #endif 956 #endif
957 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 957 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
958 #ifdef CONFIG_MEMORY_ISOLATION 958 #ifdef CONFIG_MEMORY_ISOLATION
959 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 959 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
960 #endif 960 #endif
961 }; 961 };
962 962
963 /* 963 /*
964 * Move the free pages in a range to the free lists of the requested type. 964 * Move the free pages in a range to the free lists of the requested type.
965 * Note that start_page and end_page are not aligned on a pageblock 965 * Note that start_page and end_page are not aligned on a pageblock
966 * boundary. If alignment is required, use move_freepages_block() 966 * boundary. If alignment is required, use move_freepages_block()
967 */ 967 */
968 int move_freepages(struct zone *zone, 968 int move_freepages(struct zone *zone,
969 struct page *start_page, struct page *end_page, 969 struct page *start_page, struct page *end_page,
970 int migratetype) 970 int migratetype)
971 { 971 {
972 struct page *page; 972 struct page *page;
973 unsigned long order; 973 unsigned long order;
974 int pages_moved = 0; 974 int pages_moved = 0;
975 975
976 #ifndef CONFIG_HOLES_IN_ZONE 976 #ifndef CONFIG_HOLES_IN_ZONE
977 /* 977 /*
978 * page_zone is not safe to call in this context when 978 * page_zone is not safe to call in this context when
979 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 979 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
980 * anyway as we check zone boundaries in move_freepages_block(). 980 * anyway as we check zone boundaries in move_freepages_block().
981 * Remove at a later date when no bug reports exist related to 981 * Remove at a later date when no bug reports exist related to
982 * grouping pages by mobility 982 * grouping pages by mobility
983 */ 983 */
984 BUG_ON(page_zone(start_page) != page_zone(end_page)); 984 BUG_ON(page_zone(start_page) != page_zone(end_page));
985 #endif 985 #endif
986 986
987 for (page = start_page; page <= end_page;) { 987 for (page = start_page; page <= end_page;) {
988 /* Make sure we are not inadvertently changing nodes */ 988 /* Make sure we are not inadvertently changing nodes */
989 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 989 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
990 990
991 if (!pfn_valid_within(page_to_pfn(page))) { 991 if (!pfn_valid_within(page_to_pfn(page))) {
992 page++; 992 page++;
993 continue; 993 continue;
994 } 994 }
995 995
996 if (!PageBuddy(page)) { 996 if (!PageBuddy(page)) {
997 page++; 997 page++;
998 continue; 998 continue;
999 } 999 }
1000 1000
1001 order = page_order(page); 1001 order = page_order(page);
1002 list_move(&page->lru, 1002 list_move(&page->lru,
1003 &zone->free_area[order].free_list[migratetype]); 1003 &zone->free_area[order].free_list[migratetype]);
1004 set_freepage_migratetype(page, migratetype); 1004 set_freepage_migratetype(page, migratetype);
1005 page += 1 << order; 1005 page += 1 << order;
1006 pages_moved += 1 << order; 1006 pages_moved += 1 << order;
1007 } 1007 }
1008 1008
1009 return pages_moved; 1009 return pages_moved;
1010 } 1010 }
1011 1011
1012 int move_freepages_block(struct zone *zone, struct page *page, 1012 int move_freepages_block(struct zone *zone, struct page *page,
1013 int migratetype) 1013 int migratetype)
1014 { 1014 {
1015 unsigned long start_pfn, end_pfn; 1015 unsigned long start_pfn, end_pfn;
1016 struct page *start_page, *end_page; 1016 struct page *start_page, *end_page;
1017 1017
1018 start_pfn = page_to_pfn(page); 1018 start_pfn = page_to_pfn(page);
1019 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1019 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1020 start_page = pfn_to_page(start_pfn); 1020 start_page = pfn_to_page(start_pfn);
1021 end_page = start_page + pageblock_nr_pages - 1; 1021 end_page = start_page + pageblock_nr_pages - 1;
1022 end_pfn = start_pfn + pageblock_nr_pages - 1; 1022 end_pfn = start_pfn + pageblock_nr_pages - 1;
1023 1023
1024 /* Do not cross zone boundaries */ 1024 /* Do not cross zone boundaries */
1025 if (!zone_spans_pfn(zone, start_pfn)) 1025 if (!zone_spans_pfn(zone, start_pfn))
1026 start_page = page; 1026 start_page = page;
1027 if (!zone_spans_pfn(zone, end_pfn)) 1027 if (!zone_spans_pfn(zone, end_pfn))
1028 return 0; 1028 return 0;
1029 1029
1030 return move_freepages(zone, start_page, end_page, migratetype); 1030 return move_freepages(zone, start_page, end_page, migratetype);
1031 } 1031 }
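
As a quick illustration of the pageblock-alignment arithmetic used by move_freepages_block() above, the following standalone sketch (not part of the kernel source; the 512-pages-per-pageblock value is only an assumed example) rounds a pfn down to the start of its pageblock and computes the block's last pfn:

#include <stdio.h>

/* Assumed example value: 512 pages per pageblock (2MiB with 4KiB pages). */
#define PAGEBLOCK_NR_PAGES 512UL

int main(void)
{
	unsigned long pfn = 262919; /* an arbitrary pfn somewhere inside a pageblock */

	/* Same arithmetic as move_freepages_block(): mask off the low bits to
	 * get the first pfn of the pageblock, then add the block size minus
	 * one to get its last pfn. Requires the block size to be a power of two. */
	unsigned long start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
	unsigned long end_pfn = start_pfn + PAGEBLOCK_NR_PAGES - 1;

	printf("pfn %lu -> pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
	/* prints: pfn 262919 -> pageblock [262656, 263167] */
	return 0;
}
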
1032 1032
1033 static void change_pageblock_range(struct page *pageblock_page, 1033 static void change_pageblock_range(struct page *pageblock_page,
1034 int start_order, int migratetype) 1034 int start_order, int migratetype)
1035 { 1035 {
1036 int nr_pageblocks = 1 << (start_order - pageblock_order); 1036 int nr_pageblocks = 1 << (start_order - pageblock_order);
1037 1037
1038 while (nr_pageblocks--) { 1038 while (nr_pageblocks--) {
1039 set_pageblock_migratetype(pageblock_page, migratetype); 1039 set_pageblock_migratetype(pageblock_page, migratetype);
1040 pageblock_page += pageblock_nr_pages; 1040 pageblock_page += pageblock_nr_pages;
1041 } 1041 }
1042 } 1042 }
1043 1043
1044 /* 1044 /*
1045 * If breaking a large block of pages, move all free pages to the preferred 1045 * If breaking a large block of pages, move all free pages to the preferred
1046 * allocation list. If falling back for a reclaimable kernel allocation, be 1046 * allocation list. If falling back for a reclaimable kernel allocation, be
1047 * more aggressive about taking ownership of free pages. 1047 * more aggressive about taking ownership of free pages.
1048 * 1048 *
1049 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1049 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1050 * nor move CMA pages to different free lists. We don't want unmovable pages 1050 * nor move CMA pages to different free lists. We don't want unmovable pages
1051 * to be allocated from MIGRATE_CMA areas. 1051 * to be allocated from MIGRATE_CMA areas.
1052 * 1052 *
1053 * Returns the new migratetype of the pageblock (or the same old migratetype 1053 * Returns the new migratetype of the pageblock (or the same old migratetype
1054 * if it was unchanged). 1054 * if it was unchanged).
1055 */ 1055 */
1056 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1056 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1057 int start_type, int fallback_type) 1057 int start_type, int fallback_type)
1058 { 1058 {
1059 int current_order = page_order(page); 1059 int current_order = page_order(page);
1060 1060
1061 /* 1061 /*
1062 * When borrowing from MIGRATE_CMA, we need to release the excess 1062 * When borrowing from MIGRATE_CMA, we need to release the excess
1063 * buddy pages to CMA itself. We also ensure the freepage_migratetype 1063 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1064 * is set to CMA so it is returned to the correct freelist in case 1064 * is set to CMA so it is returned to the correct freelist in case
1065 * the page ends up not actually being allocated from the pcp lists. 1065 * the page ends up not actually being allocated from the pcp lists.
1066 */ 1066 */
1067 if (is_migrate_cma(fallback_type)) 1067 if (is_migrate_cma(fallback_type))
1068 return fallback_type; 1068 return fallback_type;
1069 1069
1070 /* Take ownership for orders >= pageblock_order */ 1070 /* Take ownership for orders >= pageblock_order */
1071 if (current_order >= pageblock_order) { 1071 if (current_order >= pageblock_order) {
1072 change_pageblock_range(page, current_order, start_type); 1072 change_pageblock_range(page, current_order, start_type);
1073 return start_type; 1073 return start_type;
1074 } 1074 }
1075 1075
1076 if (current_order >= pageblock_order / 2 || 1076 if (current_order >= pageblock_order / 2 ||
1077 start_type == MIGRATE_RECLAIMABLE || 1077 start_type == MIGRATE_RECLAIMABLE ||
1078 page_group_by_mobility_disabled) { 1078 page_group_by_mobility_disabled) {
1079 int pages; 1079 int pages;
1080 1080
1081 pages = move_freepages_block(zone, page, start_type); 1081 pages = move_freepages_block(zone, page, start_type);
1082 1082
1083 /* Claim the whole block if over half of it is free */ 1083 /* Claim the whole block if over half of it is free */
1084 if (pages >= (1 << (pageblock_order-1)) || 1084 if (pages >= (1 << (pageblock_order-1)) ||
1085 page_group_by_mobility_disabled) { 1085 page_group_by_mobility_disabled) {
1086 1086
1087 set_pageblock_migratetype(page, start_type); 1087 set_pageblock_migratetype(page, start_type);
1088 return start_type; 1088 return start_type;
1089 } 1089 }
1090 1090
1091 } 1091 }
1092 1092
1093 return fallback_type; 1093 return fallback_type;
1094 } 1094 }
1095 1095
1096 /* Remove an element from the buddy allocator from the fallback list */ 1096 /* Remove an element from the buddy allocator from the fallback list */
1097 static inline struct page * 1097 static inline struct page *
1098 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) 1098 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1099 { 1099 {
1100 struct free_area *area; 1100 struct free_area *area;
1101 unsigned int current_order; 1101 unsigned int current_order;
1102 struct page *page; 1102 struct page *page;
1103 int migratetype, new_type, i; 1103 int migratetype, new_type, i;
1104 1104
1105 /* Find the largest possible block of pages in the other list */ 1105 /* Find the largest possible block of pages in the other list */
1106 for (current_order = MAX_ORDER-1; 1106 for (current_order = MAX_ORDER-1;
1107 current_order >= order && current_order <= MAX_ORDER-1; 1107 current_order >= order && current_order <= MAX_ORDER-1;
1108 --current_order) { 1108 --current_order) {
1109 for (i = 0;; i++) { 1109 for (i = 0;; i++) {
1110 migratetype = fallbacks[start_migratetype][i]; 1110 migratetype = fallbacks[start_migratetype][i];
1111 1111
1112 /* MIGRATE_RESERVE handled later if necessary */ 1112 /* MIGRATE_RESERVE handled later if necessary */
1113 if (migratetype == MIGRATE_RESERVE) 1113 if (migratetype == MIGRATE_RESERVE)
1114 break; 1114 break;
1115 1115
1116 area = &(zone->free_area[current_order]); 1116 area = &(zone->free_area[current_order]);
1117 if (list_empty(&area->free_list[migratetype])) 1117 if (list_empty(&area->free_list[migratetype]))
1118 continue; 1118 continue;
1119 1119
1120 page = list_entry(area->free_list[migratetype].next, 1120 page = list_entry(area->free_list[migratetype].next,
1121 struct page, lru); 1121 struct page, lru);
1122 area->nr_free--; 1122 area->nr_free--;
1123 1123
1124 new_type = try_to_steal_freepages(zone, page, 1124 new_type = try_to_steal_freepages(zone, page,
1125 start_migratetype, 1125 start_migratetype,
1126 migratetype); 1126 migratetype);
1127 1127
1128 /* Remove the page from the freelists */ 1128 /* Remove the page from the freelists */
1129 list_del(&page->lru); 1129 list_del(&page->lru);
1130 rmv_page_order(page); 1130 rmv_page_order(page);
1131 1131
1132 expand(zone, page, order, current_order, area, 1132 expand(zone, page, order, current_order, area,
1133 new_type); 1133 new_type);
1134 /* The freepage_migratetype may differ from pageblock's 1134 /* The freepage_migratetype may differ from pageblock's
1135 * migratetype depending on the decisions in 1135 * migratetype depending on the decisions in
1136 * try_to_steal_freepages. This is OK as long as it does 1136 * try_to_steal_freepages. This is OK as long as it does
1137 * not differ for MIGRATE_CMA type. 1137 * not differ for MIGRATE_CMA type.
1138 */ 1138 */
1139 set_freepage_migratetype(page, new_type); 1139 set_freepage_migratetype(page, new_type);
1140 1140
1141 trace_mm_page_alloc_extfrag(page, order, current_order, 1141 trace_mm_page_alloc_extfrag(page, order, current_order,
1142 start_migratetype, migratetype, new_type); 1142 start_migratetype, migratetype, new_type);
1143 1143
1144 return page; 1144 return page;
1145 } 1145 }
1146 } 1146 }
1147 1147
1148 return NULL; 1148 return NULL;
1149 } 1149 }
1150 1150
1151 /* 1151 /*
1152 * Do the hard work of removing an element from the buddy allocator. 1152 * Do the hard work of removing an element from the buddy allocator.
1153 * Call me with the zone->lock already held. 1153 * Call me with the zone->lock already held.
1154 */ 1154 */
1155 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1155 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1156 int migratetype) 1156 int migratetype)
1157 { 1157 {
1158 struct page *page; 1158 struct page *page;
1159 1159
1160 retry_reserve: 1160 retry_reserve:
1161 page = __rmqueue_smallest(zone, order, migratetype); 1161 page = __rmqueue_smallest(zone, order, migratetype);
1162 1162
1163 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1163 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1164 page = __rmqueue_fallback(zone, order, migratetype); 1164 page = __rmqueue_fallback(zone, order, migratetype);
1165 1165
1166 /* 1166 /*
1167 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1167 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1168 * is used because __rmqueue_smallest is an inline function 1168 * is used because __rmqueue_smallest is an inline function
1169 * and we want just one call site 1169 * and we want just one call site
1170 */ 1170 */
1171 if (!page) { 1171 if (!page) {
1172 migratetype = MIGRATE_RESERVE; 1172 migratetype = MIGRATE_RESERVE;
1173 goto retry_reserve; 1173 goto retry_reserve;
1174 } 1174 }
1175 } 1175 }
1176 1176
1177 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1177 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1178 return page; 1178 return page;
1179 } 1179 }
1180 1180
1181 /* 1181 /*
1182 * Obtain a specified number of elements from the buddy allocator, all under 1182 * Obtain a specified number of elements from the buddy allocator, all under
1183 * a single hold of the lock, for efficiency. Add them to the supplied list. 1183 * a single hold of the lock, for efficiency. Add them to the supplied list.
1184 * Returns the number of new pages which were placed at *list. 1184 * Returns the number of new pages which were placed at *list.
1185 */ 1185 */
1186 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1186 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1187 unsigned long count, struct list_head *list, 1187 unsigned long count, struct list_head *list,
1188 int migratetype, bool cold) 1188 int migratetype, bool cold)
1189 { 1189 {
1190 int i; 1190 int i;
1191 1191
1192 spin_lock(&zone->lock); 1192 spin_lock(&zone->lock);
1193 for (i = 0; i < count; ++i) { 1193 for (i = 0; i < count; ++i) {
1194 struct page *page = __rmqueue(zone, order, migratetype); 1194 struct page *page = __rmqueue(zone, order, migratetype);
1195 if (unlikely(page == NULL)) 1195 if (unlikely(page == NULL))
1196 break; 1196 break;
1197 1197
1198 /* 1198 /*
1199 * Split buddy pages returned by expand() are received here 1199 * Split buddy pages returned by expand() are received here
1200 * in physical page order. The page is added to the caller's 1200 * in physical page order. The page is added to the caller's
1201 * list and the list head then moves forward. From the caller's 1201 * list and the list head then moves forward. From the caller's
1202 * perspective, the linked list is ordered by page number in 1202 * perspective, the linked list is ordered by page number in
1203 * some conditions. This is useful for IO devices that can 1203 * some conditions. This is useful for IO devices that can
1204 * merge IO requests if the physical pages are ordered 1204 * merge IO requests if the physical pages are ordered
1205 * properly. 1205 * properly.
1206 */ 1206 */
1207 if (likely(!cold)) 1207 if (likely(!cold))
1208 list_add(&page->lru, list); 1208 list_add(&page->lru, list);
1209 else 1209 else
1210 list_add_tail(&page->lru, list); 1210 list_add_tail(&page->lru, list);
1211 list = &page->lru; 1211 list = &page->lru;
1212 if (is_migrate_cma(get_freepage_migratetype(page))) 1212 if (is_migrate_cma(get_freepage_migratetype(page)))
1213 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1213 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1214 -(1 << order)); 1214 -(1 << order));
1215 } 1215 }
1216 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1216 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1217 spin_unlock(&zone->lock); 1217 spin_unlock(&zone->lock);
1218 return i; 1218 return i;
1219 } 1219 }
1220 1220
1221 #ifdef CONFIG_NUMA 1221 #ifdef CONFIG_NUMA
1222 /* 1222 /*
1223 * Called from the vmstat counter updater to drain pagesets of this 1223 * Called from the vmstat counter updater to drain pagesets of this
1224 * currently executing processor on remote nodes after they have 1224 * currently executing processor on remote nodes after they have
1225 * expired. 1225 * expired.
1226 * 1226 *
1227 * Note that this function must be called with the thread pinned to 1227 * Note that this function must be called with the thread pinned to
1228 * a single processor. 1228 * a single processor.
1229 */ 1229 */
1230 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1230 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1231 { 1231 {
1232 unsigned long flags; 1232 unsigned long flags;
1233 int to_drain; 1233 int to_drain;
1234 unsigned long batch; 1234 unsigned long batch;
1235 1235
1236 local_irq_save(flags); 1236 local_irq_save(flags);
1237 batch = ACCESS_ONCE(pcp->batch); 1237 batch = ACCESS_ONCE(pcp->batch);
1238 if (pcp->count >= batch) 1238 if (pcp->count >= batch)
1239 to_drain = batch; 1239 to_drain = batch;
1240 else 1240 else
1241 to_drain = pcp->count; 1241 to_drain = pcp->count;
1242 if (to_drain > 0) { 1242 if (to_drain > 0) {
1243 free_pcppages_bulk(zone, to_drain, pcp); 1243 free_pcppages_bulk(zone, to_drain, pcp);
1244 pcp->count -= to_drain; 1244 pcp->count -= to_drain;
1245 } 1245 }
1246 local_irq_restore(flags); 1246 local_irq_restore(flags);
1247 } 1247 }
1248 #endif 1248 #endif
1249 1249
1250 /* 1250 /*
1251 * Drain pages of the indicated processor. 1251 * Drain pages of the indicated processor.
1252 * 1252 *
1253 * The processor must either be the current processor and the 1253 * The processor must either be the current processor and the
1254 * thread pinned to the current processor or a processor that 1254 * thread pinned to the current processor or a processor that
1255 * is not online. 1255 * is not online.
1256 */ 1256 */
1257 static void drain_pages(unsigned int cpu) 1257 static void drain_pages(unsigned int cpu)
1258 { 1258 {
1259 unsigned long flags; 1259 unsigned long flags;
1260 struct zone *zone; 1260 struct zone *zone;
1261 1261
1262 for_each_populated_zone(zone) { 1262 for_each_populated_zone(zone) {
1263 struct per_cpu_pageset *pset; 1263 struct per_cpu_pageset *pset;
1264 struct per_cpu_pages *pcp; 1264 struct per_cpu_pages *pcp;
1265 1265
1266 local_irq_save(flags); 1266 local_irq_save(flags);
1267 pset = per_cpu_ptr(zone->pageset, cpu); 1267 pset = per_cpu_ptr(zone->pageset, cpu);
1268 1268
1269 pcp = &pset->pcp; 1269 pcp = &pset->pcp;
1270 if (pcp->count) { 1270 if (pcp->count) {
1271 free_pcppages_bulk(zone, pcp->count, pcp); 1271 free_pcppages_bulk(zone, pcp->count, pcp);
1272 pcp->count = 0; 1272 pcp->count = 0;
1273 } 1273 }
1274 local_irq_restore(flags); 1274 local_irq_restore(flags);
1275 } 1275 }
1276 } 1276 }
1277 1277
1278 /* 1278 /*
1279 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1279 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1280 */ 1280 */
1281 void drain_local_pages(void *arg) 1281 void drain_local_pages(void *arg)
1282 { 1282 {
1283 drain_pages(smp_processor_id()); 1283 drain_pages(smp_processor_id());
1284 } 1284 }
1285 1285
1286 /* 1286 /*
1287 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1287 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1288 * 1288 *
1289 * Note that this code is protected against sending an IPI to an offline 1289 * Note that this code is protected against sending an IPI to an offline
1290 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1290 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1291 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1291 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1292 * nothing keeps CPUs from showing up after we populated the cpumask and 1292 * nothing keeps CPUs from showing up after we populated the cpumask and
1293 * before the call to on_each_cpu_mask(). 1293 * before the call to on_each_cpu_mask().
1294 */ 1294 */
1295 void drain_all_pages(void) 1295 void drain_all_pages(void)
1296 { 1296 {
1297 int cpu; 1297 int cpu;
1298 struct per_cpu_pageset *pcp; 1298 struct per_cpu_pageset *pcp;
1299 struct zone *zone; 1299 struct zone *zone;
1300 1300
1301 /* 1301 /*
1302 * Allocate in the BSS so we won't require allocation in 1302 * Allocate in the BSS so we won't require allocation in
1303 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1303 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1304 */ 1304 */
1305 static cpumask_t cpus_with_pcps; 1305 static cpumask_t cpus_with_pcps;
1306 1306
1307 /* 1307 /*
1308 * We don't care about racing with CPU hotplug events 1308 * We don't care about racing with CPU hotplug events
1309 * as offline notification will cause the notified 1309 * as offline notification will cause the notified
1310 * cpu to drain that CPU's pcps and on_each_cpu_mask 1310 * cpu to drain that CPU's pcps and on_each_cpu_mask
1311 * disables preemption as part of its processing 1311 * disables preemption as part of its processing
1312 */ 1312 */
1313 for_each_online_cpu(cpu) { 1313 for_each_online_cpu(cpu) {
1314 bool has_pcps = false; 1314 bool has_pcps = false;
1315 for_each_populated_zone(zone) { 1315 for_each_populated_zone(zone) {
1316 pcp = per_cpu_ptr(zone->pageset, cpu); 1316 pcp = per_cpu_ptr(zone->pageset, cpu);
1317 if (pcp->pcp.count) { 1317 if (pcp->pcp.count) {
1318 has_pcps = true; 1318 has_pcps = true;
1319 break; 1319 break;
1320 } 1320 }
1321 } 1321 }
1322 if (has_pcps) 1322 if (has_pcps)
1323 cpumask_set_cpu(cpu, &cpus_with_pcps); 1323 cpumask_set_cpu(cpu, &cpus_with_pcps);
1324 else 1324 else
1325 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1325 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1326 } 1326 }
1327 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1327 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1328 } 1328 }
1329 1329
1330 #ifdef CONFIG_HIBERNATION 1330 #ifdef CONFIG_HIBERNATION
1331 1331
1332 void mark_free_pages(struct zone *zone) 1332 void mark_free_pages(struct zone *zone)
1333 { 1333 {
1334 unsigned long pfn, max_zone_pfn; 1334 unsigned long pfn, max_zone_pfn;
1335 unsigned long flags; 1335 unsigned long flags;
1336 unsigned int order, t; 1336 unsigned int order, t;
1337 struct list_head *curr; 1337 struct list_head *curr;
1338 1338
1339 if (zone_is_empty(zone)) 1339 if (zone_is_empty(zone))
1340 return; 1340 return;
1341 1341
1342 spin_lock_irqsave(&zone->lock, flags); 1342 spin_lock_irqsave(&zone->lock, flags);
1343 1343
1344 max_zone_pfn = zone_end_pfn(zone); 1344 max_zone_pfn = zone_end_pfn(zone);
1345 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1345 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1346 if (pfn_valid(pfn)) { 1346 if (pfn_valid(pfn)) {
1347 struct page *page = pfn_to_page(pfn); 1347 struct page *page = pfn_to_page(pfn);
1348 1348
1349 if (!swsusp_page_is_forbidden(page)) 1349 if (!swsusp_page_is_forbidden(page))
1350 swsusp_unset_page_free(page); 1350 swsusp_unset_page_free(page);
1351 } 1351 }
1352 1352
1353 for_each_migratetype_order(order, t) { 1353 for_each_migratetype_order(order, t) {
1354 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1354 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1355 unsigned long i; 1355 unsigned long i;
1356 1356
1357 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1357 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1358 for (i = 0; i < (1UL << order); i++) 1358 for (i = 0; i < (1UL << order); i++)
1359 swsusp_set_page_free(pfn_to_page(pfn + i)); 1359 swsusp_set_page_free(pfn_to_page(pfn + i));
1360 } 1360 }
1361 } 1361 }
1362 spin_unlock_irqrestore(&zone->lock, flags); 1362 spin_unlock_irqrestore(&zone->lock, flags);
1363 } 1363 }
1364 #endif /* CONFIG_HIBERNATION */ 1364 #endif /* CONFIG_HIBERNATION */
1365 1365
1366 /* 1366 /*
1367 * Free a 0-order page 1367 * Free a 0-order page
1368 * cold == true ? free a cold page : free a hot page 1368 * cold == true ? free a cold page : free a hot page
1369 */ 1369 */
1370 void free_hot_cold_page(struct page *page, bool cold) 1370 void free_hot_cold_page(struct page *page, bool cold)
1371 { 1371 {
1372 struct zone *zone = page_zone(page); 1372 struct zone *zone = page_zone(page);
1373 struct per_cpu_pages *pcp; 1373 struct per_cpu_pages *pcp;
1374 unsigned long flags; 1374 unsigned long flags;
1375 unsigned long pfn = page_to_pfn(page); 1375 unsigned long pfn = page_to_pfn(page);
1376 int migratetype; 1376 int migratetype;
1377 1377
1378 if (!free_pages_prepare(page, 0)) 1378 if (!free_pages_prepare(page, 0))
1379 return; 1379 return;
1380 1380
1381 migratetype = get_pfnblock_migratetype(page, pfn); 1381 migratetype = get_pfnblock_migratetype(page, pfn);
1382 set_freepage_migratetype(page, migratetype); 1382 set_freepage_migratetype(page, migratetype);
1383 local_irq_save(flags); 1383 local_irq_save(flags);
1384 __count_vm_event(PGFREE); 1384 __count_vm_event(PGFREE);
1385 1385
1386 /* 1386 /*
1387 * We only track unmovable, reclaimable and movable on pcp lists. 1387 * We only track unmovable, reclaimable and movable on pcp lists.
1388 * Free ISOLATE pages back to the allocator because they are being 1388 * Free ISOLATE pages back to the allocator because they are being
1389 * offlined but treat RESERVE as movable pages so we can get those 1389 * offlined but treat RESERVE as movable pages so we can get those
1390 * areas back if necessary. Otherwise, we may have to free 1390 * areas back if necessary. Otherwise, we may have to free
1391 * excessively into the page allocator 1391 * excessively into the page allocator
1392 */ 1392 */
1393 if (migratetype >= MIGRATE_PCPTYPES) { 1393 if (migratetype >= MIGRATE_PCPTYPES) {
1394 if (unlikely(is_migrate_isolate(migratetype))) { 1394 if (unlikely(is_migrate_isolate(migratetype))) {
1395 free_one_page(zone, page, pfn, 0, migratetype); 1395 free_one_page(zone, page, pfn, 0, migratetype);
1396 goto out; 1396 goto out;
1397 } 1397 }
1398 migratetype = MIGRATE_MOVABLE; 1398 migratetype = MIGRATE_MOVABLE;
1399 } 1399 }
1400 1400
1401 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1401 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1402 if (!cold) 1402 if (!cold)
1403 list_add(&page->lru, &pcp->lists[migratetype]); 1403 list_add(&page->lru, &pcp->lists[migratetype]);
1404 else 1404 else
1405 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1405 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1406 pcp->count++; 1406 pcp->count++;
1407 if (pcp->count >= pcp->high) { 1407 if (pcp->count >= pcp->high) {
1408 unsigned long batch = ACCESS_ONCE(pcp->batch); 1408 unsigned long batch = ACCESS_ONCE(pcp->batch);
1409 free_pcppages_bulk(zone, batch, pcp); 1409 free_pcppages_bulk(zone, batch, pcp);
1410 pcp->count -= batch; 1410 pcp->count -= batch;
1411 } 1411 }
1412 1412
1413 out: 1413 out:
1414 local_irq_restore(flags); 1414 local_irq_restore(flags);
1415 } 1415 }
1416 1416
1417 /* 1417 /*
1418 * Free a list of 0-order pages 1418 * Free a list of 0-order pages
1419 */ 1419 */
1420 void free_hot_cold_page_list(struct list_head *list, bool cold) 1420 void free_hot_cold_page_list(struct list_head *list, bool cold)
1421 { 1421 {
1422 struct page *page, *next; 1422 struct page *page, *next;
1423 1423
1424 list_for_each_entry_safe(page, next, list, lru) { 1424 list_for_each_entry_safe(page, next, list, lru) {
1425 trace_mm_page_free_batched(page, cold); 1425 trace_mm_page_free_batched(page, cold);
1426 free_hot_cold_page(page, cold); 1426 free_hot_cold_page(page, cold);
1427 } 1427 }
1428 } 1428 }
1429 1429
1430 /* 1430 /*
1431 * split_page takes a non-compound higher-order page, and splits it into 1431 * split_page takes a non-compound higher-order page, and splits it into
1432 * n (1<<order) sub-pages: page[0..n] 1432 * n (1<<order) sub-pages: page[0..n]
1433 * Each sub-page must be freed individually. 1433 * Each sub-page must be freed individually.
1434 * 1434 *
1435 * Note: this is probably too low level an operation for use in drivers. 1435 * Note: this is probably too low level an operation for use in drivers.
1436 * Please consult with lkml before using this in your driver. 1436 * Please consult with lkml before using this in your driver.
1437 */ 1437 */
1438 void split_page(struct page *page, unsigned int order) 1438 void split_page(struct page *page, unsigned int order)
1439 { 1439 {
1440 int i; 1440 int i;
1441 1441
1442 VM_BUG_ON(PageCompound(page)); 1442 VM_BUG_ON(PageCompound(page));
1443 VM_BUG_ON(!page_count(page)); 1443 VM_BUG_ON(!page_count(page));
1444 1444
1445 #ifdef CONFIG_KMEMCHECK 1445 #ifdef CONFIG_KMEMCHECK
1446 /* 1446 /*
1447 * Split shadow pages too, because free(page[0]) would 1447 * Split shadow pages too, because free(page[0]) would
1448 * otherwise free the whole shadow. 1448 * otherwise free the whole shadow.
1449 */ 1449 */
1450 if (kmemcheck_page_is_tracked(page)) 1450 if (kmemcheck_page_is_tracked(page))
1451 split_page(virt_to_page(page[0].shadow), order); 1451 split_page(virt_to_page(page[0].shadow), order);
1452 #endif 1452 #endif
1453 1453
1454 for (i = 1; i < (1 << order); i++) 1454 for (i = 1; i < (1 << order); i++)
1455 set_page_refcounted(page + i); 1455 set_page_refcounted(page + i);
1456 } 1456 }
1457 EXPORT_SYMBOL_GPL(split_page); 1457 EXPORT_SYMBOL_GPL(split_page);
1458 1458
1459 static int __isolate_free_page(struct page *page, unsigned int order) 1459 static int __isolate_free_page(struct page *page, unsigned int order)
1460 { 1460 {
1461 unsigned long watermark; 1461 unsigned long watermark;
1462 struct zone *zone; 1462 struct zone *zone;
1463 int mt; 1463 int mt;
1464 1464
1465 BUG_ON(!PageBuddy(page)); 1465 BUG_ON(!PageBuddy(page));
1466 1466
1467 zone = page_zone(page); 1467 zone = page_zone(page);
1468 mt = get_pageblock_migratetype(page); 1468 mt = get_pageblock_migratetype(page);
1469 1469
1470 if (!is_migrate_isolate(mt)) { 1470 if (!is_migrate_isolate(mt)) {
1471 /* Obey watermarks as if the page was being allocated */ 1471 /* Obey watermarks as if the page was being allocated */
1472 watermark = low_wmark_pages(zone) + (1 << order); 1472 watermark = low_wmark_pages(zone) + (1 << order);
1473 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1473 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1474 return 0; 1474 return 0;
1475 1475
1476 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1476 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1477 } 1477 }
1478 1478
1479 /* Remove page from free list */ 1479 /* Remove page from free list */
1480 list_del(&page->lru); 1480 list_del(&page->lru);
1481 zone->free_area[order].nr_free--; 1481 zone->free_area[order].nr_free--;
1482 rmv_page_order(page); 1482 rmv_page_order(page);
1483 1483
1484 /* Set the pageblock if the isolated page is at least a pageblock */ 1484 /* Set the pageblock if the isolated page is at least a pageblock */
1485 if (order >= pageblock_order - 1) { 1485 if (order >= pageblock_order - 1) {
1486 struct page *endpage = page + (1 << order) - 1; 1486 struct page *endpage = page + (1 << order) - 1;
1487 for (; page < endpage; page += pageblock_nr_pages) { 1487 for (; page < endpage; page += pageblock_nr_pages) {
1488 int mt = get_pageblock_migratetype(page); 1488 int mt = get_pageblock_migratetype(page);
1489 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1489 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1490 set_pageblock_migratetype(page, 1490 set_pageblock_migratetype(page,
1491 MIGRATE_MOVABLE); 1491 MIGRATE_MOVABLE);
1492 } 1492 }
1493 } 1493 }
1494 1494
1495 return 1UL << order; 1495 return 1UL << order;
1496 } 1496 }
1497 1497
1498 /* 1498 /*
1499 * Similar to split_page except the page is already free. As this is only 1499 * Similar to split_page except the page is already free. As this is only
1500 * being used for migration, the migratetype of the block also changes. 1500 * being used for migration, the migratetype of the block also changes.
1501 * As this is called with interrupts disabled, the caller is responsible 1501 * As this is called with interrupts disabled, the caller is responsible
1502 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1502 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1503 * are enabled. 1503 * are enabled.
1504 * 1504 *
1505 * Note: this is probably too low level an operation for use in drivers. 1505 * Note: this is probably too low level an operation for use in drivers.
1506 * Please consult with lkml before using this in your driver. 1506 * Please consult with lkml before using this in your driver.
1507 */ 1507 */
1508 int split_free_page(struct page *page) 1508 int split_free_page(struct page *page)
1509 { 1509 {
1510 unsigned int order; 1510 unsigned int order;
1511 int nr_pages; 1511 int nr_pages;
1512 1512
1513 order = page_order(page); 1513 order = page_order(page);
1514 1514
1515 nr_pages = __isolate_free_page(page, order); 1515 nr_pages = __isolate_free_page(page, order);
1516 if (!nr_pages) 1516 if (!nr_pages)
1517 return 0; 1517 return 0;
1518 1518
1519 /* Split into individual pages */ 1519 /* Split into individual pages */
1520 set_page_refcounted(page); 1520 set_page_refcounted(page);
1521 split_page(page, order); 1521 split_page(page, order);
1522 return nr_pages; 1522 return nr_pages;
1523 } 1523 }
1524 1524
1525 /* 1525 /*
1526 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1526 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1527 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1527 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1528 * or two. 1528 * or two.
1529 */ 1529 */
1530 static inline 1530 static inline
1531 struct page *buffered_rmqueue(struct zone *preferred_zone, 1531 struct page *buffered_rmqueue(struct zone *preferred_zone,
1532 struct zone *zone, unsigned int order, 1532 struct zone *zone, unsigned int order,
1533 gfp_t gfp_flags, int migratetype) 1533 gfp_t gfp_flags, int migratetype)
1534 { 1534 {
1535 unsigned long flags; 1535 unsigned long flags;
1536 struct page *page; 1536 struct page *page;
1537 bool cold = ((gfp_flags & __GFP_COLD) != 0); 1537 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1538 1538
1539 again: 1539 again:
1540 if (likely(order == 0)) { 1540 if (likely(order == 0)) {
1541 struct per_cpu_pages *pcp; 1541 struct per_cpu_pages *pcp;
1542 struct list_head *list; 1542 struct list_head *list;
1543 1543
1544 local_irq_save(flags); 1544 local_irq_save(flags);
1545 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1545 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1546 list = &pcp->lists[migratetype]; 1546 list = &pcp->lists[migratetype];
1547 if (list_empty(list)) { 1547 if (list_empty(list)) {
1548 pcp->count += rmqueue_bulk(zone, 0, 1548 pcp->count += rmqueue_bulk(zone, 0,
1549 pcp->batch, list, 1549 pcp->batch, list,
1550 migratetype, cold); 1550 migratetype, cold);
1551 if (unlikely(list_empty(list))) 1551 if (unlikely(list_empty(list)))
1552 goto failed; 1552 goto failed;
1553 } 1553 }
1554 1554
1555 if (cold) 1555 if (cold)
1556 page = list_entry(list->prev, struct page, lru); 1556 page = list_entry(list->prev, struct page, lru);
1557 else 1557 else
1558 page = list_entry(list->next, struct page, lru); 1558 page = list_entry(list->next, struct page, lru);
1559 1559
1560 list_del(&page->lru); 1560 list_del(&page->lru);
1561 pcp->count--; 1561 pcp->count--;
1562 } else { 1562 } else {
1563 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1563 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1564 /* 1564 /*
1565 * __GFP_NOFAIL is not to be used in new code. 1565 * __GFP_NOFAIL is not to be used in new code.
1566 * 1566 *
1567 * All __GFP_NOFAIL callers should be fixed so that they 1567 * All __GFP_NOFAIL callers should be fixed so that they
1568 * properly detect and handle allocation failures. 1568 * properly detect and handle allocation failures.
1569 * 1569 *
1570 * We most definitely don't want callers attempting to 1570 * We most definitely don't want callers attempting to
1571 * allocate greater than order-1 page units with 1571 * allocate greater than order-1 page units with
1572 * __GFP_NOFAIL. 1572 * __GFP_NOFAIL.
1573 */ 1573 */
1574 WARN_ON_ONCE(order > 1); 1574 WARN_ON_ONCE(order > 1);
1575 } 1575 }
1576 spin_lock_irqsave(&zone->lock, flags); 1576 spin_lock_irqsave(&zone->lock, flags);
1577 page = __rmqueue(zone, order, migratetype); 1577 page = __rmqueue(zone, order, migratetype);
1578 spin_unlock(&zone->lock); 1578 spin_unlock(&zone->lock);
1579 if (!page) 1579 if (!page)
1580 goto failed; 1580 goto failed;
1581 __mod_zone_freepage_state(zone, -(1 << order), 1581 __mod_zone_freepage_state(zone, -(1 << order),
1582 get_freepage_migratetype(page)); 1582 get_freepage_migratetype(page));
1583 } 1583 }
1584 1584
1585 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1585 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1586 1586
1587 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1587 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1588 zone_statistics(preferred_zone, zone, gfp_flags); 1588 zone_statistics(preferred_zone, zone, gfp_flags);
1589 local_irq_restore(flags); 1589 local_irq_restore(flags);
1590 1590
1591 VM_BUG_ON(bad_range(zone, page)); 1591 VM_BUG_ON(bad_range(zone, page));
1592 if (prep_new_page(page, order, gfp_flags)) 1592 if (prep_new_page(page, order, gfp_flags))
1593 goto again; 1593 goto again;
1594 return page; 1594 return page;
1595 1595
1596 failed: 1596 failed:
1597 local_irq_restore(flags); 1597 local_irq_restore(flags);
1598 return NULL; 1598 return NULL;
1599 } 1599 }
1600 1600
1601 #ifdef CONFIG_FAIL_PAGE_ALLOC 1601 #ifdef CONFIG_FAIL_PAGE_ALLOC
1602 1602
1603 static struct { 1603 static struct {
1604 struct fault_attr attr; 1604 struct fault_attr attr;
1605 1605
1606 u32 ignore_gfp_highmem; 1606 u32 ignore_gfp_highmem;
1607 u32 ignore_gfp_wait; 1607 u32 ignore_gfp_wait;
1608 u32 min_order; 1608 u32 min_order;
1609 } fail_page_alloc = { 1609 } fail_page_alloc = {
1610 .attr = FAULT_ATTR_INITIALIZER, 1610 .attr = FAULT_ATTR_INITIALIZER,
1611 .ignore_gfp_wait = 1, 1611 .ignore_gfp_wait = 1,
1612 .ignore_gfp_highmem = 1, 1612 .ignore_gfp_highmem = 1,
1613 .min_order = 1, 1613 .min_order = 1,
1614 }; 1614 };
1615 1615
1616 static int __init setup_fail_page_alloc(char *str) 1616 static int __init setup_fail_page_alloc(char *str)
1617 { 1617 {
1618 return setup_fault_attr(&fail_page_alloc.attr, str); 1618 return setup_fault_attr(&fail_page_alloc.attr, str);
1619 } 1619 }
1620 __setup("fail_page_alloc=", setup_fail_page_alloc); 1620 __setup("fail_page_alloc=", setup_fail_page_alloc);
1621 1621
1622 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1622 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1623 { 1623 {
1624 if (order < fail_page_alloc.min_order) 1624 if (order < fail_page_alloc.min_order)
1625 return false; 1625 return false;
1626 if (gfp_mask & __GFP_NOFAIL) 1626 if (gfp_mask & __GFP_NOFAIL)
1627 return false; 1627 return false;
1628 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1628 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1629 return false; 1629 return false;
1630 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1630 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1631 return false; 1631 return false;
1632 1632
1633 return should_fail(&fail_page_alloc.attr, 1 << order); 1633 return should_fail(&fail_page_alloc.attr, 1 << order);
1634 } 1634 }
1635 1635
1636 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1636 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1637 1637
1638 static int __init fail_page_alloc_debugfs(void) 1638 static int __init fail_page_alloc_debugfs(void)
1639 { 1639 {
1640 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1640 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1641 struct dentry *dir; 1641 struct dentry *dir;
1642 1642
1643 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1643 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1644 &fail_page_alloc.attr); 1644 &fail_page_alloc.attr);
1645 if (IS_ERR(dir)) 1645 if (IS_ERR(dir))
1646 return PTR_ERR(dir); 1646 return PTR_ERR(dir);
1647 1647
1648 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1648 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1649 &fail_page_alloc.ignore_gfp_wait)) 1649 &fail_page_alloc.ignore_gfp_wait))
1650 goto fail; 1650 goto fail;
1651 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1651 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1652 &fail_page_alloc.ignore_gfp_highmem)) 1652 &fail_page_alloc.ignore_gfp_highmem))
1653 goto fail; 1653 goto fail;
1654 if (!debugfs_create_u32("min-order", mode, dir, 1654 if (!debugfs_create_u32("min-order", mode, dir,
1655 &fail_page_alloc.min_order)) 1655 &fail_page_alloc.min_order))
1656 goto fail; 1656 goto fail;
1657 1657
1658 return 0; 1658 return 0;
1659 fail: 1659 fail:
1660 debugfs_remove_recursive(dir); 1660 debugfs_remove_recursive(dir);
1661 1661
1662 return -ENOMEM; 1662 return -ENOMEM;
1663 } 1663 }
1664 1664
1665 late_initcall(fail_page_alloc_debugfs); 1665 late_initcall(fail_page_alloc_debugfs);
1666 1666
1667 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1667 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1668 1668
1669 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1669 #else /* CONFIG_FAIL_PAGE_ALLOC */
1670 1670
1671 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1671 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1672 { 1672 {
1673 return false; 1673 return false;
1674 } 1674 }
1675 1675
1676 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1676 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1677 1677
1678 /* 1678 /*
1679 * Return true if free pages are above 'mark'. This takes into account the order 1679 * Return true if free pages are above 'mark'. This takes into account the order
1680 * of the allocation. 1680 * of the allocation.
1681 */ 1681 */
1682 static bool __zone_watermark_ok(struct zone *z, unsigned int order, 1682 static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1683 unsigned long mark, int classzone_idx, int alloc_flags, 1683 unsigned long mark, int classzone_idx, int alloc_flags,
1684 long free_pages) 1684 long free_pages)
1685 { 1685 {
1686 /* free_pages may go negative - that's OK */ 1686 /* free_pages may go negative - that's OK */
1687 long min = mark; 1687 long min = mark;
1688 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1688 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1689 int o; 1689 int o;
1690 long free_cma = 0; 1690 long free_cma = 0;
1691 1691
1692 free_pages -= (1 << order) - 1; 1692 free_pages -= (1 << order) - 1;
1693 if (alloc_flags & ALLOC_HIGH) 1693 if (alloc_flags & ALLOC_HIGH)
1694 min -= min / 2; 1694 min -= min / 2;
1695 if (alloc_flags & ALLOC_HARDER) 1695 if (alloc_flags & ALLOC_HARDER)
1696 min -= min / 4; 1696 min -= min / 4;
1697 #ifdef CONFIG_CMA 1697 #ifdef CONFIG_CMA
1698 /* If allocation can't use CMA areas don't use free CMA pages */ 1698 /* If allocation can't use CMA areas don't use free CMA pages */
1699 if (!(alloc_flags & ALLOC_CMA)) 1699 if (!(alloc_flags & ALLOC_CMA))
1700 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1700 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1701 #endif 1701 #endif
1702 1702
1703 if (free_pages - free_cma <= min + lowmem_reserve) 1703 if (free_pages - free_cma <= min + lowmem_reserve)
1704 return false; 1704 return false;
1705 for (o = 0; o < order; o++) { 1705 for (o = 0; o < order; o++) {
1706 /* At the next order, this order's pages become unavailable */ 1706 /* At the next order, this order's pages become unavailable */
1707 free_pages -= z->free_area[o].nr_free << o; 1707 free_pages -= z->free_area[o].nr_free << o;
1708 1708
1709 /* Require fewer higher order pages to be free */ 1709 /* Require fewer higher order pages to be free */
1710 min >>= 1; 1710 min >>= 1;
1711 1711
1712 if (free_pages <= min) 1712 if (free_pages <= min)
1713 return false; 1713 return false;
1714 } 1714 }
1715 return true; 1715 return true;
1716 } 1716 }
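
To see what the loop in __zone_watermark_ok() above is doing, here is a simplified standalone sketch (an illustration only: it ignores lowmem_reserve, the ALLOC_HIGH/ALLOC_HARDER adjustments and the CMA accounting, and the per-order free counts and watermark are invented numbers):

#include <stdbool.h>
#include <stdio.h>

#define NR_ORDERS 11 /* mirrors MAX_ORDER in this kernel; illustrative only */

/*
 * Simplified version of the __zone_watermark_ok() loop: an order-N request
 * first needs the total free count (minus the 2^N - 1 pages that cannot be
 * part of the block) to be above the watermark, then, for every order below
 * N, it discards that order's pages (they are too small to help) while
 * halving the required minimum.
 */
static bool watermark_ok(unsigned int order, long mark, const long *nr_free)
{
	long free_pages = 0;
	long min = mark;
	unsigned int o;

	for (o = 0; o < NR_ORDERS; o++)
		free_pages += nr_free[o] << o;

	free_pages -= (1 << order) - 1;
	if (free_pages <= min)
		return false;

	for (o = 0; o < order; o++) {
		free_pages -= nr_free[o] << o;
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* Invented per-order free page counts: 200 order-0, 100 order-1, ... */
	long nr_free[NR_ORDERS] = { 200, 100, 40, 10 };

	/* 640 total free pages against a watermark of 128 for an order-2 request */
	printf("order-2 ok: %d\n", watermark_ok(2, 128, nr_free));
	return 0;
}
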
1717 1717
1718 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 1718 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
1719 int classzone_idx, int alloc_flags) 1719 int classzone_idx, int alloc_flags)
1720 { 1720 {
1721 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1721 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1722 zone_page_state(z, NR_FREE_PAGES)); 1722 zone_page_state(z, NR_FREE_PAGES));
1723 } 1723 }
1724 1724
1725 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 1725 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
1726 unsigned long mark, int classzone_idx, int alloc_flags) 1726 unsigned long mark, int classzone_idx, int alloc_flags)
1727 { 1727 {
1728 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1728 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1729 1729
1730 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1730 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1731 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1731 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1732 1732
1733 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1733 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1734 free_pages); 1734 free_pages);
1735 } 1735 }
1736 1736
1737 #ifdef CONFIG_NUMA 1737 #ifdef CONFIG_NUMA
1738 /* 1738 /*
1739 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1739 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1740 * skip over zones that are not allowed by the cpuset, or that have 1740 * skip over zones that are not allowed by the cpuset, or that have
1741 * been recently (in last second) found to be nearly full. See further 1741 * been recently (in last second) found to be nearly full. See further
1742 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1742 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1743 * that have to skip over a lot of full or unallowed zones. 1743 * that have to skip over a lot of full or unallowed zones.
1744 * 1744 *
1745 * If the zonelist cache is present in the passed in zonelist, then 1745 * If the zonelist cache is present in the passed in zonelist, then
1746 * returns a pointer to the allowed node mask (either the current 1746 * returns a pointer to the allowed node mask (either the current
1747 * task's mems_allowed, or node_states[N_MEMORY].) 1747 * task's mems_allowed, or node_states[N_MEMORY].)
1748 * 1748 *
1749 * If the zonelist cache is not available for this zonelist, does 1749 * If the zonelist cache is not available for this zonelist, does
1750 * nothing and returns NULL. 1750 * nothing and returns NULL.
1751 * 1751 *
1752 * If the fullzones BITMAP in the zonelist cache is stale (more than 1752 * If the fullzones BITMAP in the zonelist cache is stale (more than
1753 * a second since last zap'd) then we zap it out (clear its bits.) 1753 * a second since last zap'd) then we zap it out (clear its bits.)
1754 * 1754 *
1755 * We hold off even calling zlc_setup, until after we've checked the 1755 * We hold off even calling zlc_setup, until after we've checked the
1756 * first zone in the zonelist, on the theory that most allocations will 1756 * first zone in the zonelist, on the theory that most allocations will
1757 * be satisfied from that first zone, so best to examine that zone as 1757 * be satisfied from that first zone, so best to examine that zone as
1758 * quickly as we can. 1758 * quickly as we can.
1759 */ 1759 */
1760 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1760 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1761 { 1761 {
1762 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1762 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1763 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1763 nodemask_t *allowednodes; /* zonelist_cache approximation */
1764 1764
1765 zlc = zonelist->zlcache_ptr; 1765 zlc = zonelist->zlcache_ptr;
1766 if (!zlc) 1766 if (!zlc)
1767 return NULL; 1767 return NULL;
1768 1768
1769 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1769 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1770 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1770 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1771 zlc->last_full_zap = jiffies; 1771 zlc->last_full_zap = jiffies;
1772 } 1772 }
1773 1773
1774 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1774 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1775 &cpuset_current_mems_allowed : 1775 &cpuset_current_mems_allowed :
1776 &node_states[N_MEMORY]; 1776 &node_states[N_MEMORY];
1777 return allowednodes; 1777 return allowednodes;
1778 } 1778 }
1779 1779
1780 /* 1780 /*
1781 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1781 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1782 * if it is worth looking at further for free memory: 1782 * if it is worth looking at further for free memory:
1783 * 1) Check that the zone isn't thought to be full (doesn't have its 1783 * 1) Check that the zone isn't thought to be full (doesn't have its
1784 * bit set in the zonelist_cache fullzones BITMAP). 1784 * bit set in the zonelist_cache fullzones BITMAP).
1785 * 2) Check that the zone's node (obtained from the zonelist_cache 1785 * 2) Check that the zone's node (obtained from the zonelist_cache
1786 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1786 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1787 * Return true (non-zero) if zone is worth looking at further, or 1787 * Return true (non-zero) if zone is worth looking at further, or
1788 * else return false (zero) if it is not. 1788 * else return false (zero) if it is not.
1789 * 1789 *
1790 * This check -ignores- the distinction between various watermarks, 1790 * This check -ignores- the distinction between various watermarks,
1791 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1791 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1792 * found to be full for any variation of these watermarks, it will 1792 * found to be full for any variation of these watermarks, it will
1793 * be considered full for up to one second by all requests, unless 1793 * be considered full for up to one second by all requests, unless
1794 * we are so low on memory on all allowed nodes that we are forced 1794 * we are so low on memory on all allowed nodes that we are forced
1795 * into the second scan of the zonelist. 1795 * into the second scan of the zonelist.
1796 * 1796 *
1797 * In the second scan we ignore this zonelist cache and exactly 1797 * In the second scan we ignore this zonelist cache and exactly
1798 * apply the watermarks to all zones, even if it is slower to do so. 1798 * apply the watermarks to all zones, even if it is slower to do so.
1799 * We are low on memory in the second scan, and should leave no stone 1799 * We are low on memory in the second scan, and should leave no stone
1800 * unturned looking for a free page. 1800 * unturned looking for a free page.
1801 */ 1801 */
1802 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1802 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1803 nodemask_t *allowednodes) 1803 nodemask_t *allowednodes)
1804 { 1804 {
1805 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1805 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1806 int i; /* index of *z in zonelist zones */ 1806 int i; /* index of *z in zonelist zones */
1807 int n; /* node that zone *z is on */ 1807 int n; /* node that zone *z is on */
1808 1808
1809 zlc = zonelist->zlcache_ptr; 1809 zlc = zonelist->zlcache_ptr;
1810 if (!zlc) 1810 if (!zlc)
1811 return 1; 1811 return 1;
1812 1812
1813 i = z - zonelist->_zonerefs; 1813 i = z - zonelist->_zonerefs;
1814 n = zlc->z_to_n[i]; 1814 n = zlc->z_to_n[i];
1815 1815
1816 /* This zone is worth trying if it is allowed but not full */ 1816 /* This zone is worth trying if it is allowed but not full */
1817 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1817 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1818 } 1818 }
1819 1819
1820 /* 1820 /*
1821 * Given 'z' scanning a zonelist, set the corresponding bit in 1821 * Given 'z' scanning a zonelist, set the corresponding bit in
1822 * zlc->fullzones, so that subsequent attempts to allocate a page 1822 * zlc->fullzones, so that subsequent attempts to allocate a page
1823 * from that zone don't waste time re-examining it. 1823 * from that zone don't waste time re-examining it.
1824 */ 1824 */
1825 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1825 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1826 { 1826 {
1827 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1827 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1828 int i; /* index of *z in zonelist zones */ 1828 int i; /* index of *z in zonelist zones */
1829 1829
1830 zlc = zonelist->zlcache_ptr; 1830 zlc = zonelist->zlcache_ptr;
1831 if (!zlc) 1831 if (!zlc)
1832 return; 1832 return;
1833 1833
1834 i = z - zonelist->_zonerefs; 1834 i = z - zonelist->_zonerefs;
1835 1835
1836 set_bit(i, zlc->fullzones); 1836 set_bit(i, zlc->fullzones);
1837 } 1837 }
1838 1838
1839 /* 1839 /*
1840 * clear all zones full, called after direct reclaim makes progress so that 1840 * clear all zones full, called after direct reclaim makes progress so that
1841 * a zone that was recently full is not skipped over for up to a second 1841 * a zone that was recently full is not skipped over for up to a second
1842 */ 1842 */
1843 static void zlc_clear_zones_full(struct zonelist *zonelist) 1843 static void zlc_clear_zones_full(struct zonelist *zonelist)
1844 { 1844 {
1845 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1845 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1846 1846
1847 zlc = zonelist->zlcache_ptr; 1847 zlc = zonelist->zlcache_ptr;
1848 if (!zlc) 1848 if (!zlc)
1849 return; 1849 return;
1850 1850
1851 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1851 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1852 } 1852 }
1853 1853
1854 static bool zone_local(struct zone *local_zone, struct zone *zone) 1854 static bool zone_local(struct zone *local_zone, struct zone *zone)
1855 { 1855 {
1856 return local_zone->node == zone->node; 1856 return local_zone->node == zone->node;
1857 } 1857 }
1858 1858
1859 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1859 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1860 { 1860 {
1861 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1861 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1862 } 1862 }
1863 1863
1864 static void __paginginit init_zone_allows_reclaim(int nid) 1864 static void __paginginit init_zone_allows_reclaim(int nid)
1865 { 1865 {
1866 int i; 1866 int i;
1867 1867
1868 for_each_node_state(i, N_MEMORY) 1868 for_each_node_state(i, N_MEMORY)
1869 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1869 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1870 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1870 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1871 else 1871 else
1872 zone_reclaim_mode = 1; 1872 zone_reclaim_mode = 1;
1873 } 1873 }
1874 1874
1875 #else /* CONFIG_NUMA */ 1875 #else /* CONFIG_NUMA */
1876 1876
1877 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1877 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1878 { 1878 {
1879 return NULL; 1879 return NULL;
1880 } 1880 }
1881 1881
1882 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1882 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1883 nodemask_t *allowednodes) 1883 nodemask_t *allowednodes)
1884 { 1884 {
1885 return 1; 1885 return 1;
1886 } 1886 }
1887 1887
1888 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1888 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1889 { 1889 {
1890 } 1890 }
1891 1891
1892 static void zlc_clear_zones_full(struct zonelist *zonelist) 1892 static void zlc_clear_zones_full(struct zonelist *zonelist)
1893 { 1893 {
1894 } 1894 }
1895 1895
1896 static bool zone_local(struct zone *local_zone, struct zone *zone) 1896 static bool zone_local(struct zone *local_zone, struct zone *zone)
1897 { 1897 {
1898 return true; 1898 return true;
1899 } 1899 }
1900 1900
1901 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1901 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1902 { 1902 {
1903 return true; 1903 return true;
1904 } 1904 }
1905 1905
1906 static inline void init_zone_allows_reclaim(int nid) 1906 static inline void init_zone_allows_reclaim(int nid)
1907 { 1907 {
1908 } 1908 }
1909 #endif /* CONFIG_NUMA */ 1909 #endif /* CONFIG_NUMA */
1910 1910
1911 /* 1911 /*
1912 * get_page_from_freelist goes through the zonelist trying to allocate 1912 * get_page_from_freelist goes through the zonelist trying to allocate
1913 * a page. 1913 * a page.
1914 */ 1914 */
1915 static struct page * 1915 static struct page *
1916 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1916 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1917 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1917 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1918 struct zone *preferred_zone, int classzone_idx, int migratetype) 1918 struct zone *preferred_zone, int classzone_idx, int migratetype)
1919 { 1919 {
1920 struct zoneref *z; 1920 struct zoneref *z;
1921 struct page *page = NULL; 1921 struct page *page = NULL;
1922 struct zone *zone; 1922 struct zone *zone;
1923 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1923 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1924 int zlc_active = 0; /* set if using zonelist_cache */ 1924 int zlc_active = 0; /* set if using zonelist_cache */
1925 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1925 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1926 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && 1926 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1927 (gfp_mask & __GFP_WRITE); 1927 (gfp_mask & __GFP_WRITE);
1928 1928
1929 zonelist_scan: 1929 zonelist_scan:
1930 /* 1930 /*
1931 * Scan zonelist, looking for a zone with enough free. 1931 * Scan zonelist, looking for a zone with enough free.
1932 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1932 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1933 */ 1933 */
1934 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1934 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1935 high_zoneidx, nodemask) { 1935 high_zoneidx, nodemask) {
1936 unsigned long mark; 1936 unsigned long mark;
1937 1937
1938 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1938 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1939 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1939 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1940 continue; 1940 continue;
1941 if (cpusets_enabled() && 1941 if (cpusets_enabled() &&
1942 (alloc_flags & ALLOC_CPUSET) && 1942 (alloc_flags & ALLOC_CPUSET) &&
1943 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1943 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1944 continue; 1944 continue;
1945 /* 1945 /*
1946 * Distribute pages in proportion to the individual 1946 * Distribute pages in proportion to the individual
1947 * zone size to ensure fair page aging. The zone a 1947 * zone size to ensure fair page aging. The zone a
1948 * page was allocated in should have no effect on the 1948 * page was allocated in should have no effect on the
1949 * time the page has in memory before being reclaimed. 1949 * time the page has in memory before being reclaimed.
1950 */ 1950 */
1951 if (alloc_flags & ALLOC_FAIR) { 1951 if (alloc_flags & ALLOC_FAIR) {
1952 if (!zone_local(preferred_zone, zone)) 1952 if (!zone_local(preferred_zone, zone))
1953 continue; 1953 continue;
1954 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1954 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1955 continue; 1955 continue;
1956 } 1956 }
1957 /* 1957 /*
1958 * When allocating a page cache page for writing, we 1958 * When allocating a page cache page for writing, we
1959 * want to get it from a zone that is within its dirty 1959 * want to get it from a zone that is within its dirty
1960 * limit, such that no single zone holds more than its 1960 * limit, such that no single zone holds more than its
1961 * proportional share of globally allowed dirty pages. 1961 * proportional share of globally allowed dirty pages.
1962 * The dirty limits take into account the zone's 1962 * The dirty limits take into account the zone's
1963 * lowmem reserves and high watermark so that kswapd 1963 * lowmem reserves and high watermark so that kswapd
1964 * should be able to balance it without having to 1964 * should be able to balance it without having to
1965 * write pages from its LRU list. 1965 * write pages from its LRU list.
1966 * 1966 *
1967 * This may look like it could increase pressure on 1967 * This may look like it could increase pressure on
1968 * lower zones by failing allocations in higher zones 1968 * lower zones by failing allocations in higher zones
1969 * before they are full. But the pages that do spill 1969 * before they are full. But the pages that do spill
1970 * over are limited as the lower zones are protected 1970 * over are limited as the lower zones are protected
1971 * by this very same mechanism. It should not become 1971 * by this very same mechanism. It should not become
1972 * a practical burden to them. 1972 * a practical burden to them.
1973 * 1973 *
1974 * XXX: For now, allow allocations to potentially 1974 * XXX: For now, allow allocations to potentially
1975 * exceed the per-zone dirty limit in the slowpath 1975 * exceed the per-zone dirty limit in the slowpath
1976 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1976 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1977 * which is important when on a NUMA setup the allowed 1977 * which is important when on a NUMA setup the allowed
1978 * zones are together not big enough to reach the 1978 * zones are together not big enough to reach the
1979 * global limit. The proper fix for these situations 1979 * global limit. The proper fix for these situations
1980 * will require awareness of zones in the 1980 * will require awareness of zones in the
1981 * dirty-throttling and the flusher threads. 1981 * dirty-throttling and the flusher threads.
1982 */ 1982 */
1983 if (consider_zone_dirty && !zone_dirty_ok(zone)) 1983 if (consider_zone_dirty && !zone_dirty_ok(zone))
1984 continue; 1984 continue;
1985 1985
1986 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1986 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1987 if (!zone_watermark_ok(zone, order, mark, 1987 if (!zone_watermark_ok(zone, order, mark,
1988 classzone_idx, alloc_flags)) { 1988 classzone_idx, alloc_flags)) {
1989 int ret; 1989 int ret;
1990 1990
1991 /* Checked here to keep the fast path fast */ 1991 /* Checked here to keep the fast path fast */
1992 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1992 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1993 if (alloc_flags & ALLOC_NO_WATERMARKS) 1993 if (alloc_flags & ALLOC_NO_WATERMARKS)
1994 goto try_this_zone; 1994 goto try_this_zone;
1995 1995
1996 if (IS_ENABLED(CONFIG_NUMA) && 1996 if (IS_ENABLED(CONFIG_NUMA) &&
1997 !did_zlc_setup && nr_online_nodes > 1) { 1997 !did_zlc_setup && nr_online_nodes > 1) {
1998 /* 1998 /*
1999 * we do zlc_setup if there are multiple nodes 1999 * we do zlc_setup if there are multiple nodes
2000 * and before considering the first zone allowed 2000 * and before considering the first zone allowed
2001 * by the cpuset. 2001 * by the cpuset.
2002 */ 2002 */
2003 allowednodes = zlc_setup(zonelist, alloc_flags); 2003 allowednodes = zlc_setup(zonelist, alloc_flags);
2004 zlc_active = 1; 2004 zlc_active = 1;
2005 did_zlc_setup = 1; 2005 did_zlc_setup = 1;
2006 } 2006 }
2007 2007
2008 if (zone_reclaim_mode == 0 || 2008 if (zone_reclaim_mode == 0 ||
2009 !zone_allows_reclaim(preferred_zone, zone)) 2009 !zone_allows_reclaim(preferred_zone, zone))
2010 goto this_zone_full; 2010 goto this_zone_full;
2011 2011
2012 /* 2012 /*
2013 * As we may have just activated ZLC, check if the first 2013 * As we may have just activated ZLC, check if the first
2014 * eligible zone has failed zone_reclaim recently. 2014 * eligible zone has failed zone_reclaim recently.
2015 */ 2015 */
2016 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 2016 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
2017 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 2017 !zlc_zone_worth_trying(zonelist, z, allowednodes))
2018 continue; 2018 continue;
2019 2019
2020 ret = zone_reclaim(zone, gfp_mask, order); 2020 ret = zone_reclaim(zone, gfp_mask, order);
2021 switch (ret) { 2021 switch (ret) {
2022 case ZONE_RECLAIM_NOSCAN: 2022 case ZONE_RECLAIM_NOSCAN:
2023 /* did not scan */ 2023 /* did not scan */
2024 continue; 2024 continue;
2025 case ZONE_RECLAIM_FULL: 2025 case ZONE_RECLAIM_FULL:
2026 /* scanned but unreclaimable */ 2026 /* scanned but unreclaimable */
2027 continue; 2027 continue;
2028 default: 2028 default:
2029 /* did we reclaim enough */ 2029 /* did we reclaim enough */
2030 if (zone_watermark_ok(zone, order, mark, 2030 if (zone_watermark_ok(zone, order, mark,
2031 classzone_idx, alloc_flags)) 2031 classzone_idx, alloc_flags))
2032 goto try_this_zone; 2032 goto try_this_zone;
2033 2033
2034 /* 2034 /*
2035 * Failed to reclaim enough to meet watermark. 2035 * Failed to reclaim enough to meet watermark.
2036 * Only mark the zone full if checking the min 2036 * Only mark the zone full if checking the min
2037 * watermark or if we failed to reclaim just 2037 * watermark or if we failed to reclaim just
2038 * 1<<order pages or else the page allocator 2038 * 1<<order pages or else the page allocator
2039 * fastpath will prematurely mark zones full 2039 * fastpath will prematurely mark zones full
2040 * when the watermark is between the low and 2040 * when the watermark is between the low and
2041 * min watermarks. 2041 * min watermarks.
2042 */ 2042 */
2043 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2043 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2044 ret == ZONE_RECLAIM_SOME) 2044 ret == ZONE_RECLAIM_SOME)
2045 goto this_zone_full; 2045 goto this_zone_full;
2046 2046
2047 continue; 2047 continue;
2048 } 2048 }
2049 } 2049 }
2050 2050
2051 try_this_zone: 2051 try_this_zone:
2052 page = buffered_rmqueue(preferred_zone, zone, order, 2052 page = buffered_rmqueue(preferred_zone, zone, order,
2053 gfp_mask, migratetype); 2053 gfp_mask, migratetype);
2054 if (page) 2054 if (page)
2055 break; 2055 break;
2056 this_zone_full: 2056 this_zone_full:
2057 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2057 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2058 zlc_mark_zone_full(zonelist, z); 2058 zlc_mark_zone_full(zonelist, z);
2059 } 2059 }
2060 2060
2061 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2061 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2062 /* Disable zlc cache for second zonelist scan */ 2062 /* Disable zlc cache for second zonelist scan */
2063 zlc_active = 0; 2063 zlc_active = 0;
2064 goto zonelist_scan; 2064 goto zonelist_scan;
2065 } 2065 }
2066 2066
2067 if (page) 2067 if (page)
2068 /* 2068 /*
2069 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2069 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2070 * necessary to allocate the page. The expectation is 2070 * necessary to allocate the page. The expectation is
2071 * that the caller is taking steps that will free more 2071 * that the caller is taking steps that will free more
2072 * memory. The caller should avoid the page being used 2072 * memory. The caller should avoid the page being used
2073 * for !PFMEMALLOC purposes. 2073 * for !PFMEMALLOC purposes.
2074 */ 2074 */
2075 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2075 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2076 2076
2077 return page; 2077 return page;
2078 } 2078 }
2079 2079
2080 /* 2080 /*
2081 * Large machines with many possible nodes should not always dump per-node 2081 * Large machines with many possible nodes should not always dump per-node
2082 * meminfo in irq context. 2082 * meminfo in irq context.
2083 */ 2083 */
2084 static inline bool should_suppress_show_mem(void) 2084 static inline bool should_suppress_show_mem(void)
2085 { 2085 {
2086 bool ret = false; 2086 bool ret = false;
2087 2087
2088 #if NODES_SHIFT > 8 2088 #if NODES_SHIFT > 8
2089 ret = in_interrupt(); 2089 ret = in_interrupt();
2090 #endif 2090 #endif
2091 return ret; 2091 return ret;
2092 } 2092 }
2093 2093
2094 static DEFINE_RATELIMIT_STATE(nopage_rs, 2094 static DEFINE_RATELIMIT_STATE(nopage_rs,
2095 DEFAULT_RATELIMIT_INTERVAL, 2095 DEFAULT_RATELIMIT_INTERVAL,
2096 DEFAULT_RATELIMIT_BURST); 2096 DEFAULT_RATELIMIT_BURST);
2097 2097
2098 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2098 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2099 { 2099 {
2100 unsigned int filter = SHOW_MEM_FILTER_NODES; 2100 unsigned int filter = SHOW_MEM_FILTER_NODES;
2101 2101
2102 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2102 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2103 debug_guardpage_minorder() > 0) 2103 debug_guardpage_minorder() > 0)
2104 return; 2104 return;
2105 2105
2106 /* 2106 /*
2107 * Walking all memory to count page types is very expensive and should 2107 * Walking all memory to count page types is very expensive and should
2108 * be inhibited in non-blockable contexts. 2108 * be inhibited in non-blockable contexts.
2109 */ 2109 */
2110 if (!(gfp_mask & __GFP_WAIT)) 2110 if (!(gfp_mask & __GFP_WAIT))
2111 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2111 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2112 2112
2113 /* 2113 /*
2114 * This documents exceptions given to allocations in certain 2114 * This documents exceptions given to allocations in certain
2115 * contexts that are allowed to allocate outside current's set 2115 * contexts that are allowed to allocate outside current's set
2116 * of allowed nodes. 2116 * of allowed nodes.
2117 */ 2117 */
2118 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2118 if (!(gfp_mask & __GFP_NOMEMALLOC))
2119 if (test_thread_flag(TIF_MEMDIE) || 2119 if (test_thread_flag(TIF_MEMDIE) ||
2120 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2120 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2121 filter &= ~SHOW_MEM_FILTER_NODES; 2121 filter &= ~SHOW_MEM_FILTER_NODES;
2122 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2122 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2123 filter &= ~SHOW_MEM_FILTER_NODES; 2123 filter &= ~SHOW_MEM_FILTER_NODES;
2124 2124
2125 if (fmt) { 2125 if (fmt) {
2126 struct va_format vaf; 2126 struct va_format vaf;
2127 va_list args; 2127 va_list args;
2128 2128
2129 va_start(args, fmt); 2129 va_start(args, fmt);
2130 2130
2131 vaf.fmt = fmt; 2131 vaf.fmt = fmt;
2132 vaf.va = &args; 2132 vaf.va = &args;
2133 2133
2134 pr_warn("%pV", &vaf); 2134 pr_warn("%pV", &vaf);
2135 2135
2136 va_end(args); 2136 va_end(args);
2137 } 2137 }
2138 2138
2139 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2139 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2140 current->comm, order, gfp_mask); 2140 current->comm, order, gfp_mask);
2141 2141
2142 dump_stack(); 2142 dump_stack();
2143 if (!should_suppress_show_mem()) 2143 if (!should_suppress_show_mem())
2144 show_mem(filter); 2144 show_mem(filter);
2145 } 2145 }
2146 2146
2147 static inline int 2147 static inline int
2148 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2148 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2149 unsigned long did_some_progress, 2149 unsigned long did_some_progress,
2150 unsigned long pages_reclaimed) 2150 unsigned long pages_reclaimed)
2151 { 2151 {
2152 /* Do not loop if specifically requested */ 2152 /* Do not loop if specifically requested */
2153 if (gfp_mask & __GFP_NORETRY) 2153 if (gfp_mask & __GFP_NORETRY)
2154 return 0; 2154 return 0;
2155 2155
2156 /* Always retry if specifically requested */ 2156 /* Always retry if specifically requested */
2157 if (gfp_mask & __GFP_NOFAIL) 2157 if (gfp_mask & __GFP_NOFAIL)
2158 return 1; 2158 return 1;
2159 2159
2160 /* 2160 /*
2161 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2161 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2162 * making forward progress without invoking OOM. Suspend also disables 2162 * making forward progress without invoking OOM. Suspend also disables
2163 * storage devices so kswapd will not help. Bail if we are suspending. 2163 * storage devices so kswapd will not help. Bail if we are suspending.
2164 */ 2164 */
2165 if (!did_some_progress && pm_suspended_storage()) 2165 if (!did_some_progress && pm_suspended_storage())
2166 return 0; 2166 return 0;
2167 2167
2168 /* 2168 /*
2169 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2169 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2170 * means __GFP_NOFAIL, but that may not be true in other 2170 * means __GFP_NOFAIL, but that may not be true in other
2171 * implementations. 2171 * implementations.
2172 */ 2172 */
2173 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2173 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2174 return 1; 2174 return 1;
2175 2175
2176 /* 2176 /*
2177 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2177 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2178 * specified, then we retry until we no longer reclaim any pages 2178 * specified, then we retry until we no longer reclaim any pages
2179 * (above), or we've reclaimed an order of pages at least as 2179 * (above), or we've reclaimed an order of pages at least as
2180 * large as the allocation's order. In both cases, if the 2180 * large as the allocation's order. In both cases, if the
2181 * allocation still fails, we stop retrying. 2181 * allocation still fails, we stop retrying.
2182 */ 2182 */
2183 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2183 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2184 return 1; 2184 return 1;
2185 2185
2186 return 0; 2186 return 0;
2187 } 2187 }
2188 2188
2189 static inline struct page * 2189 static inline struct page *
2190 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2190 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2191 struct zonelist *zonelist, enum zone_type high_zoneidx, 2191 struct zonelist *zonelist, enum zone_type high_zoneidx,
2192 nodemask_t *nodemask, struct zone *preferred_zone, 2192 nodemask_t *nodemask, struct zone *preferred_zone,
2193 int classzone_idx, int migratetype) 2193 int classzone_idx, int migratetype)
2194 { 2194 {
2195 struct page *page; 2195 struct page *page;
2196 2196
2197 /* Acquire the OOM killer lock for the zones in zonelist */ 2197 /* Acquire the OOM killer lock for the zones in zonelist */
2198 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2198 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2199 schedule_timeout_uninterruptible(1); 2199 schedule_timeout_uninterruptible(1);
2200 return NULL; 2200 return NULL;
2201 } 2201 }
2202 2202
2203 /* 2203 /*
2204 * Go through the zonelist yet one more time, keep very high watermark 2204 * Go through the zonelist yet one more time, keep very high watermark
2205 * here, this is only to catch a parallel oom killing, we must fail if 2205 * here, this is only to catch a parallel oom killing, we must fail if
2206 * we're still under heavy pressure. 2206 * we're still under heavy pressure.
2207 */ 2207 */
2208 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2208 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2209 order, zonelist, high_zoneidx, 2209 order, zonelist, high_zoneidx,
2210 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2210 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2211 preferred_zone, classzone_idx, migratetype); 2211 preferred_zone, classzone_idx, migratetype);
2212 if (page) 2212 if (page)
2213 goto out; 2213 goto out;
2214 2214
2215 if (!(gfp_mask & __GFP_NOFAIL)) { 2215 if (!(gfp_mask & __GFP_NOFAIL)) {
2216 /* The OOM killer will not help higher order allocs */ 2216 /* The OOM killer will not help higher order allocs */
2217 if (order > PAGE_ALLOC_COSTLY_ORDER) 2217 if (order > PAGE_ALLOC_COSTLY_ORDER)
2218 goto out; 2218 goto out;
2219 /* The OOM killer does not needlessly kill tasks for lowmem */ 2219 /* The OOM killer does not needlessly kill tasks for lowmem */
2220 if (high_zoneidx < ZONE_NORMAL) 2220 if (high_zoneidx < ZONE_NORMAL)
2221 goto out; 2221 goto out;
2222 /* 2222 /*
2223 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2223 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2224 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2224 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2225 * The caller should handle page allocation failure by itself if 2225 * The caller should handle page allocation failure by itself if
2226 * it specifies __GFP_THISNODE. 2226 * it specifies __GFP_THISNODE.
2227 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2227 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2228 */ 2228 */
2229 if (gfp_mask & __GFP_THISNODE) 2229 if (gfp_mask & __GFP_THISNODE)
2230 goto out; 2230 goto out;
2231 } 2231 }
2232 /* Exhausted what can be done so it's blamo time */ 2232 /* Exhausted what can be done so it's blamo time */
2233 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2233 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2234 2234
2235 out: 2235 out:
2236 clear_zonelist_oom(zonelist, gfp_mask); 2236 clear_zonelist_oom(zonelist, gfp_mask);
2237 return page; 2237 return page;
2238 } 2238 }
2239 2239
2240 #ifdef CONFIG_COMPACTION 2240 #ifdef CONFIG_COMPACTION
2241 /* Try memory compaction for high-order allocations before reclaim */ 2241 /* Try memory compaction for high-order allocations before reclaim */
2242 static struct page * 2242 static struct page *
2243 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2243 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2244 struct zonelist *zonelist, enum zone_type high_zoneidx, 2244 struct zonelist *zonelist, enum zone_type high_zoneidx,
2245 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2245 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2246 int classzone_idx, int migratetype, enum migrate_mode mode, 2246 int classzone_idx, int migratetype, enum migrate_mode mode,
2247 bool *contended_compaction, bool *deferred_compaction, 2247 bool *contended_compaction, bool *deferred_compaction,
2248 unsigned long *did_some_progress) 2248 unsigned long *did_some_progress)
2249 { 2249 {
2250 if (!order) 2250 if (!order)
2251 return NULL; 2251 return NULL;
2252 2252
2253 if (compaction_deferred(preferred_zone, order)) { 2253 if (compaction_deferred(preferred_zone, order)) {
2254 *deferred_compaction = true; 2254 *deferred_compaction = true;
2255 return NULL; 2255 return NULL;
2256 } 2256 }
2257 2257
2258 current->flags |= PF_MEMALLOC; 2258 current->flags |= PF_MEMALLOC;
2259 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2259 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2260 nodemask, mode, 2260 nodemask, mode,
2261 contended_compaction); 2261 contended_compaction);
2262 current->flags &= ~PF_MEMALLOC; 2262 current->flags &= ~PF_MEMALLOC;
2263 2263
2264 if (*did_some_progress != COMPACT_SKIPPED) { 2264 if (*did_some_progress != COMPACT_SKIPPED) {
2265 struct page *page; 2265 struct page *page;
2266 2266
2267 /* Page migration frees to the PCP lists but we want merging */ 2267 /* Page migration frees to the PCP lists but we want merging */
2268 drain_pages(get_cpu()); 2268 drain_pages(get_cpu());
2269 put_cpu(); 2269 put_cpu();
2270 2270
2271 page = get_page_from_freelist(gfp_mask, nodemask, 2271 page = get_page_from_freelist(gfp_mask, nodemask,
2272 order, zonelist, high_zoneidx, 2272 order, zonelist, high_zoneidx,
2273 alloc_flags & ~ALLOC_NO_WATERMARKS, 2273 alloc_flags & ~ALLOC_NO_WATERMARKS,
2274 preferred_zone, classzone_idx, migratetype); 2274 preferred_zone, classzone_idx, migratetype);
2275 if (page) { 2275 if (page) {
2276 preferred_zone->compact_blockskip_flush = false; 2276 preferred_zone->compact_blockskip_flush = false;
2277 compaction_defer_reset(preferred_zone, order, true); 2277 compaction_defer_reset(preferred_zone, order, true);
2278 count_vm_event(COMPACTSUCCESS); 2278 count_vm_event(COMPACTSUCCESS);
2279 return page; 2279 return page;
2280 } 2280 }
2281 2281
2282 /* 2282 /*
2283 * It's bad if compaction run occurs and fails. 2283 * It's bad if compaction run occurs and fails.
2284 * The most likely reason is that pages exist, 2284 * The most likely reason is that pages exist,
2285 * but not enough to satisfy watermarks. 2285 * but not enough to satisfy watermarks.
2286 */ 2286 */
2287 count_vm_event(COMPACTFAIL); 2287 count_vm_event(COMPACTFAIL);
2288 2288
2289 /* 2289 /*
2290 * As async compaction considers a subset of pageblocks, only 2290 * As async compaction considers a subset of pageblocks, only
2291 * defer if the failure was a sync compaction failure. 2291 * defer if the failure was a sync compaction failure.
2292 */ 2292 */
2293 if (mode != MIGRATE_ASYNC) 2293 if (mode != MIGRATE_ASYNC)
2294 defer_compaction(preferred_zone, order); 2294 defer_compaction(preferred_zone, order);
2295 2295
2296 cond_resched(); 2296 cond_resched();
2297 } 2297 }
2298 2298
2299 return NULL; 2299 return NULL;
2300 } 2300 }
2301 #else 2301 #else
2302 static inline struct page * 2302 static inline struct page *
2303 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2303 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2304 struct zonelist *zonelist, enum zone_type high_zoneidx, 2304 struct zonelist *zonelist, enum zone_type high_zoneidx,
2305 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2305 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2306 int classzone_idx, int migratetype, 2306 int classzone_idx, int migratetype,
2307 enum migrate_mode mode, bool *contended_compaction, 2307 enum migrate_mode mode, bool *contended_compaction,
2308 bool *deferred_compaction, unsigned long *did_some_progress) 2308 bool *deferred_compaction, unsigned long *did_some_progress)
2309 { 2309 {
2310 return NULL; 2310 return NULL;
2311 } 2311 }
2312 #endif /* CONFIG_COMPACTION */ 2312 #endif /* CONFIG_COMPACTION */
2313 2313
2314 /* Perform direct synchronous page reclaim */ 2314 /* Perform direct synchronous page reclaim */
2315 static int 2315 static int
2316 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2316 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2317 nodemask_t *nodemask) 2317 nodemask_t *nodemask)
2318 { 2318 {
2319 struct reclaim_state reclaim_state; 2319 struct reclaim_state reclaim_state;
2320 int progress; 2320 int progress;
2321 2321
2322 cond_resched(); 2322 cond_resched();
2323 2323
2324 /* We now go into synchronous reclaim */ 2324 /* We now go into synchronous reclaim */
2325 cpuset_memory_pressure_bump(); 2325 cpuset_memory_pressure_bump();
2326 current->flags |= PF_MEMALLOC; 2326 current->flags |= PF_MEMALLOC;
2327 lockdep_set_current_reclaim_state(gfp_mask); 2327 lockdep_set_current_reclaim_state(gfp_mask);
2328 reclaim_state.reclaimed_slab = 0; 2328 reclaim_state.reclaimed_slab = 0;
2329 current->reclaim_state = &reclaim_state; 2329 current->reclaim_state = &reclaim_state;
2330 2330
2331 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2331 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2332 2332
2333 current->reclaim_state = NULL; 2333 current->reclaim_state = NULL;
2334 lockdep_clear_current_reclaim_state(); 2334 lockdep_clear_current_reclaim_state();
2335 current->flags &= ~PF_MEMALLOC; 2335 current->flags &= ~PF_MEMALLOC;
2336 2336
2337 cond_resched(); 2337 cond_resched();
2338 2338
2339 return progress; 2339 return progress;
2340 } 2340 }
2341 2341
2342 /* The really slow allocator path where we enter direct reclaim */ 2342 /* The really slow allocator path where we enter direct reclaim */
2343 static inline struct page * 2343 static inline struct page *
2344 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2344 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2345 struct zonelist *zonelist, enum zone_type high_zoneidx, 2345 struct zonelist *zonelist, enum zone_type high_zoneidx,
2346 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2346 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2347 int classzone_idx, int migratetype, unsigned long *did_some_progress) 2347 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2348 { 2348 {
2349 struct page *page = NULL; 2349 struct page *page = NULL;
2350 bool drained = false; 2350 bool drained = false;
2351 2351
2352 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2352 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2353 nodemask); 2353 nodemask);
2354 if (unlikely(!(*did_some_progress))) 2354 if (unlikely(!(*did_some_progress)))
2355 return NULL; 2355 return NULL;
2356 2356
2357 /* After successful reclaim, reconsider all zones for allocation */ 2357 /* After successful reclaim, reconsider all zones for allocation */
2358 if (IS_ENABLED(CONFIG_NUMA)) 2358 if (IS_ENABLED(CONFIG_NUMA))
2359 zlc_clear_zones_full(zonelist); 2359 zlc_clear_zones_full(zonelist);
2360 2360
2361 retry: 2361 retry:
2362 page = get_page_from_freelist(gfp_mask, nodemask, order, 2362 page = get_page_from_freelist(gfp_mask, nodemask, order,
2363 zonelist, high_zoneidx, 2363 zonelist, high_zoneidx,
2364 alloc_flags & ~ALLOC_NO_WATERMARKS, 2364 alloc_flags & ~ALLOC_NO_WATERMARKS,
2365 preferred_zone, classzone_idx, 2365 preferred_zone, classzone_idx,
2366 migratetype); 2366 migratetype);
2367 2367
2368 /* 2368 /*
2369 * If an allocation failed after direct reclaim, it could be because 2369 * If an allocation failed after direct reclaim, it could be because
2370 * pages are pinned on the per-cpu lists. Drain them and try again 2370 * pages are pinned on the per-cpu lists. Drain them and try again
2371 */ 2371 */
2372 if (!page && !drained) { 2372 if (!page && !drained) {
2373 drain_all_pages(); 2373 drain_all_pages();
2374 drained = true; 2374 drained = true;
2375 goto retry; 2375 goto retry;
2376 } 2376 }
2377 2377
2378 return page; 2378 return page;
2379 } 2379 }
2380 2380
2381 /* 2381 /*
2382 * This is called in the allocator slow-path if the allocation request is of 2382 * This is called in the allocator slow-path if the allocation request is of
2383 * sufficient urgency to ignore watermarks and take other desperate measures 2383 * sufficient urgency to ignore watermarks and take other desperate measures
2384 */ 2384 */
2385 static inline struct page * 2385 static inline struct page *
2386 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2386 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2387 struct zonelist *zonelist, enum zone_type high_zoneidx, 2387 struct zonelist *zonelist, enum zone_type high_zoneidx,
2388 nodemask_t *nodemask, struct zone *preferred_zone, 2388 nodemask_t *nodemask, struct zone *preferred_zone,
2389 int classzone_idx, int migratetype) 2389 int classzone_idx, int migratetype)
2390 { 2390 {
2391 struct page *page; 2391 struct page *page;
2392 2392
2393 do { 2393 do {
2394 page = get_page_from_freelist(gfp_mask, nodemask, order, 2394 page = get_page_from_freelist(gfp_mask, nodemask, order,
2395 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2395 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2396 preferred_zone, classzone_idx, migratetype); 2396 preferred_zone, classzone_idx, migratetype);
2397 2397
2398 if (!page && gfp_mask & __GFP_NOFAIL) 2398 if (!page && gfp_mask & __GFP_NOFAIL)
2399 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2399 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2400 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2400 } while (!page && (gfp_mask & __GFP_NOFAIL));
2401 2401
2402 return page; 2402 return page;
2403 } 2403 }
2404 2404
2405 static void reset_alloc_batches(struct zonelist *zonelist, 2405 static void reset_alloc_batches(struct zonelist *zonelist,
2406 enum zone_type high_zoneidx, 2406 enum zone_type high_zoneidx,
2407 struct zone *preferred_zone) 2407 struct zone *preferred_zone)
2408 { 2408 {
2409 struct zoneref *z; 2409 struct zoneref *z;
2410 struct zone *zone; 2410 struct zone *zone;
2411 2411
2412 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2412 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2413 /* 2413 /*
2414 * Only reset the batches of zones that were actually 2414 * Only reset the batches of zones that were actually
2415 * considered in the fairness pass, we don't want to 2415 * considered in the fairness pass, we don't want to
2416 * trash fairness information for zones that are not 2416 * trash fairness information for zones that are not
2417 * actually part of this zonelist's round-robin cycle. 2417 * actually part of this zonelist's round-robin cycle.
2418 */ 2418 */
2419 if (!zone_local(preferred_zone, zone)) 2419 if (!zone_local(preferred_zone, zone))
2420 continue; 2420 continue;
2421 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2421 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2422 high_wmark_pages(zone) - low_wmark_pages(zone) - 2422 high_wmark_pages(zone) - low_wmark_pages(zone) -
2423 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2423 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2424 } 2424 }
2425 } 2425 }
2426 2426
2427 static void wake_all_kswapds(unsigned int order, 2427 static void wake_all_kswapds(unsigned int order,
2428 struct zonelist *zonelist, 2428 struct zonelist *zonelist,
2429 enum zone_type high_zoneidx, 2429 enum zone_type high_zoneidx,
2430 struct zone *preferred_zone) 2430 struct zone *preferred_zone)
2431 { 2431 {
2432 struct zoneref *z; 2432 struct zoneref *z;
2433 struct zone *zone; 2433 struct zone *zone;
2434 2434
2435 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2435 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2436 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2436 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2437 } 2437 }
2438 2438
2439 static inline int 2439 static inline int
2440 gfp_to_alloc_flags(gfp_t gfp_mask) 2440 gfp_to_alloc_flags(gfp_t gfp_mask)
2441 { 2441 {
2442 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2442 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2443 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); 2443 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
2444 2444
2445 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2445 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2446 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2446 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2447 2447
2448 /* 2448 /*
2449 * The caller may dip into page reserves a bit more if the caller 2449 * The caller may dip into page reserves a bit more if the caller
2450 * cannot run direct reclaim, or if the caller has realtime scheduling 2450 * cannot run direct reclaim, or if the caller has realtime scheduling
2451 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2451 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2452 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 2452 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
2453 */ 2453 */
2454 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2454 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2455 2455
2456 if (atomic) { 2456 if (atomic) {
2457 /* 2457 /*
2458 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 2458 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
2459 * if it can't schedule. 2459 * if it can't schedule.
2460 */ 2460 */
2461 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2461 if (!(gfp_mask & __GFP_NOMEMALLOC))
2462 alloc_flags |= ALLOC_HARDER; 2462 alloc_flags |= ALLOC_HARDER;
2463 /* 2463 /*
2464 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2464 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2465 * comment for __cpuset_node_allowed_softwall(). 2465 * comment for __cpuset_node_allowed_softwall().
2466 */ 2466 */
2467 alloc_flags &= ~ALLOC_CPUSET; 2467 alloc_flags &= ~ALLOC_CPUSET;
2468 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2468 } else if (unlikely(rt_task(current)) && !in_interrupt())
2469 alloc_flags |= ALLOC_HARDER; 2469 alloc_flags |= ALLOC_HARDER;
2470 2470
2471 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2471 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2472 if (gfp_mask & __GFP_MEMALLOC) 2472 if (gfp_mask & __GFP_MEMALLOC)
2473 alloc_flags |= ALLOC_NO_WATERMARKS; 2473 alloc_flags |= ALLOC_NO_WATERMARKS;
2474 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2474 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2475 alloc_flags |= ALLOC_NO_WATERMARKS; 2475 alloc_flags |= ALLOC_NO_WATERMARKS;
2476 else if (!in_interrupt() && 2476 else if (!in_interrupt() &&
2477 ((current->flags & PF_MEMALLOC) || 2477 ((current->flags & PF_MEMALLOC) ||
2478 unlikely(test_thread_flag(TIF_MEMDIE)))) 2478 unlikely(test_thread_flag(TIF_MEMDIE))))
2479 alloc_flags |= ALLOC_NO_WATERMARKS; 2479 alloc_flags |= ALLOC_NO_WATERMARKS;
2480 } 2480 }
2481 #ifdef CONFIG_CMA 2481 #ifdef CONFIG_CMA
2482 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2482 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2483 alloc_flags |= ALLOC_CMA; 2483 alloc_flags |= ALLOC_CMA;
2484 #endif 2484 #endif
2485 return alloc_flags; 2485 return alloc_flags;
2486 } 2486 }
2487 2487
2488 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2488 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2489 { 2489 {
2490 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2490 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2491 } 2491 }
2492 2492
2493 static inline struct page * 2493 static inline struct page *
2494 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2494 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2495 struct zonelist *zonelist, enum zone_type high_zoneidx, 2495 struct zonelist *zonelist, enum zone_type high_zoneidx,
2496 nodemask_t *nodemask, struct zone *preferred_zone, 2496 nodemask_t *nodemask, struct zone *preferred_zone,
2497 int classzone_idx, int migratetype) 2497 int classzone_idx, int migratetype)
2498 { 2498 {
2499 const gfp_t wait = gfp_mask & __GFP_WAIT; 2499 const gfp_t wait = gfp_mask & __GFP_WAIT;
2500 struct page *page = NULL; 2500 struct page *page = NULL;
2501 int alloc_flags; 2501 int alloc_flags;
2502 unsigned long pages_reclaimed = 0; 2502 unsigned long pages_reclaimed = 0;
2503 unsigned long did_some_progress; 2503 unsigned long did_some_progress;
2504 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2504 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2505 bool deferred_compaction = false; 2505 bool deferred_compaction = false;
2506 bool contended_compaction = false; 2506 bool contended_compaction = false;
2507 2507
2508 /* 2508 /*
2509 * In the slowpath, we sanity check order to avoid ever trying to 2509 * In the slowpath, we sanity check order to avoid ever trying to
2510 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2510 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2511 * be using allocators in order of preference for an area that is 2511 * be using allocators in order of preference for an area that is
2512 * too large. 2512 * too large.
2513 */ 2513 */
2514 if (order >= MAX_ORDER) { 2514 if (order >= MAX_ORDER) {
2515 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2515 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2516 return NULL; 2516 return NULL;
2517 } 2517 }
2518 2518
2519 /* 2519 /*
2520 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2520 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2521 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2521 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2522 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2522 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2523 * using a larger set of nodes after it has established that the 2523 * using a larger set of nodes after it has established that the
2524 * allowed per node queues are empty and that nodes are 2524 * allowed per node queues are empty and that nodes are
2525 * over allocated. 2525 * over allocated.
2526 */ 2526 */
2527 if (IS_ENABLED(CONFIG_NUMA) && 2527 if (IS_ENABLED(CONFIG_NUMA) &&
2528 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2528 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2529 goto nopage; 2529 goto nopage;
2530 2530
2531 restart: 2531 restart:
2532 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2532 if (!(gfp_mask & __GFP_NO_KSWAPD))
2533 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2533 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
2534 2534
2535 /* 2535 /*
2536 * OK, we're below the kswapd watermark and have kicked background 2536 * OK, we're below the kswapd watermark and have kicked background
2537 * reclaim. Now things get more complex, so set up alloc_flags according 2537 * reclaim. Now things get more complex, so set up alloc_flags according
2538 * to how we want to proceed. 2538 * to how we want to proceed.
2539 */ 2539 */
2540 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2540 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2541 2541
2542 /* 2542 /*
2543 * Find the true preferred zone if the allocation is unconstrained by 2543 * Find the true preferred zone if the allocation is unconstrained by
2544 * cpusets. 2544 * cpusets.
2545 */ 2545 */
2546 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { 2546 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
2547 struct zoneref *preferred_zoneref; 2547 struct zoneref *preferred_zoneref;
2548 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2548 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2549 NULL, 2549 NULL,
2550 &preferred_zone); 2550 &preferred_zone);
2551 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2551 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2552 } 2552 }
2553 2553
2554 rebalance: 2554 rebalance:
2555 /* This is the last chance, in general, before the goto nopage. */ 2555 /* This is the last chance, in general, before the goto nopage. */
2556 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2556 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2557 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2557 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2558 preferred_zone, classzone_idx, migratetype); 2558 preferred_zone, classzone_idx, migratetype);
2559 if (page) 2559 if (page)
2560 goto got_pg; 2560 goto got_pg;
2561 2561
2562 /* Allocate without watermarks if the context allows */ 2562 /* Allocate without watermarks if the context allows */
2563 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2563 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2564 /* 2564 /*
2565 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2565 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2566 * the allocation is high priority and these types of 2566 * the allocation is high priority and these types of
2567 * allocations are system rather than user oriented 2567 * allocations are system rather than user oriented
2568 */ 2568 */
2569 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2569 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2570 2570
2571 page = __alloc_pages_high_priority(gfp_mask, order, 2571 page = __alloc_pages_high_priority(gfp_mask, order,
2572 zonelist, high_zoneidx, nodemask, 2572 zonelist, high_zoneidx, nodemask,
2573 preferred_zone, classzone_idx, migratetype); 2573 preferred_zone, classzone_idx, migratetype);
2574 if (page) { 2574 if (page) {
2575 goto got_pg; 2575 goto got_pg;
2576 } 2576 }
2577 } 2577 }
2578 2578
2579 /* Atomic allocations - we can't balance anything */ 2579 /* Atomic allocations - we can't balance anything */
2580 if (!wait) 2580 if (!wait)
2581 goto nopage; 2581 goto nopage;
2582 2582
2583 /* Avoid recursion of direct reclaim */ 2583 /* Avoid recursion of direct reclaim */
2584 if (current->flags & PF_MEMALLOC) 2584 if (current->flags & PF_MEMALLOC)
2585 goto nopage; 2585 goto nopage;
2586 2586
2587 /* Avoid allocations with no watermarks from looping endlessly */ 2587 /* Avoid allocations with no watermarks from looping endlessly */
2588 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2588 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2589 goto nopage; 2589 goto nopage;
2590 2590
2591 /* 2591 /*
2592 * Try direct compaction. The first pass is asynchronous. Subsequent 2592 * Try direct compaction. The first pass is asynchronous. Subsequent
2593 * attempts after direct reclaim are synchronous 2593 * attempts after direct reclaim are synchronous
2594 */ 2594 */
2595 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2595 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2596 high_zoneidx, nodemask, alloc_flags, 2596 high_zoneidx, nodemask, alloc_flags,
2597 preferred_zone, 2597 preferred_zone,
2598 classzone_idx, migratetype, 2598 classzone_idx, migratetype,
2599 migration_mode, &contended_compaction, 2599 migration_mode, &contended_compaction,
2600 &deferred_compaction, 2600 &deferred_compaction,
2601 &did_some_progress); 2601 &did_some_progress);
2602 if (page) 2602 if (page)
2603 goto got_pg; 2603 goto got_pg;
2604 migration_mode = MIGRATE_SYNC_LIGHT; 2604 migration_mode = MIGRATE_SYNC_LIGHT;
2605 2605
2606 /* 2606 /*
2607 * If compaction is deferred for high-order allocations, it is because 2607 * If compaction is deferred for high-order allocations, it is because
2608 * sync compaction recently failed. If this is the case and the caller 2608 * sync compaction recently failed. If this is the case and the caller
2609 * requested a movable allocation that does not heavily disrupt the 2609 * requested a movable allocation that does not heavily disrupt the
2610 * system then fail the allocation instead of entering direct reclaim. 2610 * system then fail the allocation instead of entering direct reclaim.
2611 */ 2611 */
2612 if ((deferred_compaction || contended_compaction) && 2612 if ((deferred_compaction || contended_compaction) &&
2613 (gfp_mask & __GFP_NO_KSWAPD)) 2613 (gfp_mask & __GFP_NO_KSWAPD))
2614 goto nopage; 2614 goto nopage;
2615 2615
2616 /* Try direct reclaim and then allocating */ 2616 /* Try direct reclaim and then allocating */
2617 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2617 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2618 zonelist, high_zoneidx, 2618 zonelist, high_zoneidx,
2619 nodemask, 2619 nodemask,
2620 alloc_flags, preferred_zone, 2620 alloc_flags, preferred_zone,
2621 classzone_idx, migratetype, 2621 classzone_idx, migratetype,
2622 &did_some_progress); 2622 &did_some_progress);
2623 if (page) 2623 if (page)
2624 goto got_pg; 2624 goto got_pg;
2625 2625
2626 /* 2626 /*
2627 * If we failed to make any progress reclaiming, then we are 2627 * If we failed to make any progress reclaiming, then we are
2628 * running out of options and have to consider going OOM 2628 * running out of options and have to consider going OOM
2629 */ 2629 */
2630 if (!did_some_progress) { 2630 if (!did_some_progress) {
2631 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2631 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2632 if (oom_killer_disabled) 2632 if (oom_killer_disabled)
2633 goto nopage; 2633 goto nopage;
2634 /* Coredumps can quickly deplete all memory reserves */ 2634 /* Coredumps can quickly deplete all memory reserves */
2635 if ((current->flags & PF_DUMPCORE) && 2635 if ((current->flags & PF_DUMPCORE) &&
2636 !(gfp_mask & __GFP_NOFAIL)) 2636 !(gfp_mask & __GFP_NOFAIL))
2637 goto nopage; 2637 goto nopage;
2638 page = __alloc_pages_may_oom(gfp_mask, order, 2638 page = __alloc_pages_may_oom(gfp_mask, order,
2639 zonelist, high_zoneidx, 2639 zonelist, high_zoneidx,
2640 nodemask, preferred_zone, 2640 nodemask, preferred_zone,
2641 classzone_idx, migratetype); 2641 classzone_idx, migratetype);
2642 if (page) 2642 if (page)
2643 goto got_pg; 2643 goto got_pg;
2644 2644
2645 if (!(gfp_mask & __GFP_NOFAIL)) { 2645 if (!(gfp_mask & __GFP_NOFAIL)) {
2646 /* 2646 /*
2647 * The oom killer is not called for high-order 2647 * The oom killer is not called for high-order
2648 * allocations that may fail, so if no progress 2648 * allocations that may fail, so if no progress
2649 * is being made, there are no other options and 2649 * is being made, there are no other options and
2650 * retrying is unlikely to help. 2650 * retrying is unlikely to help.
2651 */ 2651 */
2652 if (order > PAGE_ALLOC_COSTLY_ORDER) 2652 if (order > PAGE_ALLOC_COSTLY_ORDER)
2653 goto nopage; 2653 goto nopage;
2654 /* 2654 /*
2655 * The oom killer is not called for lowmem 2655 * The oom killer is not called for lowmem
2656 * allocations to prevent needlessly killing 2656 * allocations to prevent needlessly killing
2657 * innocent tasks. 2657 * innocent tasks.
2658 */ 2658 */
2659 if (high_zoneidx < ZONE_NORMAL) 2659 if (high_zoneidx < ZONE_NORMAL)
2660 goto nopage; 2660 goto nopage;
2661 } 2661 }
2662 2662
2663 goto restart; 2663 goto restart;
2664 } 2664 }
2665 } 2665 }
2666 2666
2667 /* Check if we should retry the allocation */ 2667 /* Check if we should retry the allocation */
2668 pages_reclaimed += did_some_progress; 2668 pages_reclaimed += did_some_progress;
2669 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2669 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2670 pages_reclaimed)) { 2670 pages_reclaimed)) {
2671 /* Wait for some write requests to complete then retry */ 2671 /* Wait for some write requests to complete then retry */
2672 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2672 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2673 goto rebalance; 2673 goto rebalance;
2674 } else { 2674 } else {
2675 /* 2675 /*
2676 * High-order allocations do not necessarily loop after 2676 * High-order allocations do not necessarily loop after
2677 * direct reclaim and reclaim/compaction depends on compaction 2677 * direct reclaim and reclaim/compaction depends on compaction
2678 * being called after reclaim so call directly if necessary 2678 * being called after reclaim so call directly if necessary
2679 */ 2679 */
2680 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2680 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2681 high_zoneidx, nodemask, alloc_flags, 2681 high_zoneidx, nodemask, alloc_flags,
2682 preferred_zone, 2682 preferred_zone,
2683 classzone_idx, migratetype, 2683 classzone_idx, migratetype,
2684 migration_mode, &contended_compaction, 2684 migration_mode, &contended_compaction,
2685 &deferred_compaction, 2685 &deferred_compaction,
2686 &did_some_progress); 2686 &did_some_progress);
2687 if (page) 2687 if (page)
2688 goto got_pg; 2688 goto got_pg;
2689 } 2689 }
2690 2690
2691 nopage: 2691 nopage:
2692 warn_alloc_failed(gfp_mask, order, NULL); 2692 warn_alloc_failed(gfp_mask, order, NULL);
2693 return page; 2693 return page;
2694 got_pg: 2694 got_pg:
2695 if (kmemcheck_enabled) 2695 if (kmemcheck_enabled)
2696 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2696 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2697 2697
2698 return page; 2698 return page;
2699 } 2699 }
2700 2700
2701 /* 2701 /*
2702 * This is the 'heart' of the zoned buddy allocator. 2702 * This is the 'heart' of the zoned buddy allocator.
2703 */ 2703 */
2704 struct page * 2704 struct page *
2705 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2705 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2706 struct zonelist *zonelist, nodemask_t *nodemask) 2706 struct zonelist *zonelist, nodemask_t *nodemask)
2707 { 2707 {
2708 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2708 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2709 struct zone *preferred_zone; 2709 struct zone *preferred_zone;
2710 struct zoneref *preferred_zoneref; 2710 struct zoneref *preferred_zoneref;
2711 struct page *page = NULL; 2711 struct page *page = NULL;
2712 int migratetype = allocflags_to_migratetype(gfp_mask); 2712 int migratetype = allocflags_to_migratetype(gfp_mask);
2713 unsigned int cpuset_mems_cookie; 2713 unsigned int cpuset_mems_cookie;
2714 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2714 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2715 struct mem_cgroup *memcg = NULL; 2715 struct mem_cgroup *memcg = NULL;
2716 int classzone_idx; 2716 int classzone_idx;
2717 2717
2718 gfp_mask &= gfp_allowed_mask; 2718 gfp_mask &= gfp_allowed_mask;
2719 2719
2720 lockdep_trace_alloc(gfp_mask); 2720 lockdep_trace_alloc(gfp_mask);
2721 2721
2722 might_sleep_if(gfp_mask & __GFP_WAIT); 2722 might_sleep_if(gfp_mask & __GFP_WAIT);
2723 2723
2724 if (should_fail_alloc_page(gfp_mask, order)) 2724 if (should_fail_alloc_page(gfp_mask, order))
2725 return NULL; 2725 return NULL;
2726 2726
2727 /* 2727 /*
2728 * Check the zones suitable for the gfp_mask contain at least one 2728 * Check the zones suitable for the gfp_mask contain at least one
2729 * valid zone. It's possible to have an empty zonelist as a result 2729 * valid zone. It's possible to have an empty zonelist as a result
2730 * of GFP_THISNODE and a memoryless node 2730 * of GFP_THISNODE and a memoryless node
2731 */ 2731 */
2732 if (unlikely(!zonelist->_zonerefs->zone)) 2732 if (unlikely(!zonelist->_zonerefs->zone))
2733 return NULL; 2733 return NULL;
2734 2734
2735 /* 2735 /*
2736 * Will only have any effect when __GFP_KMEMCG is set. This is 2736 * Will only have any effect when __GFP_KMEMCG is set. This is
2737 * verified in the (always inline) callee 2737 * verified in the (always inline) callee
2738 */ 2738 */
2739 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2739 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2740 return NULL; 2740 return NULL;
2741 2741
2742 retry_cpuset: 2742 retry_cpuset:
2743 cpuset_mems_cookie = read_mems_allowed_begin(); 2743 cpuset_mems_cookie = read_mems_allowed_begin();
2744 2744
2745 /* The preferred zone is used for statistics later */ 2745 /* The preferred zone is used for statistics later */
2746 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2746 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2747 nodemask ? : &cpuset_current_mems_allowed, 2747 nodemask ? : &cpuset_current_mems_allowed,
2748 &preferred_zone); 2748 &preferred_zone);
2749 if (!preferred_zone) 2749 if (!preferred_zone)
2750 goto out; 2750 goto out;
2751 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2751 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2752 2752
2753 #ifdef CONFIG_CMA 2753 #ifdef CONFIG_CMA
2754 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2754 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2755 alloc_flags |= ALLOC_CMA; 2755 alloc_flags |= ALLOC_CMA;
2756 #endif 2756 #endif
2757 retry: 2757 retry:
2758 /* First allocation attempt */ 2758 /* First allocation attempt */
2759 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2759 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2760 zonelist, high_zoneidx, alloc_flags, 2760 zonelist, high_zoneidx, alloc_flags,
2761 preferred_zone, classzone_idx, migratetype); 2761 preferred_zone, classzone_idx, migratetype);
2762 if (unlikely(!page)) { 2762 if (unlikely(!page)) {
2763 /* 2763 /*
2764 * The first pass makes sure allocations are spread 2764 * The first pass makes sure allocations are spread
2765 * fairly within the local node. However, the local 2765 * fairly within the local node. However, the local
2766 * node might have free pages left after the fairness 2766 * node might have free pages left after the fairness
2767 * batches are exhausted, and remote zones haven't 2767 * batches are exhausted, and remote zones haven't
2768 * even been considered yet. Try once more without 2768 * even been considered yet. Try once more without
2769 * fairness, and include remote zones now, before 2769 * fairness, and include remote zones now, before
2770 * entering the slowpath and waking kswapd: prefer 2770 * entering the slowpath and waking kswapd: prefer
2771 * spilling to a remote zone over swapping locally. 2771 * spilling to a remote zone over swapping locally.
2772 */ 2772 */
2773 if (alloc_flags & ALLOC_FAIR) { 2773 if (alloc_flags & ALLOC_FAIR) {
2774 reset_alloc_batches(zonelist, high_zoneidx, 2774 reset_alloc_batches(zonelist, high_zoneidx,
2775 preferred_zone); 2775 preferred_zone);
2776 alloc_flags &= ~ALLOC_FAIR; 2776 alloc_flags &= ~ALLOC_FAIR;
2777 goto retry; 2777 goto retry;
2778 } 2778 }
2779 /* 2779 /*
2780 * Runtime PM, block IO and its error handling path 2780 * Runtime PM, block IO and its error handling path
2781 * can deadlock because I/O on the device might not 2781 * can deadlock because I/O on the device might not
2782 * complete. 2782 * complete.
2783 */ 2783 */
2784 gfp_mask = memalloc_noio_flags(gfp_mask); 2784 gfp_mask = memalloc_noio_flags(gfp_mask);
2785 page = __alloc_pages_slowpath(gfp_mask, order, 2785 page = __alloc_pages_slowpath(gfp_mask, order,
2786 zonelist, high_zoneidx, nodemask, 2786 zonelist, high_zoneidx, nodemask,
2787 preferred_zone, classzone_idx, migratetype); 2787 preferred_zone, classzone_idx, migratetype);
2788 } 2788 }
2789 2789
2790 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2790 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2791 2791
2792 out: 2792 out:
2793 /* 2793 /*
2794 * When updating a task's mems_allowed, it is possible to race with 2794 * When updating a task's mems_allowed, it is possible to race with
2795 * parallel threads in such a way that an allocation can fail while 2795 * parallel threads in such a way that an allocation can fail while
2796 * the mask is being updated. If a page allocation is about to fail, 2796 * the mask is being updated. If a page allocation is about to fail,
2797 * check if the cpuset changed during allocation and if so, retry. 2797 * check if the cpuset changed during allocation and if so, retry.
2798 */ 2798 */
2799 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2799 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2800 goto retry_cpuset; 2800 goto retry_cpuset;
2801 2801
2802 memcg_kmem_commit_charge(page, memcg, order); 2802 memcg_kmem_commit_charge(page, memcg, order);
2803 2803
2804 return page; 2804 return page;
2805 } 2805 }
2806 EXPORT_SYMBOL(__alloc_pages_nodemask); 2806 EXPORT_SYMBOL(__alloc_pages_nodemask);
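Note: a minimal usage sketch (not part of this patch) of how a caller typically reaches __alloc_pages_nodemask() through the alloc_pages() wrapper; the helper name and order below are illustrative only.

#include <linux/gfp.h>
#include <linux/mm.h>

/* Illustrative helper: grab two physically contiguous lowmem pages. */
static void *grab_two_pages(void)
{
	/* order 1 == 2^1 pages; GFP_KERNEL may sleep and enter the slowpath above */
	struct page *page = alloc_pages(GFP_KERNEL, 1);

	if (!page)
		return NULL;

	/* the caller must later release the block with __free_pages(page, 1) */
	return page_address(page);	/* lowmem, so no kmap() is needed */
}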
2807 2807
2808 /* 2808 /*
2809 * Common helper functions. 2809 * Common helper functions.
2810 */ 2810 */
2811 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2811 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2812 { 2812 {
2813 struct page *page; 2813 struct page *page;
2814 2814
2815 /* 2815 /*
2816 * __get_free_pages() returns a 32-bit address, which cannot represent 2816 * __get_free_pages() returns a 32-bit address, which cannot represent
2817 * a highmem page 2817 * a highmem page
2818 */ 2818 */
2819 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2819 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2820 2820
2821 page = alloc_pages(gfp_mask, order); 2821 page = alloc_pages(gfp_mask, order);
2822 if (!page) 2822 if (!page)
2823 return 0; 2823 return 0;
2824 return (unsigned long) page_address(page); 2824 return (unsigned long) page_address(page);
2825 } 2825 }
2826 EXPORT_SYMBOL(__get_free_pages); 2826 EXPORT_SYMBOL(__get_free_pages);
2827 2827
2828 unsigned long get_zeroed_page(gfp_t gfp_mask) 2828 unsigned long get_zeroed_page(gfp_t gfp_mask)
2829 { 2829 {
2830 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2830 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2831 } 2831 }
2832 EXPORT_SYMBOL(get_zeroed_page); 2832 EXPORT_SYMBOL(get_zeroed_page);
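Note: a short usage sketch for the address-returning helpers above, assuming GFP_KERNEL allocations; the names and sizes are illustrative and error handling is intentionally minimal.

#include <linux/gfp.h>
#include <linux/errno.h>

static int scratch_buffers_demo(void)
{
	unsigned long buf  = __get_free_pages(GFP_KERNEL, 2);	/* 4 contiguous pages */
	unsigned long zero = get_zeroed_page(GFP_KERNEL);	/* 1 zero-filled page */

	if (!buf || !zero)
		goto fail;

	/* ... use the buffers ... */

	free_pages(buf, 2);	/* the order must match the allocation */
	free_page(zero);
	return 0;

fail:
	if (buf)
		free_pages(buf, 2);
	if (zero)
		free_page(zero);
	return -ENOMEM;
}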
2833 2833
2834 void __free_pages(struct page *page, unsigned int order) 2834 void __free_pages(struct page *page, unsigned int order)
2835 { 2835 {
2836 if (put_page_testzero(page)) { 2836 if (put_page_testzero(page)) {
2837 if (order == 0) 2837 if (order == 0)
2838 free_hot_cold_page(page, false); 2838 free_hot_cold_page(page, false);
2839 else 2839 else
2840 __free_pages_ok(page, order); 2840 __free_pages_ok(page, order);
2841 } 2841 }
2842 } 2842 }
2843 2843
2844 EXPORT_SYMBOL(__free_pages); 2844 EXPORT_SYMBOL(__free_pages);
2845 2845
2846 void free_pages(unsigned long addr, unsigned int order) 2846 void free_pages(unsigned long addr, unsigned int order)
2847 { 2847 {
2848 if (addr != 0) { 2848 if (addr != 0) {
2849 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2849 VM_BUG_ON(!virt_addr_valid((void *)addr));
2850 __free_pages(virt_to_page((void *)addr), order); 2850 __free_pages(virt_to_page((void *)addr), order);
2851 } 2851 }
2852 } 2852 }
2853 2853
2854 EXPORT_SYMBOL(free_pages); 2854 EXPORT_SYMBOL(free_pages);
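Note: __free_pages() only hands the page back to the buddy allocator once put_page_testzero() drops the final reference. A hypothetical sketch of that behaviour (real order-0 users would normally just call put_page()):

static void refcount_demo(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);

	if (!page)
		return;

	get_page(page);		/* refcount is now 2 */
	__free_pages(page, 0);	/* drops it to 1: nothing is freed yet */
	__free_pages(page, 0);	/* drops it to 0: the page is freed here */
}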
2855 2855
2856 /* 2856 /*
2857 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2857 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2858 * pages allocated with __GFP_KMEMCG. 2858 * pages allocated with __GFP_KMEMCG.
2859 * 2859 *
2860 * Those pages are accounted to a particular memcg, embedded in the 2860 * Those pages are accounted to a particular memcg, embedded in the
2861 * corresponding page_cgroup. To avoid adding a hit in the allocator to search 2861 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2862 * for that information only to find out that it is NULL for users who have no 2862 * for that information only to find out that it is NULL for users who have no
2863 * interest in that whatsoever, we provide these functions. 2863 * interest in that whatsoever, we provide these functions.
2864 * 2864 *
2865 * The caller knows better which flags it relies on. 2865 * The caller knows better which flags it relies on.
2866 */ 2866 */
2867 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2867 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2868 { 2868 {
2869 memcg_kmem_uncharge_pages(page, order); 2869 memcg_kmem_uncharge_pages(page, order);
2870 __free_pages(page, order); 2870 __free_pages(page, order);
2871 } 2871 }
2872 2872
2873 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2873 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2874 { 2874 {
2875 if (addr != 0) { 2875 if (addr != 0) {
2876 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2876 VM_BUG_ON(!virt_addr_valid((void *)addr));
2877 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2877 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2878 } 2878 }
2879 } 2879 }
2880 2880
2881 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2881 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2882 { 2882 {
2883 if (addr) { 2883 if (addr) {
2884 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2884 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2885 unsigned long used = addr + PAGE_ALIGN(size); 2885 unsigned long used = addr + PAGE_ALIGN(size);
2886 2886
2887 split_page(virt_to_page((void *)addr), order); 2887 split_page(virt_to_page((void *)addr), order);
2888 while (used < alloc_end) { 2888 while (used < alloc_end) {
2889 free_page(used); 2889 free_page(used);
2890 used += PAGE_SIZE; 2890 used += PAGE_SIZE;
2891 } 2891 }
2892 } 2892 }
2893 return (void *)addr; 2893 return (void *)addr;
2894 } 2894 }
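Note: a worked example for make_alloc_exact(), assuming PAGE_SIZE == 4096; the numbers are illustrative.

/*
 * size                 = 10000 bytes
 * order = get_order()  = 2            -> an order-2 block of 4 pages (16384 bytes)
 * PAGE_ALIGN(size)     = 12288 bytes  -> only 3 pages are actually needed
 *
 * split_page() turns the order-2 block into four independent order-0
 * pages, and the while loop frees the single page between 12288 and
 * 16384, so the caller keeps exactly 3 contiguous pages.
 */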
2895 2895
2896 /** 2896 /**
2897 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2897 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2898 * @size: the number of bytes to allocate 2898 * @size: the number of bytes to allocate
2899 * @gfp_mask: GFP flags for the allocation 2899 * @gfp_mask: GFP flags for the allocation
2900 * 2900 *
2901 * This function is similar to alloc_pages(), except that it allocates the 2901 * This function is similar to alloc_pages(), except that it allocates the
2902 * minimum number of pages to satisfy the request. alloc_pages() can only 2902 * minimum number of pages to satisfy the request. alloc_pages() can only
2903 * allocate memory in power-of-two pages. 2903 * allocate memory in power-of-two pages.
2904 * 2904 *
2905 * This function is also limited by MAX_ORDER. 2905 * This function is also limited by MAX_ORDER.
2906 * 2906 *
2907 * Memory allocated by this function must be released by free_pages_exact(). 2907 * Memory allocated by this function must be released by free_pages_exact().
2908 */ 2908 */
2909 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2909 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2910 { 2910 {
2911 unsigned int order = get_order(size); 2911 unsigned int order = get_order(size);
2912 unsigned long addr; 2912 unsigned long addr;
2913 2913
2914 addr = __get_free_pages(gfp_mask, order); 2914 addr = __get_free_pages(gfp_mask, order);
2915 return make_alloc_exact(addr, order, size); 2915 return make_alloc_exact(addr, order, size);
2916 } 2916 }
2917 EXPORT_SYMBOL(alloc_pages_exact); 2917 EXPORT_SYMBOL(alloc_pages_exact);
2918 2918
2919 /** 2919 /**
2920 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2920 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2921 * pages on a node. 2921 * pages on a node.
2922 * @nid: the preferred node ID where memory should be allocated 2922 * @nid: the preferred node ID where memory should be allocated
2923 * @size: the number of bytes to allocate 2923 * @size: the number of bytes to allocate
2924 * @gfp_mask: GFP flags for the allocation 2924 * @gfp_mask: GFP flags for the allocation
2925 * 2925 *
2926 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2926 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2927 * back. 2927 * back.
2928 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2928 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2929 * but is not exact. 2929 * but is not exact.
2930 */ 2930 */
2931 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2931 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2932 { 2932 {
2933 unsigned order = get_order(size); 2933 unsigned order = get_order(size);
2934 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2934 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2935 if (!p) 2935 if (!p)
2936 return NULL; 2936 return NULL;
2937 return make_alloc_exact((unsigned long)page_address(p), order, size); 2937 return make_alloc_exact((unsigned long)page_address(p), order, size);
2938 } 2938 }
2939 EXPORT_SYMBOL(alloc_pages_exact_nid); 2939 EXPORT_SYMBOL(alloc_pages_exact_nid);
2940 2940
2941 /** 2941 /**
2942 * free_pages_exact - release memory allocated via alloc_pages_exact() 2942 * free_pages_exact - release memory allocated via alloc_pages_exact()
2943 * @virt: the value returned by alloc_pages_exact. 2943 * @virt: the value returned by alloc_pages_exact.
2944 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2944 * @size: size of allocation, same value as passed to alloc_pages_exact().
2945 * 2945 *
2946 * Release the memory allocated by a previous call to alloc_pages_exact. 2946 * Release the memory allocated by a previous call to alloc_pages_exact.
2947 */ 2947 */
2948 void free_pages_exact(void *virt, size_t size) 2948 void free_pages_exact(void *virt, size_t size)
2949 { 2949 {
2950 unsigned long addr = (unsigned long)virt; 2950 unsigned long addr = (unsigned long)virt;
2951 unsigned long end = addr + PAGE_ALIGN(size); 2951 unsigned long end = addr + PAGE_ALIGN(size);
2952 2952
2953 while (addr < end) { 2953 while (addr < end) {
2954 free_page(addr); 2954 free_page(addr);
2955 addr += PAGE_SIZE; 2955 addr += PAGE_SIZE;
2956 } 2956 }
2957 } 2957 }
2958 EXPORT_SYMBOL(free_pages_exact); 2958 EXPORT_SYMBOL(free_pages_exact);
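Note: a sketch pairing alloc_pages_exact() with free_pages_exact(); the size, names and use of __GFP_ZERO are illustrative.

static void *table;

static int table_init(void)
{
	table = alloc_pages_exact(3 * PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
	return table ? 0 : -ENOMEM;
}

static void table_exit(void)
{
	free_pages_exact(table, 3 * PAGE_SIZE);	/* pass the same size as allocated */
	table = NULL;
}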
2959 2959
2960 /** 2960 /**
2961 * nr_free_zone_pages - count number of pages beyond high watermark 2961 * nr_free_zone_pages - count number of pages beyond high watermark
2962 * @offset: The zone index of the highest zone 2962 * @offset: The zone index of the highest zone
2963 * 2963 *
2964 * nr_free_zone_pages() counts the number of pages which are beyond the 2964 * nr_free_zone_pages() counts the number of pages which are beyond the
2965 * high watermark within all zones at or below a given zone index. For each 2965 * high watermark within all zones at or below a given zone index. For each
2966 * zone, the number of pages is calculated as: 2966 * zone, the number of pages is calculated as:
2967 * managed_pages - high_pages 2967 * managed_pages - high_pages
2968 */ 2968 */
2969 static unsigned long nr_free_zone_pages(int offset) 2969 static unsigned long nr_free_zone_pages(int offset)
2970 { 2970 {
2971 struct zoneref *z; 2971 struct zoneref *z;
2972 struct zone *zone; 2972 struct zone *zone;
2973 2973
2974 /* Just pick one node, since fallback list is circular */ 2974 /* Just pick one node, since fallback list is circular */
2975 unsigned long sum = 0; 2975 unsigned long sum = 0;
2976 2976
2977 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2977 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2978 2978
2979 for_each_zone_zonelist(zone, z, zonelist, offset) { 2979 for_each_zone_zonelist(zone, z, zonelist, offset) {
2980 unsigned long size = zone->managed_pages; 2980 unsigned long size = zone->managed_pages;
2981 unsigned long high = high_wmark_pages(zone); 2981 unsigned long high = high_wmark_pages(zone);
2982 if (size > high) 2982 if (size > high)
2983 sum += size - high; 2983 sum += size - high;
2984 } 2984 }
2985 2985
2986 return sum; 2986 return sum;
2987 } 2987 }
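Note: a hypothetical example of the calculation above: with ZONE_DMA32 at managed_pages = 4000 and a high watermark of 100, and ZONE_NORMAL at managed_pages = 200000 and a high watermark of 1200, nr_free_zone_pages() returns (4000 - 100) + (200000 - 1200) = 202700 pages; a zone whose managed_pages does not exceed its high watermark contributes nothing.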
2988 2988
2989 /** 2989 /**
2990 * nr_free_buffer_pages - count number of pages beyond high watermark 2990 * nr_free_buffer_pages - count number of pages beyond high watermark
2991 * 2991 *
2992 * nr_free_buffer_pages() counts the number of pages which are beyond the high 2992 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2993 * watermark within ZONE_DMA and ZONE_NORMAL. 2993 * watermark within ZONE_DMA and ZONE_NORMAL.
2994 */ 2994 */
2995 unsigned long nr_free_buffer_pages(void) 2995 unsigned long nr_free_buffer_pages(void)
2996 { 2996 {
2997 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2997 return nr_free_zone_pages(gfp_zone(GFP_USER));
2998 } 2998 }
2999 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2999 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
3000 3000
3001 /** 3001 /**
3002 * nr_free_pagecache_pages - count number of pages beyond high watermark 3002 * nr_free_pagecache_pages - count number of pages beyond high watermark
3003 * 3003 *
3004 * nr_free_pagecache_pages() counts the number of pages which are beyond the 3004 * nr_free_pagecache_pages() counts the number of pages which are beyond the
3005 * high watermark within all zones. 3005 * high watermark within all zones.
3006 */ 3006 */
3007 unsigned long nr_free_pagecache_pages(void) 3007 unsigned long nr_free_pagecache_pages(void)
3008 { 3008 {
3009 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 3009 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
3010 } 3010 }
3011 3011
3012 static inline void show_node(struct zone *zone) 3012 static inline void show_node(struct zone *zone)
3013 { 3013 {
3014 if (IS_ENABLED(CONFIG_NUMA)) 3014 if (IS_ENABLED(CONFIG_NUMA))
3015 printk("Node %d ", zone_to_nid(zone)); 3015 printk("Node %d ", zone_to_nid(zone));
3016 } 3016 }
3017 3017
3018 void si_meminfo(struct sysinfo *val) 3018 void si_meminfo(struct sysinfo *val)
3019 { 3019 {
3020 val->totalram = totalram_pages; 3020 val->totalram = totalram_pages;
3021 val->sharedram = 0; 3021 val->sharedram = 0;
3022 val->freeram = global_page_state(NR_FREE_PAGES); 3022 val->freeram = global_page_state(NR_FREE_PAGES);
3023 val->bufferram = nr_blockdev_pages(); 3023 val->bufferram = nr_blockdev_pages();
3024 val->totalhigh = totalhigh_pages; 3024 val->totalhigh = totalhigh_pages;
3025 val->freehigh = nr_free_highpages(); 3025 val->freehigh = nr_free_highpages();
3026 val->mem_unit = PAGE_SIZE; 3026 val->mem_unit = PAGE_SIZE;
3027 } 3027 }
3028 3028
3029 EXPORT_SYMBOL(si_meminfo); 3029 EXPORT_SYMBOL(si_meminfo);
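Note: a sketch of consuming si_meminfo() from elsewhere in the kernel; all counters are expressed in units of mem_unit (PAGE_SIZE here), so scale before printing. The function name is illustrative.

#include <linux/mm.h>
#include <linux/printk.h>

static void report_memory(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	pr_info("total %lu kB, free %lu kB\n",
		si.totalram * (si.mem_unit / 1024),
		si.freeram * (si.mem_unit / 1024));
}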
3030 3030
3031 #ifdef CONFIG_NUMA 3031 #ifdef CONFIG_NUMA
3032 void si_meminfo_node(struct sysinfo *val, int nid) 3032 void si_meminfo_node(struct sysinfo *val, int nid)
3033 { 3033 {
3034 int zone_type; /* needs to be signed */ 3034 int zone_type; /* needs to be signed */
3035 unsigned long managed_pages = 0; 3035 unsigned long managed_pages = 0;
3036 pg_data_t *pgdat = NODE_DATA(nid); 3036 pg_data_t *pgdat = NODE_DATA(nid);
3037 3037
3038 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3038 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3039 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3039 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3040 val->totalram = managed_pages; 3040 val->totalram = managed_pages;
3041 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3041 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3042 #ifdef CONFIG_HIGHMEM 3042 #ifdef CONFIG_HIGHMEM
3043 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3043 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
3044 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 3044 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
3045 NR_FREE_PAGES); 3045 NR_FREE_PAGES);
3046 #else 3046 #else
3047 val->totalhigh = 0; 3047 val->totalhigh = 0;
3048 val->freehigh = 0; 3048 val->freehigh = 0;
3049 #endif 3049 #endif
3050 val->mem_unit = PAGE_SIZE; 3050 val->mem_unit = PAGE_SIZE;
3051 } 3051 }
3052 #endif 3052 #endif
3053 3053
3054 /* 3054 /*
3055 * Determine whether the node should be displayed or not, depending on whether 3055 * Determine whether the node should be displayed or not, depending on whether
3056 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 3056 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
3057 */ 3057 */
3058 bool skip_free_areas_node(unsigned int flags, int nid) 3058 bool skip_free_areas_node(unsigned int flags, int nid)
3059 { 3059 {
3060 bool ret = false; 3060 bool ret = false;
3061 unsigned int cpuset_mems_cookie; 3061 unsigned int cpuset_mems_cookie;
3062 3062
3063 if (!(flags & SHOW_MEM_FILTER_NODES)) 3063 if (!(flags & SHOW_MEM_FILTER_NODES))
3064 goto out; 3064 goto out;
3065 3065
3066 do { 3066 do {
3067 cpuset_mems_cookie = read_mems_allowed_begin(); 3067 cpuset_mems_cookie = read_mems_allowed_begin();
3068 ret = !node_isset(nid, cpuset_current_mems_allowed); 3068 ret = !node_isset(nid, cpuset_current_mems_allowed);
3069 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 3069 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3070 out: 3070 out:
3071 return ret; 3071 return ret;
3072 } 3072 }
3073 3073
3074 #define K(x) ((x) << (PAGE_SHIFT-10)) 3074 #define K(x) ((x) << (PAGE_SHIFT-10))
3075 3075
3076 static void show_migration_types(unsigned char type) 3076 static void show_migration_types(unsigned char type)
3077 { 3077 {
3078 static const char types[MIGRATE_TYPES] = { 3078 static const char types[MIGRATE_TYPES] = {
3079 [MIGRATE_UNMOVABLE] = 'U', 3079 [MIGRATE_UNMOVABLE] = 'U',
3080 [MIGRATE_RECLAIMABLE] = 'E', 3080 [MIGRATE_RECLAIMABLE] = 'E',
3081 [MIGRATE_MOVABLE] = 'M', 3081 [MIGRATE_MOVABLE] = 'M',
3082 [MIGRATE_RESERVE] = 'R', 3082 [MIGRATE_RESERVE] = 'R',
3083 #ifdef CONFIG_CMA 3083 #ifdef CONFIG_CMA
3084 [MIGRATE_CMA] = 'C', 3084 [MIGRATE_CMA] = 'C',
3085 #endif 3085 #endif
3086 #ifdef CONFIG_MEMORY_ISOLATION 3086 #ifdef CONFIG_MEMORY_ISOLATION
3087 [MIGRATE_ISOLATE] = 'I', 3087 [MIGRATE_ISOLATE] = 'I',
3088 #endif 3088 #endif
3089 }; 3089 };
3090 char tmp[MIGRATE_TYPES + 1]; 3090 char tmp[MIGRATE_TYPES + 1];
3091 char *p = tmp; 3091 char *p = tmp;
3092 int i; 3092 int i;
3093 3093
3094 for (i = 0; i < MIGRATE_TYPES; i++) { 3094 for (i = 0; i < MIGRATE_TYPES; i++) {
3095 if (type & (1 << i)) 3095 if (type & (1 << i))
3096 *p++ = types[i]; 3096 *p++ = types[i];
3097 } 3097 }
3098 3098
3099 *p = '\0'; 3099 *p = '\0';
3100 printk("(%s) ", tmp); 3100 printk("(%s) ", tmp);
3101 } 3101 }
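Note: as an example of the output format, a type value of (1 << MIGRATE_UNMOVABLE) | (1 << MIGRATE_MOVABLE) makes the loop emit 'U' and then 'M', so the fragment printed is "(UM) "; the letters always appear in enum order, not in the order the bits were set.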
3102 3102
3103 /* 3103 /*
3104 * Show free area list (used inside shift_scroll-lock stuff) 3104 * Show free area list (used inside shift_scroll-lock stuff)
3105 * We also calculate the percentage fragmentation. We do this by counting the 3105 * We also calculate the percentage fragmentation. We do this by counting the
3106 * memory on each free list with the exception of the first item on the list. 3106 * memory on each free list with the exception of the first item on the list.
3107 * Suppresses nodes that are not allowed by current's cpuset if 3107 * Suppresses nodes that are not allowed by current's cpuset if
3108 * SHOW_MEM_FILTER_NODES is passed. 3108 * SHOW_MEM_FILTER_NODES is passed.
3109 */ 3109 */
3110 void show_free_areas(unsigned int filter) 3110 void show_free_areas(unsigned int filter)
3111 { 3111 {
3112 int cpu; 3112 int cpu;
3113 struct zone *zone; 3113 struct zone *zone;
3114 3114
3115 for_each_populated_zone(zone) { 3115 for_each_populated_zone(zone) {
3116 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3116 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3117 continue; 3117 continue;
3118 show_node(zone); 3118 show_node(zone);
3119 printk("%s per-cpu:\n", zone->name); 3119 printk("%s per-cpu:\n", zone->name);
3120 3120
3121 for_each_online_cpu(cpu) { 3121 for_each_online_cpu(cpu) {
3122 struct per_cpu_pageset *pageset; 3122 struct per_cpu_pageset *pageset;
3123 3123
3124 pageset = per_cpu_ptr(zone->pageset, cpu); 3124 pageset = per_cpu_ptr(zone->pageset, cpu);
3125 3125
3126 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 3126 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3127 cpu, pageset->pcp.high, 3127 cpu, pageset->pcp.high,
3128 pageset->pcp.batch, pageset->pcp.count); 3128 pageset->pcp.batch, pageset->pcp.count);
3129 } 3129 }
3130 } 3130 }
3131 3131
3132 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3132 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3133 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3133 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3134 " unevictable:%lu" 3134 " unevictable:%lu"
3135 " dirty:%lu writeback:%lu unstable:%lu\n" 3135 " dirty:%lu writeback:%lu unstable:%lu\n"
3136 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 3136 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3137 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3137 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3138 " free_cma:%lu\n", 3138 " free_cma:%lu\n",
3139 global_page_state(NR_ACTIVE_ANON), 3139 global_page_state(NR_ACTIVE_ANON),
3140 global_page_state(NR_INACTIVE_ANON), 3140 global_page_state(NR_INACTIVE_ANON),
3141 global_page_state(NR_ISOLATED_ANON), 3141 global_page_state(NR_ISOLATED_ANON),
3142 global_page_state(NR_ACTIVE_FILE), 3142 global_page_state(NR_ACTIVE_FILE),
3143 global_page_state(NR_INACTIVE_FILE), 3143 global_page_state(NR_INACTIVE_FILE),
3144 global_page_state(NR_ISOLATED_FILE), 3144 global_page_state(NR_ISOLATED_FILE),
3145 global_page_state(NR_UNEVICTABLE), 3145 global_page_state(NR_UNEVICTABLE),
3146 global_page_state(NR_FILE_DIRTY), 3146 global_page_state(NR_FILE_DIRTY),
3147 global_page_state(NR_WRITEBACK), 3147 global_page_state(NR_WRITEBACK),
3148 global_page_state(NR_UNSTABLE_NFS), 3148 global_page_state(NR_UNSTABLE_NFS),
3149 global_page_state(NR_FREE_PAGES), 3149 global_page_state(NR_FREE_PAGES),
3150 global_page_state(NR_SLAB_RECLAIMABLE), 3150 global_page_state(NR_SLAB_RECLAIMABLE),
3151 global_page_state(NR_SLAB_UNRECLAIMABLE), 3151 global_page_state(NR_SLAB_UNRECLAIMABLE),
3152 global_page_state(NR_FILE_MAPPED), 3152 global_page_state(NR_FILE_MAPPED),
3153 global_page_state(NR_SHMEM), 3153 global_page_state(NR_SHMEM),
3154 global_page_state(NR_PAGETABLE), 3154 global_page_state(NR_PAGETABLE),
3155 global_page_state(NR_BOUNCE), 3155 global_page_state(NR_BOUNCE),
3156 global_page_state(NR_FREE_CMA_PAGES)); 3156 global_page_state(NR_FREE_CMA_PAGES));
3157 3157
3158 for_each_populated_zone(zone) { 3158 for_each_populated_zone(zone) {
3159 int i; 3159 int i;
3160 3160
3161 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3161 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3162 continue; 3162 continue;
3163 show_node(zone); 3163 show_node(zone);
3164 printk("%s" 3164 printk("%s"
3165 " free:%lukB" 3165 " free:%lukB"
3166 " min:%lukB" 3166 " min:%lukB"
3167 " low:%lukB" 3167 " low:%lukB"
3168 " high:%lukB" 3168 " high:%lukB"
3169 " active_anon:%lukB" 3169 " active_anon:%lukB"
3170 " inactive_anon:%lukB" 3170 " inactive_anon:%lukB"
3171 " active_file:%lukB" 3171 " active_file:%lukB"
3172 " inactive_file:%lukB" 3172 " inactive_file:%lukB"
3173 " unevictable:%lukB" 3173 " unevictable:%lukB"
3174 " isolated(anon):%lukB" 3174 " isolated(anon):%lukB"
3175 " isolated(file):%lukB" 3175 " isolated(file):%lukB"
3176 " present:%lukB" 3176 " present:%lukB"
3177 " managed:%lukB" 3177 " managed:%lukB"
3178 " mlocked:%lukB" 3178 " mlocked:%lukB"
3179 " dirty:%lukB" 3179 " dirty:%lukB"
3180 " writeback:%lukB" 3180 " writeback:%lukB"
3181 " mapped:%lukB" 3181 " mapped:%lukB"
3182 " shmem:%lukB" 3182 " shmem:%lukB"
3183 " slab_reclaimable:%lukB" 3183 " slab_reclaimable:%lukB"
3184 " slab_unreclaimable:%lukB" 3184 " slab_unreclaimable:%lukB"
3185 " kernel_stack:%lukB" 3185 " kernel_stack:%lukB"
3186 " pagetables:%lukB" 3186 " pagetables:%lukB"
3187 " unstable:%lukB" 3187 " unstable:%lukB"
3188 " bounce:%lukB" 3188 " bounce:%lukB"
3189 " free_cma:%lukB" 3189 " free_cma:%lukB"
3190 " writeback_tmp:%lukB" 3190 " writeback_tmp:%lukB"
3191 " pages_scanned:%lu" 3191 " pages_scanned:%lu"
3192 " all_unreclaimable? %s" 3192 " all_unreclaimable? %s"
3193 "\n", 3193 "\n",
3194 zone->name, 3194 zone->name,
3195 K(zone_page_state(zone, NR_FREE_PAGES)), 3195 K(zone_page_state(zone, NR_FREE_PAGES)),
3196 K(min_wmark_pages(zone)), 3196 K(min_wmark_pages(zone)),
3197 K(low_wmark_pages(zone)), 3197 K(low_wmark_pages(zone)),
3198 K(high_wmark_pages(zone)), 3198 K(high_wmark_pages(zone)),
3199 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3199 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3200 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3200 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3201 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3201 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3202 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3202 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3203 K(zone_page_state(zone, NR_UNEVICTABLE)), 3203 K(zone_page_state(zone, NR_UNEVICTABLE)),
3204 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3204 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3205 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3205 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3206 K(zone->present_pages), 3206 K(zone->present_pages),
3207 K(zone->managed_pages), 3207 K(zone->managed_pages),
3208 K(zone_page_state(zone, NR_MLOCK)), 3208 K(zone_page_state(zone, NR_MLOCK)),
3209 K(zone_page_state(zone, NR_FILE_DIRTY)), 3209 K(zone_page_state(zone, NR_FILE_DIRTY)),
3210 K(zone_page_state(zone, NR_WRITEBACK)), 3210 K(zone_page_state(zone, NR_WRITEBACK)),
3211 K(zone_page_state(zone, NR_FILE_MAPPED)), 3211 K(zone_page_state(zone, NR_FILE_MAPPED)),
3212 K(zone_page_state(zone, NR_SHMEM)), 3212 K(zone_page_state(zone, NR_SHMEM)),
3213 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3213 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3214 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3214 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3215 zone_page_state(zone, NR_KERNEL_STACK) * 3215 zone_page_state(zone, NR_KERNEL_STACK) *
3216 THREAD_SIZE / 1024, 3216 THREAD_SIZE / 1024,
3217 K(zone_page_state(zone, NR_PAGETABLE)), 3217 K(zone_page_state(zone, NR_PAGETABLE)),
3218 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3218 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3219 K(zone_page_state(zone, NR_BOUNCE)), 3219 K(zone_page_state(zone, NR_BOUNCE)),
3220 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3220 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3221 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3221 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3222 zone->pages_scanned, 3222 zone->pages_scanned,
3223 (!zone_reclaimable(zone) ? "yes" : "no") 3223 (!zone_reclaimable(zone) ? "yes" : "no")
3224 ); 3224 );
3225 printk("lowmem_reserve[]:"); 3225 printk("lowmem_reserve[]:");
3226 for (i = 0; i < MAX_NR_ZONES; i++) 3226 for (i = 0; i < MAX_NR_ZONES; i++)
3227 printk(" %lu", zone->lowmem_reserve[i]); 3227 printk(" %lu", zone->lowmem_reserve[i]);
3228 printk("\n"); 3228 printk("\n");
3229 } 3229 }
3230 3230
3231 for_each_populated_zone(zone) { 3231 for_each_populated_zone(zone) {
3232 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3232 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3233 unsigned char types[MAX_ORDER]; 3233 unsigned char types[MAX_ORDER];
3234 3234
3235 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3235 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3236 continue; 3236 continue;
3237 show_node(zone); 3237 show_node(zone);
3238 printk("%s: ", zone->name); 3238 printk("%s: ", zone->name);
3239 3239
3240 spin_lock_irqsave(&zone->lock, flags); 3240 spin_lock_irqsave(&zone->lock, flags);
3241 for (order = 0; order < MAX_ORDER; order++) { 3241 for (order = 0; order < MAX_ORDER; order++) {
3242 struct free_area *area = &zone->free_area[order]; 3242 struct free_area *area = &zone->free_area[order];
3243 int type; 3243 int type;
3244 3244
3245 nr[order] = area->nr_free; 3245 nr[order] = area->nr_free;
3246 total += nr[order] << order; 3246 total += nr[order] << order;
3247 3247
3248 types[order] = 0; 3248 types[order] = 0;
3249 for (type = 0; type < MIGRATE_TYPES; type++) { 3249 for (type = 0; type < MIGRATE_TYPES; type++) {
3250 if (!list_empty(&area->free_list[type])) 3250 if (!list_empty(&area->free_list[type]))
3251 types[order] |= 1 << type; 3251 types[order] |= 1 << type;
3252 } 3252 }
3253 } 3253 }
3254 spin_unlock_irqrestore(&zone->lock, flags); 3254 spin_unlock_irqrestore(&zone->lock, flags);
3255 for (order = 0; order < MAX_ORDER; order++) { 3255 for (order = 0; order < MAX_ORDER; order++) {
3256 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3256 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3257 if (nr[order]) 3257 if (nr[order])
3258 show_migration_types(types[order]); 3258 show_migration_types(types[order]);
3259 } 3259 }
3260 printk("= %lukB\n", K(total)); 3260 printk("= %lukB\n", K(total));
3261 } 3261 }
3262 3262
3263 hugetlb_show_meminfo(); 3263 hugetlb_show_meminfo();
3264 3264
3265 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3265 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3266 3266
3267 show_swap_cache_info(); 3267 show_swap_cache_info();
3268 } 3268 }
3269 3269
3270 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3270 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3271 { 3271 {
3272 zoneref->zone = zone; 3272 zoneref->zone = zone;
3273 zoneref->zone_idx = zone_idx(zone); 3273 zoneref->zone_idx = zone_idx(zone);
3274 } 3274 }
3275 3275
3276 /* 3276 /*
3277 * Builds allocation fallback zone lists. 3277 * Builds allocation fallback zone lists.
3278 * 3278 *
3279 * Add all populated zones of a node to the zonelist. 3279 * Add all populated zones of a node to the zonelist.
3280 */ 3280 */
3281 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3281 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3282 int nr_zones) 3282 int nr_zones)
3283 { 3283 {
3284 struct zone *zone; 3284 struct zone *zone;
3285 enum zone_type zone_type = MAX_NR_ZONES; 3285 enum zone_type zone_type = MAX_NR_ZONES;
3286 3286
3287 do { 3287 do {
3288 zone_type--; 3288 zone_type--;
3289 zone = pgdat->node_zones + zone_type; 3289 zone = pgdat->node_zones + zone_type;
3290 if (populated_zone(zone)) { 3290 if (populated_zone(zone)) {
3291 zoneref_set_zone(zone, 3291 zoneref_set_zone(zone,
3292 &zonelist->_zonerefs[nr_zones++]); 3292 &zonelist->_zonerefs[nr_zones++]);
3293 check_highest_zone(zone_type); 3293 check_highest_zone(zone_type);
3294 } 3294 }
3295 } while (zone_type); 3295 } while (zone_type);
3296 3296
3297 return nr_zones; 3297 return nr_zones;
3298 } 3298 }
3299 3299
3300 3300
3301 /* 3301 /*
3302 * zonelist_order: 3302 * zonelist_order:
3303 * 0 = automatic detection of better ordering. 3303 * 0 = automatic detection of better ordering.
3304 * 1 = order by ([node] distance, -zonetype) 3304 * 1 = order by ([node] distance, -zonetype)
3305 * 2 = order by (-zonetype, [node] distance) 3305 * 2 = order by (-zonetype, [node] distance)
3306 * 3306 *
3307 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3307 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3308 * the same zonelist. So only NUMA can configure this param. 3308 * the same zonelist. So only NUMA can configure this param.
3309 */ 3309 */
3310 #define ZONELIST_ORDER_DEFAULT 0 3310 #define ZONELIST_ORDER_DEFAULT 0
3311 #define ZONELIST_ORDER_NODE 1 3311 #define ZONELIST_ORDER_NODE 1
3312 #define ZONELIST_ORDER_ZONE 2 3312 #define ZONELIST_ORDER_ZONE 2
3313 3313
3314 /* zonelist order in the kernel. 3314 /* zonelist order in the kernel.
3315 * set_zonelist_order() will set this to NODE or ZONE. 3315 * set_zonelist_order() will set this to NODE or ZONE.
3316 */ 3316 */
3317 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3317 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3318 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3318 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3319 3319
3320 3320
3321 #ifdef CONFIG_NUMA 3321 #ifdef CONFIG_NUMA
3322 /* The value user specified ....changed by config */ 3322 /* The value user specified ....changed by config */
3323 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3323 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3324 /* string for sysctl */ 3324 /* string for sysctl */
3325 #define NUMA_ZONELIST_ORDER_LEN 16 3325 #define NUMA_ZONELIST_ORDER_LEN 16
3326 char numa_zonelist_order[16] = "default"; 3326 char numa_zonelist_order[16] = "default";
3327 3327
3328 /* 3328 /*
3329 * interface to configure zonelist ordering. 3329 * interface to configure zonelist ordering.
3330 * command line option "numa_zonelist_order" 3330 * command line option "numa_zonelist_order"
3331 * = "[dD]efault - default, automatic configuration. 3331 * = "[dD]efault - default, automatic configuration.
3332 * = "[nN]ode - order by node locality, then by zone within node 3332 * = "[nN]ode - order by node locality, then by zone within node
3333 * = "[zZ]one - order by zone, then by locality within zone 3333 * = "[zZ]one - order by zone, then by locality within zone
3334 */ 3334 */
3335 3335
3336 static int __parse_numa_zonelist_order(char *s) 3336 static int __parse_numa_zonelist_order(char *s)
3337 { 3337 {
3338 if (*s == 'd' || *s == 'D') { 3338 if (*s == 'd' || *s == 'D') {
3339 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3339 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3340 } else if (*s == 'n' || *s == 'N') { 3340 } else if (*s == 'n' || *s == 'N') {
3341 user_zonelist_order = ZONELIST_ORDER_NODE; 3341 user_zonelist_order = ZONELIST_ORDER_NODE;
3342 } else if (*s == 'z' || *s == 'Z') { 3342 } else if (*s == 'z' || *s == 'Z') {
3343 user_zonelist_order = ZONELIST_ORDER_ZONE; 3343 user_zonelist_order = ZONELIST_ORDER_ZONE;
3344 } else { 3344 } else {
3345 printk(KERN_WARNING 3345 printk(KERN_WARNING
3346 "Ignoring invalid numa_zonelist_order value: " 3346 "Ignoring invalid numa_zonelist_order value: "
3347 "%s\n", s); 3347 "%s\n", s);
3348 return -EINVAL; 3348 return -EINVAL;
3349 } 3349 }
3350 return 0; 3350 return 0;
3351 } 3351 }
3352 3352
3353 static __init int setup_numa_zonelist_order(char *s) 3353 static __init int setup_numa_zonelist_order(char *s)
3354 { 3354 {
3355 int ret; 3355 int ret;
3356 3356
3357 if (!s) 3357 if (!s)
3358 return 0; 3358 return 0;
3359 3359
3360 ret = __parse_numa_zonelist_order(s); 3360 ret = __parse_numa_zonelist_order(s);
3361 if (ret == 0) 3361 if (ret == 0)
3362 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3362 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3363 3363
3364 return ret; 3364 return ret;
3365 } 3365 }
3366 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3366 early_param("numa_zonelist_order", setup_numa_zonelist_order);
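Note: the ordering can be selected either at boot or at run time. As documented for kernels of this vintage, passing numa_zonelist_order=zone (or =node, =default) on the kernel command line goes through the early_param() hook above, while writing one of those strings to /proc/sys/vm/numa_zonelist_order goes through the sysctl handler that follows; both paths end up in __parse_numa_zonelist_order().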
3367 3367
3368 /* 3368 /*
3369 * sysctl handler for numa_zonelist_order 3369 * sysctl handler for numa_zonelist_order
3370 */ 3370 */
3371 int numa_zonelist_order_handler(ctl_table *table, int write, 3371 int numa_zonelist_order_handler(ctl_table *table, int write,
3372 void __user *buffer, size_t *length, 3372 void __user *buffer, size_t *length,
3373 loff_t *ppos) 3373 loff_t *ppos)
3374 { 3374 {
3375 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3375 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3376 int ret; 3376 int ret;
3377 static DEFINE_MUTEX(zl_order_mutex); 3377 static DEFINE_MUTEX(zl_order_mutex);
3378 3378
3379 mutex_lock(&zl_order_mutex); 3379 mutex_lock(&zl_order_mutex);
3380 if (write) { 3380 if (write) {
3381 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 3381 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3382 ret = -EINVAL; 3382 ret = -EINVAL;
3383 goto out; 3383 goto out;
3384 } 3384 }
3385 strcpy(saved_string, (char *)table->data); 3385 strcpy(saved_string, (char *)table->data);
3386 } 3386 }
3387 ret = proc_dostring(table, write, buffer, length, ppos); 3387 ret = proc_dostring(table, write, buffer, length, ppos);
3388 if (ret) 3388 if (ret)
3389 goto out; 3389 goto out;
3390 if (write) { 3390 if (write) {
3391 int oldval = user_zonelist_order; 3391 int oldval = user_zonelist_order;
3392 3392
3393 ret = __parse_numa_zonelist_order((char *)table->data); 3393 ret = __parse_numa_zonelist_order((char *)table->data);
3394 if (ret) { 3394 if (ret) {
3395 /* 3395 /*
3396 * bogus value. restore saved string 3396 * bogus value. restore saved string
3397 */ 3397 */
3398 strncpy((char *)table->data, saved_string, 3398 strncpy((char *)table->data, saved_string,
3399 NUMA_ZONELIST_ORDER_LEN); 3399 NUMA_ZONELIST_ORDER_LEN);
3400 user_zonelist_order = oldval; 3400 user_zonelist_order = oldval;
3401 } else if (oldval != user_zonelist_order) { 3401 } else if (oldval != user_zonelist_order) {
3402 mutex_lock(&zonelists_mutex); 3402 mutex_lock(&zonelists_mutex);
3403 build_all_zonelists(NULL, NULL); 3403 build_all_zonelists(NULL, NULL);
3404 mutex_unlock(&zonelists_mutex); 3404 mutex_unlock(&zonelists_mutex);
3405 } 3405 }
3406 } 3406 }
3407 out: 3407 out:
3408 mutex_unlock(&zl_order_mutex); 3408 mutex_unlock(&zl_order_mutex);
3409 return ret; 3409 return ret;
3410 } 3410 }
3411 3411
3412 3412
3413 #define MAX_NODE_LOAD (nr_online_nodes) 3413 #define MAX_NODE_LOAD (nr_online_nodes)
3414 static int node_load[MAX_NUMNODES]; 3414 static int node_load[MAX_NUMNODES];
3415 3415
3416 /** 3416 /**
3417 * find_next_best_node - find the next node that should appear in a given node's fallback list 3417 * find_next_best_node - find the next node that should appear in a given node's fallback list
3418 * @node: node whose fallback list we're appending 3418 * @node: node whose fallback list we're appending
3419 * @used_node_mask: nodemask_t of already used nodes 3419 * @used_node_mask: nodemask_t of already used nodes
3420 * 3420 *
3421 * We use a number of factors to determine which is the next node that should 3421 * We use a number of factors to determine which is the next node that should
3422 * appear on a given node's fallback list. The node should not have appeared 3422 * appear on a given node's fallback list. The node should not have appeared
3423 * already in @node's fallback list, and it should be the next closest node 3423 * already in @node's fallback list, and it should be the next closest node
3424 * according to the distance array (which contains arbitrary distance values 3424 * according to the distance array (which contains arbitrary distance values
3425 * from each node to each node in the system), and should also prefer nodes 3425 * from each node to each node in the system), and should also prefer nodes
3426 * with no CPUs, since presumably they'll have very little allocation pressure 3426 * with no CPUs, since presumably they'll have very little allocation pressure
3427 * on them otherwise. 3427 * on them otherwise.
3428 * It returns -1 if no node is found. 3428 * It returns -1 if no node is found.
3429 */ 3429 */
3430 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3430 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3431 { 3431 {
3432 int n, val; 3432 int n, val;
3433 int min_val = INT_MAX; 3433 int min_val = INT_MAX;
3434 int best_node = NUMA_NO_NODE; 3434 int best_node = NUMA_NO_NODE;
3435 const struct cpumask *tmp = cpumask_of_node(0); 3435 const struct cpumask *tmp = cpumask_of_node(0);
3436 3436
3437 /* Use the local node if we haven't already */ 3437 /* Use the local node if we haven't already */
3438 if (!node_isset(node, *used_node_mask)) { 3438 if (!node_isset(node, *used_node_mask)) {
3439 node_set(node, *used_node_mask); 3439 node_set(node, *used_node_mask);
3440 return node; 3440 return node;
3441 } 3441 }
3442 3442
3443 for_each_node_state(n, N_MEMORY) { 3443 for_each_node_state(n, N_MEMORY) {
3444 3444
3445 /* Don't want a node to appear more than once */ 3445 /* Don't want a node to appear more than once */
3446 if (node_isset(n, *used_node_mask)) 3446 if (node_isset(n, *used_node_mask))
3447 continue; 3447 continue;
3448 3448
3449 /* Use the distance array to find the distance */ 3449 /* Use the distance array to find the distance */
3450 val = node_distance(node, n); 3450 val = node_distance(node, n);
3451 3451
3452 /* Penalize nodes under us ("prefer the next node") */ 3452 /* Penalize nodes under us ("prefer the next node") */
3453 val += (n < node); 3453 val += (n < node);
3454 3454
3455 /* Give preference to headless and unused nodes */ 3455 /* Give preference to headless and unused nodes */
3456 tmp = cpumask_of_node(n); 3456 tmp = cpumask_of_node(n);
3457 if (!cpumask_empty(tmp)) 3457 if (!cpumask_empty(tmp))
3458 val += PENALTY_FOR_NODE_WITH_CPUS; 3458 val += PENALTY_FOR_NODE_WITH_CPUS;
3459 3459
3460 /* Slight preference for less loaded node */ 3460 /* Slight preference for less loaded node */
3461 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3461 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3462 val += node_load[n]; 3462 val += node_load[n];
3463 3463
3464 if (val < min_val) { 3464 if (val < min_val) {
3465 min_val = val; 3465 min_val = val;
3466 best_node = n; 3466 best_node = n;
3467 } 3467 }
3468 } 3468 }
3469 3469
3470 if (best_node >= 0) 3470 if (best_node >= 0)
3471 node_set(best_node, *used_node_mask); 3471 node_set(best_node, *used_node_mask);
3472 3472
3473 return best_node; 3473 return best_node;
3474 } 3474 }
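Note: a hypothetical scoring walk-through for find_next_best_node(), building node 0's fallback list; the distances and penalty below are made up for illustration.

/*
 *  candidate n   node_distance(0, n)   +(n < node)   has CPUs?
 *       1                20                 +0        yes (+PENALTY)
 *       2                40                 +0        no
 *
 * Each subtotal is then multiplied by MAX_NODE_LOAD * MAX_NUMNODES and
 * node_load[n] is added as a tie-breaker, so unless the CPU penalty is
 * very large the closer node 1 scores lower and is appended first.
 */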
3475 3475
3476 3476
3477 /* 3477 /*
3478 * Build zonelists ordered by node and zones within node. 3478 * Build zonelists ordered by node and zones within node.
3479 * This results in maximum locality--normal zone overflows into local 3479 * This results in maximum locality--normal zone overflows into local
3480 * DMA zone, if any--but risks exhausting DMA zone. 3480 * DMA zone, if any--but risks exhausting DMA zone.
3481 */ 3481 */
3482 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3482 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3483 { 3483 {
3484 int j; 3484 int j;
3485 struct zonelist *zonelist; 3485 struct zonelist *zonelist;
3486 3486
3487 zonelist = &pgdat->node_zonelists[0]; 3487 zonelist = &pgdat->node_zonelists[0];
3488 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3488 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3489 ; 3489 ;
3490 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3490 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3491 zonelist->_zonerefs[j].zone = NULL; 3491 zonelist->_zonerefs[j].zone = NULL;
3492 zonelist->_zonerefs[j].zone_idx = 0; 3492 zonelist->_zonerefs[j].zone_idx = 0;
3493 } 3493 }
3494 3494
3495 /* 3495 /*
3496 * Build gfp_thisnode zonelists 3496 * Build gfp_thisnode zonelists
3497 */ 3497 */
3498 static void build_thisnode_zonelists(pg_data_t *pgdat) 3498 static void build_thisnode_zonelists(pg_data_t *pgdat)
3499 { 3499 {
3500 int j; 3500 int j;
3501 struct zonelist *zonelist; 3501 struct zonelist *zonelist;
3502 3502
3503 zonelist = &pgdat->node_zonelists[1]; 3503 zonelist = &pgdat->node_zonelists[1];
3504 j = build_zonelists_node(pgdat, zonelist, 0); 3504 j = build_zonelists_node(pgdat, zonelist, 0);
3505 zonelist->_zonerefs[j].zone = NULL; 3505 zonelist->_zonerefs[j].zone = NULL;
3506 zonelist->_zonerefs[j].zone_idx = 0; 3506 zonelist->_zonerefs[j].zone_idx = 0;
3507 } 3507 }
3508 3508
3509 /* 3509 /*
3510 * Build zonelists ordered by zone and nodes within zones. 3510 * Build zonelists ordered by zone and nodes within zones.
3511 * This results in conserving DMA zone[s] until all Normal memory is 3511 * This results in conserving DMA zone[s] until all Normal memory is
3512 * exhausted, but results in overflowing to remote node while memory 3512 * exhausted, but results in overflowing to remote node while memory
3513 * may still exist in local DMA zone. 3513 * may still exist in local DMA zone.
3514 */ 3514 */
3515 static int node_order[MAX_NUMNODES]; 3515 static int node_order[MAX_NUMNODES];
3516 3516
3517 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3517 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3518 { 3518 {
3519 int pos, j, node; 3519 int pos, j, node;
3520 int zone_type; /* needs to be signed */ 3520 int zone_type; /* needs to be signed */
3521 struct zone *z; 3521 struct zone *z;
3522 struct zonelist *zonelist; 3522 struct zonelist *zonelist;
3523 3523
3524 zonelist = &pgdat->node_zonelists[0]; 3524 zonelist = &pgdat->node_zonelists[0];
3525 pos = 0; 3525 pos = 0;
3526 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3526 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3527 for (j = 0; j < nr_nodes; j++) { 3527 for (j = 0; j < nr_nodes; j++) {
3528 node = node_order[j]; 3528 node = node_order[j];
3529 z = &NODE_DATA(node)->node_zones[zone_type]; 3529 z = &NODE_DATA(node)->node_zones[zone_type];
3530 if (populated_zone(z)) { 3530 if (populated_zone(z)) {
3531 zoneref_set_zone(z, 3531 zoneref_set_zone(z,
3532 &zonelist->_zonerefs[pos++]); 3532 &zonelist->_zonerefs[pos++]);
3533 check_highest_zone(zone_type); 3533 check_highest_zone(zone_type);
3534 } 3534 }
3535 } 3535 }
3536 } 3536 }
3537 zonelist->_zonerefs[pos].zone = NULL; 3537 zonelist->_zonerefs[pos].zone = NULL;
3538 zonelist->_zonerefs[pos].zone_idx = 0; 3538 zonelist->_zonerefs[pos].zone_idx = 0;
3539 } 3539 }
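Note: a hypothetical two-node machine, each node populated with Normal and DMA32 zones, makes the difference between the two orderings concrete.

/*
 * node order (build_zonelists_in_node_order):
 *   N0_Normal, N0_DMA32, N1_Normal, N1_DMA32
 * zone order (build_zonelists_in_zone_order):
 *   N0_Normal, N1_Normal, N0_DMA32, N1_DMA32
 *
 * Node order keeps allocations local even if that consumes the local
 * DMA32 zone; zone order preserves the low zones at the cost of
 * spilling off-node sooner.
 */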
3540 3540
3541 static int default_zonelist_order(void) 3541 static int default_zonelist_order(void)
3542 { 3542 {
3543 int nid, zone_type; 3543 int nid, zone_type;
3544 unsigned long low_kmem_size, total_size; 3544 unsigned long low_kmem_size, total_size;
3545 struct zone *z; 3545 struct zone *z;
3546 int average_size; 3546 int average_size;
3547 /* 3547 /*
3548 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. 3548 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
3549 * If they are really small and used heavily, the system can fall 3549 * If they are really small and used heavily, the system can fall
3550 * into OOM very easily. 3550 * into OOM very easily.
3551 * This function detects the ZONE_DMA/DMA32 size and configures the zone order. 3551 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
3552 */ 3552 */
3553 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 3553 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3554 low_kmem_size = 0; 3554 low_kmem_size = 0;
3555 total_size = 0; 3555 total_size = 0;
3556 for_each_online_node(nid) { 3556 for_each_online_node(nid) {
3557 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3557 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3558 z = &NODE_DATA(nid)->node_zones[zone_type]; 3558 z = &NODE_DATA(nid)->node_zones[zone_type];
3559 if (populated_zone(z)) { 3559 if (populated_zone(z)) {
3560 if (zone_type < ZONE_NORMAL) 3560 if (zone_type < ZONE_NORMAL)
3561 low_kmem_size += z->managed_pages; 3561 low_kmem_size += z->managed_pages;
3562 total_size += z->managed_pages; 3562 total_size += z->managed_pages;
3563 } else if (zone_type == ZONE_NORMAL) { 3563 } else if (zone_type == ZONE_NORMAL) {
3564 /* 3564 /*
3565 * If any node has only lowmem, then node order 3565 * If any node has only lowmem, then node order
3566 * is preferred to allow kernel allocations 3566 * is preferred to allow kernel allocations
3567 * locally; otherwise, they can easily infringe 3567 * locally; otherwise, they can easily infringe
3568 * on other nodes when there is an abundance of 3568 * on other nodes when there is an abundance of
3569 * lowmem available to allocate from. 3569 * lowmem available to allocate from.
3570 */ 3570 */
3571 return ZONELIST_ORDER_NODE; 3571 return ZONELIST_ORDER_NODE;
3572 } 3572 }
3573 } 3573 }
3574 } 3574 }
3575 if (!low_kmem_size || /* there are no DMA area. */ 3575 if (!low_kmem_size || /* there are no DMA area. */
3576 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3576 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3577 return ZONELIST_ORDER_NODE; 3577 return ZONELIST_ORDER_NODE;
3578 /* 3578 /*
3579 * look into each node's config. 3579 * look into each node's config.
3580 * If there is a node whose DMA/DMA32 memory is very big area on 3580 * If there is a node whose DMA/DMA32 memory is very big area on
3581 * local memory, NODE_ORDER may be suitable. 3581 * local memory, NODE_ORDER may be suitable.
3582 */ 3582 */
3583 average_size = total_size / 3583 average_size = total_size /
3584 (nodes_weight(node_states[N_MEMORY]) + 1); 3584 (nodes_weight(node_states[N_MEMORY]) + 1);
3585 for_each_online_node(nid) { 3585 for_each_online_node(nid) {
3586 low_kmem_size = 0; 3586 low_kmem_size = 0;
3587 total_size = 0; 3587 total_size = 0;
3588 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3588 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3589 z = &NODE_DATA(nid)->node_zones[zone_type]; 3589 z = &NODE_DATA(nid)->node_zones[zone_type];
3590 if (populated_zone(z)) { 3590 if (populated_zone(z)) {
3591 if (zone_type < ZONE_NORMAL) 3591 if (zone_type < ZONE_NORMAL)
3592 low_kmem_size += z->present_pages; 3592 low_kmem_size += z->present_pages;
3593 total_size += z->present_pages; 3593 total_size += z->present_pages;
3594 } 3594 }
3595 } 3595 }
3596 if (low_kmem_size && 3596 if (low_kmem_size &&
3597 total_size > average_size && /* ignore small node */ 3597 total_size > average_size && /* ignore small node */
3598 low_kmem_size > total_size * 70/100) 3598 low_kmem_size > total_size * 70/100)
3599 return ZONELIST_ORDER_NODE; 3599 return ZONELIST_ORDER_NODE;
3600 } 3600 }
3601 return ZONELIST_ORDER_ZONE; 3601 return ZONELIST_ORDER_ZONE;
3602 } 3602 }
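Note: a hypothetical example of the heuristic above: a node with 4,000,000 managed pages of which 3,200,000 live in ZONE_DMA/DMA32 has 80% of its memory in low zones, so if it is also larger than the per-node average the second loop returns ZONELIST_ORDER_NODE; if no node crosses the 70% mark (and the earlier global check did not already pick node order), the function falls through to ZONELIST_ORDER_ZONE.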
3603 3603
3604 static void set_zonelist_order(void) 3604 static void set_zonelist_order(void)
3605 { 3605 {
3606 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3606 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3607 current_zonelist_order = default_zonelist_order(); 3607 current_zonelist_order = default_zonelist_order();
3608 else 3608 else
3609 current_zonelist_order = user_zonelist_order; 3609 current_zonelist_order = user_zonelist_order;
3610 } 3610 }
3611 3611
3612 static void build_zonelists(pg_data_t *pgdat) 3612 static void build_zonelists(pg_data_t *pgdat)
3613 { 3613 {
3614 int j, node, load; 3614 int j, node, load;
3615 enum zone_type i; 3615 enum zone_type i;
3616 nodemask_t used_mask; 3616 nodemask_t used_mask;
3617 int local_node, prev_node; 3617 int local_node, prev_node;
3618 struct zonelist *zonelist; 3618 struct zonelist *zonelist;
3619 int order = current_zonelist_order; 3619 int order = current_zonelist_order;
3620 3620
3621 /* initialize zonelists */ 3621 /* initialize zonelists */
3622 for (i = 0; i < MAX_ZONELISTS; i++) { 3622 for (i = 0; i < MAX_ZONELISTS; i++) {
3623 zonelist = pgdat->node_zonelists + i; 3623 zonelist = pgdat->node_zonelists + i;
3624 zonelist->_zonerefs[0].zone = NULL; 3624 zonelist->_zonerefs[0].zone = NULL;
3625 zonelist->_zonerefs[0].zone_idx = 0; 3625 zonelist->_zonerefs[0].zone_idx = 0;
3626 } 3626 }
3627 3627
3628 /* NUMA-aware ordering of nodes */ 3628 /* NUMA-aware ordering of nodes */
3629 local_node = pgdat->node_id; 3629 local_node = pgdat->node_id;
3630 load = nr_online_nodes; 3630 load = nr_online_nodes;
3631 prev_node = local_node; 3631 prev_node = local_node;
3632 nodes_clear(used_mask); 3632 nodes_clear(used_mask);
3633 3633
3634 memset(node_order, 0, sizeof(node_order)); 3634 memset(node_order, 0, sizeof(node_order));
3635 j = 0; 3635 j = 0;
3636 3636
3637 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3637 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3638 /* 3638 /*
3639 * We don't want to pressure a particular node. 3639 * We don't want to pressure a particular node.
3640 * So we add a penalty to the first node in the same 3640 * So we add a penalty to the first node in the same
3641 * distance group to make it round-robin. 3641 * distance group to make it round-robin.
3642 */ 3642 */
3643 if (node_distance(local_node, node) != 3643 if (node_distance(local_node, node) !=
3644 node_distance(local_node, prev_node)) 3644 node_distance(local_node, prev_node))
3645 node_load[node] = load; 3645 node_load[node] = load;
3646 3646
3647 prev_node = node; 3647 prev_node = node;
3648 load--; 3648 load--;
3649 if (order == ZONELIST_ORDER_NODE) 3649 if (order == ZONELIST_ORDER_NODE)
3650 build_zonelists_in_node_order(pgdat, node); 3650 build_zonelists_in_node_order(pgdat, node);
3651 else 3651 else
3652 node_order[j++] = node; /* remember order */ 3652 node_order[j++] = node; /* remember order */
3653 } 3653 }
3654 3654
3655 if (order == ZONELIST_ORDER_ZONE) { 3655 if (order == ZONELIST_ORDER_ZONE) {
3656 /* calculate node order -- i.e., DMA last! */ 3656 /* calculate node order -- i.e., DMA last! */
3657 build_zonelists_in_zone_order(pgdat, j); 3657 build_zonelists_in_zone_order(pgdat, j);
3658 } 3658 }
3659 3659
3660 build_thisnode_zonelists(pgdat); 3660 build_thisnode_zonelists(pgdat);
3661 } 3661 }
3662 3662
3663 /* Construct the zonelist performance cache - see further mmzone.h */ 3663 /* Construct the zonelist performance cache - see further mmzone.h */
3664 static void build_zonelist_cache(pg_data_t *pgdat) 3664 static void build_zonelist_cache(pg_data_t *pgdat)
3665 { 3665 {
3666 struct zonelist *zonelist; 3666 struct zonelist *zonelist;
3667 struct zonelist_cache *zlc; 3667 struct zonelist_cache *zlc;
3668 struct zoneref *z; 3668 struct zoneref *z;
3669 3669
3670 zonelist = &pgdat->node_zonelists[0]; 3670 zonelist = &pgdat->node_zonelists[0];
3671 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3671 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3672 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3672 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3673 for (z = zonelist->_zonerefs; z->zone; z++) 3673 for (z = zonelist->_zonerefs; z->zone; z++)
3674 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3674 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3675 } 3675 }
3676 3676
3677 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3677 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3678 /* 3678 /*
3679 * Return node id of node used for "local" allocations. 3679 * Return node id of node used for "local" allocations.
3680 * I.e., first node id of first zone in arg node's generic zonelist. 3680 * I.e., first node id of first zone in arg node's generic zonelist.
3681 * Used for initializing percpu 'numa_mem', which is used primarily 3681 * Used for initializing percpu 'numa_mem', which is used primarily
3682 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3682 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3683 */ 3683 */
3684 int local_memory_node(int node) 3684 int local_memory_node(int node)
3685 { 3685 {
3686 struct zone *zone; 3686 struct zone *zone;
3687 3687
3688 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3688 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3689 gfp_zone(GFP_KERNEL), 3689 gfp_zone(GFP_KERNEL),
3690 NULL, 3690 NULL,
3691 &zone); 3691 &zone);
3692 return zone->node; 3692 return zone->node;
3693 } 3693 }
3694 #endif 3694 #endif
3695 3695
3696 #else /* CONFIG_NUMA */ 3696 #else /* CONFIG_NUMA */
3697 3697
3698 static void set_zonelist_order(void) 3698 static void set_zonelist_order(void)
3699 { 3699 {
3700 current_zonelist_order = ZONELIST_ORDER_ZONE; 3700 current_zonelist_order = ZONELIST_ORDER_ZONE;
3701 } 3701 }
3702 3702
3703 static void build_zonelists(pg_data_t *pgdat) 3703 static void build_zonelists(pg_data_t *pgdat)
3704 { 3704 {
3705 int node, local_node; 3705 int node, local_node;
3706 enum zone_type j; 3706 enum zone_type j;
3707 struct zonelist *zonelist; 3707 struct zonelist *zonelist;
3708 3708
3709 local_node = pgdat->node_id; 3709 local_node = pgdat->node_id;
3710 3710
3711 zonelist = &pgdat->node_zonelists[0]; 3711 zonelist = &pgdat->node_zonelists[0];
3712 j = build_zonelists_node(pgdat, zonelist, 0); 3712 j = build_zonelists_node(pgdat, zonelist, 0);
3713 3713
3714 /* 3714 /*
3715 * Now we build the zonelist so that it contains the zones 3715 * Now we build the zonelist so that it contains the zones
3716 * of all the other nodes. 3716 * of all the other nodes.
3717 * We don't want to pressure a particular node, so when 3717 * We don't want to pressure a particular node, so when
3718 * building the zones for node N, we make sure that the 3718 * building the zones for node N, we make sure that the
3719 * zones coming right after the local ones are those from 3719 * zones coming right after the local ones are those from
3720 * node N+1 (modulo the number of nodes) 3720 * node N+1 (modulo the number of nodes)
3721 */ 3721 */
3722 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3722 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3723 if (!node_online(node)) 3723 if (!node_online(node))
3724 continue; 3724 continue;
3725 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3725 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3726 } 3726 }
3727 for (node = 0; node < local_node; node++) { 3727 for (node = 0; node < local_node; node++) {
3728 if (!node_online(node)) 3728 if (!node_online(node))
3729 continue; 3729 continue;
3730 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3730 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3731 } 3731 }
3732 3732
3733 zonelist->_zonerefs[j].zone = NULL; 3733 zonelist->_zonerefs[j].zone = NULL;
3734 zonelist->_zonerefs[j].zone_idx = 0; 3734 zonelist->_zonerefs[j].zone_idx = 0;
3735 } 3735 }
3736 3736
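The two loops above simply start numbering at the node after the local one and wrap around. A minimal userspace sketch of the resulting visiting order, with an invented four-node machine where every node is online:

#include <stdio.h>

#define MAX_NUMNODES 4

/* Pretend all four nodes are online. */
static int node_online(int node) { return node >= 0 && node < MAX_NUMNODES; }

/* Print nodes in the order the non-NUMA build_zonelists() visits them. */
static void print_node_order(int local_node)
{
	int node;

	printf("local node %d:", local_node);
	printf(" %d", local_node);			/* own zones first */
	for (node = local_node + 1; node < MAX_NUMNODES; node++)
		if (node_online(node))
			printf(" %d", node);
	for (node = 0; node < local_node; node++)
		if (node_online(node))
			printf(" %d", node);
	printf("\n");
}

int main(void)
{
	print_node_order(2);	/* prints: local node 2: 2 3 0 1 */
	return 0;
}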
3737 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3737 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3738 static void build_zonelist_cache(pg_data_t *pgdat) 3738 static void build_zonelist_cache(pg_data_t *pgdat)
3739 { 3739 {
3740 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3740 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3741 } 3741 }
3742 3742
3743 #endif /* CONFIG_NUMA */ 3743 #endif /* CONFIG_NUMA */
3744 3744
3745 /* 3745 /*
3746 * Boot pageset table. One per cpu which is going to be used for all 3746 * Boot pageset table. One per cpu which is going to be used for all
3747 * zones and all nodes. The parameters will be set in such a way 3747 * zones and all nodes. The parameters will be set in such a way
3748 * that an item put on a list will immediately be handed over to 3748 * that an item put on a list will immediately be handed over to
3749 * the buddy list. This is safe since pageset manipulation is done 3749 * the buddy list. This is safe since pageset manipulation is done
3750 * with interrupts disabled. 3750 * with interrupts disabled.
3751 * 3751 *
3752 * The boot_pagesets must be kept even after bootup is complete for 3752 * The boot_pagesets must be kept even after bootup is complete for
3753 * unused processors and/or zones. They do play a role for bootstrapping 3753 * unused processors and/or zones. They do play a role for bootstrapping
3754 * hotplugged processors. 3754 * hotplugged processors.
3755 * 3755 *
3756 * zoneinfo_show() and maybe other functions do 3756 * zoneinfo_show() and maybe other functions do
3757 * not check if the processor is online before following the pageset pointer. 3757 * not check if the processor is online before following the pageset pointer.
3758 * Other parts of the kernel may not check if the zone is available. 3758 * Other parts of the kernel may not check if the zone is available.
3759 */ 3759 */
3760 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3760 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3761 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3761 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3762 static void setup_zone_pageset(struct zone *zone); 3762 static void setup_zone_pageset(struct zone *zone);
3763 3763
3764 /* 3764 /*
3765 * Global mutex to protect against size modification of zonelists 3765 * Global mutex to protect against size modification of zonelists
3766 * as well as to serialize pageset setup for the new populated zone. 3766 * as well as to serialize pageset setup for the new populated zone.
3767 */ 3767 */
3768 DEFINE_MUTEX(zonelists_mutex); 3768 DEFINE_MUTEX(zonelists_mutex);
3769 3769
3770 /* return value is int just for stop_machine() */ 3770 /* return value is int just for stop_machine() */
3771 static int __build_all_zonelists(void *data) 3771 static int __build_all_zonelists(void *data)
3772 { 3772 {
3773 int nid; 3773 int nid;
3774 int cpu; 3774 int cpu;
3775 pg_data_t *self = data; 3775 pg_data_t *self = data;
3776 3776
3777 #ifdef CONFIG_NUMA 3777 #ifdef CONFIG_NUMA
3778 memset(node_load, 0, sizeof(node_load)); 3778 memset(node_load, 0, sizeof(node_load));
3779 #endif 3779 #endif
3780 3780
3781 if (self && !node_online(self->node_id)) { 3781 if (self && !node_online(self->node_id)) {
3782 build_zonelists(self); 3782 build_zonelists(self);
3783 build_zonelist_cache(self); 3783 build_zonelist_cache(self);
3784 } 3784 }
3785 3785
3786 for_each_online_node(nid) { 3786 for_each_online_node(nid) {
3787 pg_data_t *pgdat = NODE_DATA(nid); 3787 pg_data_t *pgdat = NODE_DATA(nid);
3788 3788
3789 build_zonelists(pgdat); 3789 build_zonelists(pgdat);
3790 build_zonelist_cache(pgdat); 3790 build_zonelist_cache(pgdat);
3791 } 3791 }
3792 3792
3793 /* 3793 /*
3794 * Initialize the boot_pagesets that are going to be used 3794 * Initialize the boot_pagesets that are going to be used
3795 * for bootstrapping processors. The real pagesets for 3795 * for bootstrapping processors. The real pagesets for
3796 * each zone will be allocated later when the per cpu 3796 * each zone will be allocated later when the per cpu
3797 * allocator is available. 3797 * allocator is available.
3798 * 3798 *
3799 * boot_pagesets are used also for bootstrapping offline 3799 * boot_pagesets are used also for bootstrapping offline
3800 * cpus if the system is already booted because the pagesets 3800 * cpus if the system is already booted because the pagesets
3801 * are needed to initialize allocators on a specific cpu too. 3801 * are needed to initialize allocators on a specific cpu too.
3802 * F.e. the percpu allocator needs the page allocator which 3802 * F.e. the percpu allocator needs the page allocator which
3803 * needs the percpu allocator in order to allocate its pagesets 3803 * needs the percpu allocator in order to allocate its pagesets
3804 * (a chicken-egg dilemma). 3804 * (a chicken-egg dilemma).
3805 */ 3805 */
3806 for_each_possible_cpu(cpu) { 3806 for_each_possible_cpu(cpu) {
3807 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3807 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3808 3808
3809 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3809 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3810 /* 3810 /*
3811 * We now know the "local memory node" for each node-- 3811 * We now know the "local memory node" for each node--
3812 * i.e., the node of the first zone in the generic zonelist. 3812 * i.e., the node of the first zone in the generic zonelist.
3813 * Set up numa_mem percpu variable for on-line cpus. During 3813 * Set up numa_mem percpu variable for on-line cpus. During
3814 * boot, only the boot cpu should be on-line; we'll init the 3814 * boot, only the boot cpu should be on-line; we'll init the
3815 * secondary cpus' numa_mem as they come on-line. During 3815 * secondary cpus' numa_mem as they come on-line. During
3816 * node/memory hotplug, we'll fixup all on-line cpus. 3816 * node/memory hotplug, we'll fixup all on-line cpus.
3817 */ 3817 */
3818 if (cpu_online(cpu)) 3818 if (cpu_online(cpu))
3819 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3819 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3820 #endif 3820 #endif
3821 } 3821 }
3822 3822
3823 return 0; 3823 return 0;
3824 } 3824 }
3825 3825
3826 /* 3826 /*
3827 * Called with zonelists_mutex held always 3827 * Called with zonelists_mutex held always
3828 * unless system_state == SYSTEM_BOOTING. 3828 * unless system_state == SYSTEM_BOOTING.
3829 */ 3829 */
3830 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3830 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3831 { 3831 {
3832 set_zonelist_order(); 3832 set_zonelist_order();
3833 3833
3834 if (system_state == SYSTEM_BOOTING) { 3834 if (system_state == SYSTEM_BOOTING) {
3835 __build_all_zonelists(NULL); 3835 __build_all_zonelists(NULL);
3836 mminit_verify_zonelist(); 3836 mminit_verify_zonelist();
3837 cpuset_init_current_mems_allowed(); 3837 cpuset_init_current_mems_allowed();
3838 } else { 3838 } else {
3839 #ifdef CONFIG_MEMORY_HOTPLUG 3839 #ifdef CONFIG_MEMORY_HOTPLUG
3840 if (zone) 3840 if (zone)
3841 setup_zone_pageset(zone); 3841 setup_zone_pageset(zone);
3842 #endif 3842 #endif
3843 /* we have to stop all cpus to guarantee there is no user 3843 /* we have to stop all cpus to guarantee there is no user
3844 of zonelist */ 3844 of zonelist */
3845 stop_machine(__build_all_zonelists, pgdat, NULL); 3845 stop_machine(__build_all_zonelists, pgdat, NULL);
3846 /* cpuset refresh routine should be here */ 3846 /* cpuset refresh routine should be here */
3847 } 3847 }
3848 vm_total_pages = nr_free_pagecache_pages(); 3848 vm_total_pages = nr_free_pagecache_pages();
3849 /* 3849 /*
3850 * Disable grouping by mobility if the number of pages in the 3850 * Disable grouping by mobility if the number of pages in the
3851 * system is too low to allow the mechanism to work. It would be 3851 * system is too low to allow the mechanism to work. It would be
3852 * more accurate, but expensive to check per-zone. This check is 3852 * more accurate, but expensive to check per-zone. This check is
3853 * made on memory hot-add so a system can start with mobility 3853 * made on memory hot-add so a system can start with mobility
3854 * disabled and enable it later. 3854 * disabled and enable it later.
3855 */ 3855 */
3856 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3856 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3857 page_group_by_mobility_disabled = 1; 3857 page_group_by_mobility_disabled = 1;
3858 else 3858 else
3859 page_group_by_mobility_disabled = 0; 3859 page_group_by_mobility_disabled = 0;
3860 3860
3861 printk("Built %i zonelists in %s order, mobility grouping %s. " 3861 printk("Built %i zonelists in %s order, mobility grouping %s. "
3862 "Total pages: %ld\n", 3862 "Total pages: %ld\n",
3863 nr_online_nodes, 3863 nr_online_nodes,
3864 zonelist_order_name[current_zonelist_order], 3864 zonelist_order_name[current_zonelist_order],
3865 page_group_by_mobility_disabled ? "off" : "on", 3865 page_group_by_mobility_disabled ? "off" : "on",
3866 vm_total_pages); 3866 vm_total_pages);
3867 #ifdef CONFIG_NUMA 3867 #ifdef CONFIG_NUMA
3868 printk("Policy zone: %s\n", zone_names[policy_zone]); 3868 printk("Policy zone: %s\n", zone_names[policy_zone]);
3869 #endif 3869 #endif
3870 } 3870 }
3871 3871
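For scale, the pageblock_nr_pages * MIGRATE_TYPES threshold above works out to only a few megabytes on common configurations. A quick sketch of the arithmetic, assuming 2MB pageblocks of 4K pages and 6 migrate types (typical x86-64 values, not taken from this commit):

#include <stdio.h>

int main(void)
{
	/* Assumed values: 2MB pageblocks of 4K pages, 6 migratetypes. */
	unsigned long pageblock_nr_pages = 512;
	unsigned long migrate_types = 6;
	unsigned long threshold_pages = pageblock_nr_pages * migrate_types;

	/* 3072 pages == 12MB: only very small systems disable grouping. */
	printf("threshold: %lu pages (%lu MB)\n",
	       threshold_pages, threshold_pages * 4 / 1024);
	return 0;
}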
3872 /* 3872 /*
3873 * Helper functions to size the waitqueue hash table. 3873 * Helper functions to size the waitqueue hash table.
3874 * Essentially these want to choose hash table sizes sufficiently 3874 * Essentially these want to choose hash table sizes sufficiently
3875 * large so that collisions trying to wait on pages are rare. 3875 * large so that collisions trying to wait on pages are rare.
3876 * But in fact, the number of active page waitqueues on typical 3876 * But in fact, the number of active page waitqueues on typical
3877 * systems is ridiculously low, less than 200. So this is quite 3877 * systems is ridiculously low, less than 200. So this is quite
3878 * conservative, even though it seems large. 3878 * conservative, even though it seems large.
3879 * 3879 *
3880 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3880 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3881 * waitqueues, i.e. the size of the waitq table given the number of pages. 3881 * waitqueues, i.e. the size of the waitq table given the number of pages.
3882 */ 3882 */
3883 #define PAGES_PER_WAITQUEUE 256 3883 #define PAGES_PER_WAITQUEUE 256
3884 3884
3885 #ifndef CONFIG_MEMORY_HOTPLUG 3885 #ifndef CONFIG_MEMORY_HOTPLUG
3886 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3886 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3887 { 3887 {
3888 unsigned long size = 1; 3888 unsigned long size = 1;
3889 3889
3890 pages /= PAGES_PER_WAITQUEUE; 3890 pages /= PAGES_PER_WAITQUEUE;
3891 3891
3892 while (size < pages) 3892 while (size < pages)
3893 size <<= 1; 3893 size <<= 1;
3894 3894
3895 /* 3895 /*
3896 * Once we have dozens or even hundreds of threads sleeping 3896 * Once we have dozens or even hundreds of threads sleeping
3897 * on IO we've got bigger problems than wait queue collision. 3897 * on IO we've got bigger problems than wait queue collision.
3898 * Limit the size of the wait table to a reasonable size. 3898 * Limit the size of the wait table to a reasonable size.
3899 */ 3899 */
3900 size = min(size, 4096UL); 3900 size = min(size, 4096UL);
3901 3901
3902 return max(size, 4UL); 3902 return max(size, 4UL);
3903 } 3903 }
3904 #else 3904 #else
3905 /* 3905 /*
3906 * A zone's size might be changed by hot-add, so it is not possible to determine 3906 * A zone's size might be changed by hot-add, so it is not possible to determine
3907 * a suitable size for its wait_table. So we use the maximum size now. 3907 * a suitable size for its wait_table. So we use the maximum size now.
3908 * 3908 *
3909 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3909 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3910 * 3910 *
3911 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3911 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3912 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3912 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3913 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3913 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3914 * 3914 *
3915 * The maximum number of entries is used when a zone's memory is (512K + 256) pages 3915 * The maximum number of entries is used when a zone's memory is (512K + 256) pages
3916 * or more, computed the traditional way (see above). That corresponds to: 3916 * or more, computed the traditional way (see above). That corresponds to:
3917 * 3917 *
3918 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3918 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3919 * ia64(16K page size) : = ( 8G + 4M)byte. 3919 * ia64(16K page size) : = ( 8G + 4M)byte.
3920 * powerpc (64K page size) : = (32G +16M)byte. 3920 * powerpc (64K page size) : = (32G +16M)byte.
3921 */ 3921 */
3922 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3922 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3923 { 3923 {
3924 return 4096UL; 3924 return 4096UL;
3925 } 3925 }
3926 #endif 3926 #endif
3927 3927
3928 /* 3928 /*
3929 * This is an integer logarithm so that shifts can be used later 3929 * This is an integer logarithm so that shifts can be used later
3930 * to extract the more random high bits from the multiplicative 3930 * to extract the more random high bits from the multiplicative
3931 * hash function before the remainder is taken. 3931 * hash function before the remainder is taken.
3932 */ 3932 */
3933 static inline unsigned long wait_table_bits(unsigned long size) 3933 static inline unsigned long wait_table_bits(unsigned long size)
3934 { 3934 {
3935 return ffz(~size); 3935 return ffz(~size);
3936 } 3936 }
3937 3937
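Putting the two helpers above together: the zone size is divided by PAGES_PER_WAITQUEUE, rounded up to a power of two, clamped to [4, 4096], and wait_table_bits() then yields the log2 of that size. A standalone sketch with an invented 4GB zone of 4K pages, reimplementing the log2 step with a simple loop rather than ffz():

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* Same shape as wait_table_hash_nr_entries() for !CONFIG_MEMORY_HOTPLUG. */
static unsigned long table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

/* Equivalent result to wait_table_bits(): log2 of a power-of-two size. */
static unsigned long table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	unsigned long pages = 1048576;			/* 4GB of 4K pages */
	unsigned long entries = table_entries(pages);	/* 4096 */

	/* prints: 4096 entries, 12 bits */
	printf("%lu entries, %lu bits\n", entries, table_bits(entries));
	return 0;
}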
3938 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3938 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3939 3939
3940 /* 3940 /*
3941 * Check if a pageblock contains reserved pages 3941 * Check if a pageblock contains reserved pages
3942 */ 3942 */
3943 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3943 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3944 { 3944 {
3945 unsigned long pfn; 3945 unsigned long pfn;
3946 3946
3947 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3947 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3948 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3948 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3949 return 1; 3949 return 1;
3950 } 3950 }
3951 return 0; 3951 return 0;
3952 } 3952 }
3953 3953
3954 /* 3954 /*
3955 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3955 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3956 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3956 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3957 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3957 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3958 * higher will lead to a bigger reserve which will get freed as contiguous 3958 * higher will lead to a bigger reserve which will get freed as contiguous
3959 * blocks as reclaim kicks in 3959 * blocks as reclaim kicks in
3960 */ 3960 */
3961 static void setup_zone_migrate_reserve(struct zone *zone) 3961 static void setup_zone_migrate_reserve(struct zone *zone)
3962 { 3962 {
3963 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3963 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3964 struct page *page; 3964 struct page *page;
3965 unsigned long block_migratetype; 3965 unsigned long block_migratetype;
3966 int reserve; 3966 int reserve;
3967 int old_reserve; 3967 int old_reserve;
3968 3968
3969 /* 3969 /*
3970 * Get the start pfn, end pfn and the number of blocks to reserve 3970 * Get the start pfn, end pfn and the number of blocks to reserve
3971 * We have to be careful to be aligned to pageblock_nr_pages to 3971 * We have to be careful to be aligned to pageblock_nr_pages to
3972 * make sure that we always check pfn_valid for the first page in 3972 * make sure that we always check pfn_valid for the first page in
3973 * the block. 3973 * the block.
3974 */ 3974 */
3975 start_pfn = zone->zone_start_pfn; 3975 start_pfn = zone->zone_start_pfn;
3976 end_pfn = zone_end_pfn(zone); 3976 end_pfn = zone_end_pfn(zone);
3977 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3977 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3978 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3978 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3979 pageblock_order; 3979 pageblock_order;
3980 3980
3981 /* 3981 /*
3982 * Reserve blocks are generally in place to help high-order atomic 3982 * Reserve blocks are generally in place to help high-order atomic
3983 * allocations that are short-lived. A min_free_kbytes value that 3983 * allocations that are short-lived. A min_free_kbytes value that
3984 * would result in more than 2 reserve blocks for atomic allocations 3984 * would result in more than 2 reserve blocks for atomic allocations
3985 * is assumed to be in place to help anti-fragmentation for the 3985 * is assumed to be in place to help anti-fragmentation for the
3986 * future allocation of hugepages at runtime. 3986 * future allocation of hugepages at runtime.
3987 */ 3987 */
3988 reserve = min(2, reserve); 3988 reserve = min(2, reserve);
3989 old_reserve = zone->nr_migrate_reserve_block; 3989 old_reserve = zone->nr_migrate_reserve_block;
3990 3990
3991 /* On memory hot-add, we almost always need to do nothing */ 3991 /* On memory hot-add, we almost always need to do nothing */
3992 if (reserve == old_reserve) 3992 if (reserve == old_reserve)
3993 return; 3993 return;
3994 zone->nr_migrate_reserve_block = reserve; 3994 zone->nr_migrate_reserve_block = reserve;
3995 3995
3996 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3996 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3997 if (!pfn_valid(pfn)) 3997 if (!pfn_valid(pfn))
3998 continue; 3998 continue;
3999 page = pfn_to_page(pfn); 3999 page = pfn_to_page(pfn);
4000 4000
4001 /* Watch out for overlapping nodes */ 4001 /* Watch out for overlapping nodes */
4002 if (page_to_nid(page) != zone_to_nid(zone)) 4002 if (page_to_nid(page) != zone_to_nid(zone))
4003 continue; 4003 continue;
4004 4004
4005 block_migratetype = get_pageblock_migratetype(page); 4005 block_migratetype = get_pageblock_migratetype(page);
4006 4006
4007 /* Only test what is necessary when the reserves are not met */ 4007 /* Only test what is necessary when the reserves are not met */
4008 if (reserve > 0) { 4008 if (reserve > 0) {
4009 /* 4009 /*
4010 * Blocks with reserved pages will never be freed, skip 4010 * Blocks with reserved pages will never be freed, skip
4011 * them. 4011 * them.
4012 */ 4012 */
4013 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 4013 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
4014 if (pageblock_is_reserved(pfn, block_end_pfn)) 4014 if (pageblock_is_reserved(pfn, block_end_pfn))
4015 continue; 4015 continue;
4016 4016
4017 /* If this block is reserved, account for it */ 4017 /* If this block is reserved, account for it */
4018 if (block_migratetype == MIGRATE_RESERVE) { 4018 if (block_migratetype == MIGRATE_RESERVE) {
4019 reserve--; 4019 reserve--;
4020 continue; 4020 continue;
4021 } 4021 }
4022 4022
4023 /* Suitable for reserving if this block is movable */ 4023 /* Suitable for reserving if this block is movable */
4024 if (block_migratetype == MIGRATE_MOVABLE) { 4024 if (block_migratetype == MIGRATE_MOVABLE) {
4025 set_pageblock_migratetype(page, 4025 set_pageblock_migratetype(page,
4026 MIGRATE_RESERVE); 4026 MIGRATE_RESERVE);
4027 move_freepages_block(zone, page, 4027 move_freepages_block(zone, page,
4028 MIGRATE_RESERVE); 4028 MIGRATE_RESERVE);
4029 reserve--; 4029 reserve--;
4030 continue; 4030 continue;
4031 } 4031 }
4032 } else if (!old_reserve) { 4032 } else if (!old_reserve) {
4033 /* 4033 /*
4034 * At boot time we don't need to scan the whole zone 4034 * At boot time we don't need to scan the whole zone
4035 * for turning off MIGRATE_RESERVE. 4035 * for turning off MIGRATE_RESERVE.
4036 */ 4036 */
4037 break; 4037 break;
4038 } 4038 }
4039 4039
4040 /* 4040 /*
4041 * If the reserve is met and this is a previously reserved block, 4041 * If the reserve is met and this is a previously reserved block,
4042 * take it back 4042 * take it back
4043 */ 4043 */
4044 if (block_migratetype == MIGRATE_RESERVE) { 4044 if (block_migratetype == MIGRATE_RESERVE) {
4045 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4045 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4046 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4046 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4047 } 4047 }
4048 } 4048 }
4049 } 4049 }
4050 4050
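The reserve count above is just the min watermark rounded up to whole pageblocks and clamped to two blocks. A worked sketch with invented values (pageblock_order 9, i.e. 512-page blocks, and a min watermark of 1500 pages):

#include <stdio.h>

int main(void)
{
	/* Invented values: pageblock_order 9 (512 pages), min watermark 1500. */
	unsigned long pageblock_nr_pages = 1UL << 9;
	unsigned long min_wmark = 1500;
	unsigned long reserve;

	/* roundup(1500, 512) = 1536, and 1536 >> 9 = 3 blocks before clamping */
	reserve = ((min_wmark + pageblock_nr_pages - 1) /
		   pageblock_nr_pages) * pageblock_nr_pages >> 9;
	if (reserve > 2)
		reserve = 2;		/* min(2, reserve), as in the code above */

	printf("%lu reserve blocks\n", reserve);	/* prints 2 */
	return 0;
}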
4051 /* 4051 /*
4052 * Initially all pages are reserved - free ones are freed 4052 * Initially all pages are reserved - free ones are freed
4053 * up by free_all_bootmem() once the early boot process is 4053 * up by free_all_bootmem() once the early boot process is
4054 * done. Non-atomic initialization, single-pass. 4054 * done. Non-atomic initialization, single-pass.
4055 */ 4055 */
4056 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4056 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4057 unsigned long start_pfn, enum memmap_context context) 4057 unsigned long start_pfn, enum memmap_context context)
4058 { 4058 {
4059 struct page *page; 4059 struct page *page;
4060 unsigned long end_pfn = start_pfn + size; 4060 unsigned long end_pfn = start_pfn + size;
4061 unsigned long pfn; 4061 unsigned long pfn;
4062 struct zone *z; 4062 struct zone *z;
4063 4063
4064 if (highest_memmap_pfn < end_pfn - 1) 4064 if (highest_memmap_pfn < end_pfn - 1)
4065 highest_memmap_pfn = end_pfn - 1; 4065 highest_memmap_pfn = end_pfn - 1;
4066 4066
4067 z = &NODE_DATA(nid)->node_zones[zone]; 4067 z = &NODE_DATA(nid)->node_zones[zone];
4068 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4068 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4069 /* 4069 /*
4070 * There can be holes in boot-time mem_map[]s 4070 * There can be holes in boot-time mem_map[]s
4071 * handed to this function. They do not 4071 * handed to this function. They do not
4072 * exist on hotplugged memory. 4072 * exist on hotplugged memory.
4073 */ 4073 */
4074 if (context == MEMMAP_EARLY) { 4074 if (context == MEMMAP_EARLY) {
4075 if (!early_pfn_valid(pfn)) 4075 if (!early_pfn_valid(pfn))
4076 continue; 4076 continue;
4077 if (!early_pfn_in_nid(pfn, nid)) 4077 if (!early_pfn_in_nid(pfn, nid))
4078 continue; 4078 continue;
4079 } 4079 }
4080 page = pfn_to_page(pfn); 4080 page = pfn_to_page(pfn);
4081 set_page_links(page, zone, nid, pfn); 4081 set_page_links(page, zone, nid, pfn);
4082 mminit_verify_page_links(page, zone, nid, pfn); 4082 mminit_verify_page_links(page, zone, nid, pfn);
4083 init_page_count(page); 4083 init_page_count(page);
4084 page_mapcount_reset(page); 4084 page_mapcount_reset(page);
4085 page_nid_reset_last(page); 4085 page_nid_reset_last(page);
4086 SetPageReserved(page); 4086 SetPageReserved(page);
4087 /* 4087 /*
4088 * Mark the block movable so that blocks are reserved for 4088 * Mark the block movable so that blocks are reserved for
4089 * movable at startup. This will force kernel allocations 4089 * movable at startup. This will force kernel allocations
4090 * to reserve their blocks rather than leaking throughout 4090 * to reserve their blocks rather than leaking throughout
4091 * the address space during boot when many long-lived 4091 * the address space during boot when many long-lived
4092 * kernel allocations are made. Later some blocks near 4092 * kernel allocations are made. Later some blocks near
4093 * the start are marked MIGRATE_RESERVE by 4093 * the start are marked MIGRATE_RESERVE by
4094 * setup_zone_migrate_reserve() 4094 * setup_zone_migrate_reserve()
4095 * 4095 *
4096 * The bitmap is created for the zone's valid pfn range, but the memmap 4096 * The bitmap is created for the zone's valid pfn range, but the memmap
4097 * can be created for invalid pages (for alignment). Check here 4097 * can be created for invalid pages (for alignment). Check here
4098 * so that set_pageblock_migratetype() is not called against a 4098 * so that set_pageblock_migratetype() is not called against a
4099 * pfn out of the zone. 4099 * pfn out of the zone.
4100 */ 4100 */
4101 if ((z->zone_start_pfn <= pfn) 4101 if ((z->zone_start_pfn <= pfn)
4102 && (pfn < zone_end_pfn(z)) 4102 && (pfn < zone_end_pfn(z))
4103 && !(pfn & (pageblock_nr_pages - 1))) 4103 && !(pfn & (pageblock_nr_pages - 1)))
4104 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4104 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4105 4105
4106 INIT_LIST_HEAD(&page->lru); 4106 INIT_LIST_HEAD(&page->lru);
4107 #ifdef WANT_PAGE_VIRTUAL 4107 #ifdef WANT_PAGE_VIRTUAL
4108 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4108 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4109 if (!is_highmem_idx(zone)) 4109 if (!is_highmem_idx(zone))
4110 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4110 set_page_address(page, __va(pfn << PAGE_SHIFT));
4111 #endif 4111 #endif
4112 } 4112 }
4113 } 4113 }
4114 4114
4115 static void __meminit zone_init_free_lists(struct zone *zone) 4115 static void __meminit zone_init_free_lists(struct zone *zone)
4116 { 4116 {
4117 unsigned int order, t; 4117 unsigned int order, t;
4118 for_each_migratetype_order(order, t) { 4118 for_each_migratetype_order(order, t) {
4119 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4119 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4120 zone->free_area[order].nr_free = 0; 4120 zone->free_area[order].nr_free = 0;
4121 } 4121 }
4122 } 4122 }
4123 4123
4124 #ifndef __HAVE_ARCH_MEMMAP_INIT 4124 #ifndef __HAVE_ARCH_MEMMAP_INIT
4125 #define memmap_init(size, nid, zone, start_pfn) \ 4125 #define memmap_init(size, nid, zone, start_pfn) \
4126 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4126 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4127 #endif 4127 #endif
4128 4128
4129 static int zone_batchsize(struct zone *zone) 4129 static int zone_batchsize(struct zone *zone)
4130 { 4130 {
4131 #ifdef CONFIG_MMU 4131 #ifdef CONFIG_MMU
4132 int batch; 4132 int batch;
4133 4133
4134 /* 4134 /*
4135 * The per-cpu-pages pools are set to around 1/1000th of the 4135 * The per-cpu-pages pools are set to around 1/1000th of the
4136 * size of the zone. But no more than 1/2 of a meg. 4136 * size of the zone. But no more than 1/2 of a meg.
4137 * 4137 *
4138 * OK, so we don't know how big the cache is. So guess. 4138 * OK, so we don't know how big the cache is. So guess.
4139 */ 4139 */
4140 batch = zone->managed_pages / 1024; 4140 batch = zone->managed_pages / 1024;
4141 if (batch * PAGE_SIZE > 512 * 1024) 4141 if (batch * PAGE_SIZE > 512 * 1024)
4142 batch = (512 * 1024) / PAGE_SIZE; 4142 batch = (512 * 1024) / PAGE_SIZE;
4143 batch /= 4; /* We effectively *= 4 below */ 4143 batch /= 4; /* We effectively *= 4 below */
4144 if (batch < 1) 4144 if (batch < 1)
4145 batch = 1; 4145 batch = 1;
4146 4146
4147 /* 4147 /*
4148 * Clamp the batch to a 2^n - 1 value. Having a power 4148 * Clamp the batch to a 2^n - 1 value. Having a power
4149 * of 2 value was found to be more likely to have 4149 * of 2 value was found to be more likely to have
4150 * suboptimal cache aliasing properties in some cases. 4150 * suboptimal cache aliasing properties in some cases.
4151 * 4151 *
4152 * For example if 2 tasks are alternately allocating 4152 * For example if 2 tasks are alternately allocating
4153 * batches of pages, one task can end up with a lot 4153 * batches of pages, one task can end up with a lot
4154 * of pages of one half of the possible page colors 4154 * of pages of one half of the possible page colors
4155 * and the other with pages of the other colors. 4155 * and the other with pages of the other colors.
4156 */ 4156 */
4157 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4157 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4158 4158
4159 return batch; 4159 return batch;
4160 4160
4161 #else 4161 #else
4162 /* The deferral and batching of frees should be suppressed under NOMMU 4162 /* The deferral and batching of frees should be suppressed under NOMMU
4163 * conditions. 4163 * conditions.
4164 * 4164 *
4165 * The problem is that NOMMU needs to be able to allocate large chunks 4165 * The problem is that NOMMU needs to be able to allocate large chunks
4166 * of contiguous memory as there's no hardware page translation to 4166 * of contiguous memory as there's no hardware page translation to
4167 * assemble apparent contiguous memory from discontiguous pages. 4167 * assemble apparent contiguous memory from discontiguous pages.
4168 * 4168 *
4169 * Queueing large contiguous runs of pages for batching, however, 4169 * Queueing large contiguous runs of pages for batching, however,
4170 * causes the pages to actually be freed in smaller chunks. As there 4170 * causes the pages to actually be freed in smaller chunks. As there
4171 * can be a significant delay between the individual batches being 4171 * can be a significant delay between the individual batches being
4172 * recycled, this leads to the once large chunks of space being 4172 * recycled, this leads to the once large chunks of space being
4173 * fragmented and becoming unavailable for high-order allocations. 4173 * fragmented and becoming unavailable for high-order allocations.
4174 */ 4174 */
4175 return 0; 4175 return 0;
4176 #endif 4176 #endif
4177 } 4177 }
4178 4178
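Tracing the CONFIG_MMU branch above for a hypothetical 1GB zone of 4K pages (illustrative numbers only): 262144 / 1024 = 256, capped to 512K/4K = 128, divided by 4 gives 32, and rounddown_pow_of_two(32 + 16) - 1 leaves a batch of 31. The same steps as a standalone sketch:

#include <stdio.h>

/* Round down to the nearest power of two (v >= 1). */
static unsigned long rounddown_pow_of_two(unsigned long v)
{
	unsigned long p = 1;

	while (p * 2 <= v)
		p *= 2;
	return p;
}

/* Same steps as the CONFIG_MMU branch of zone_batchsize(). */
static int batchsize(unsigned long managed_pages, unsigned long page_size)
{
	unsigned long batch = managed_pages / 1024;

	if (batch * page_size > 512 * 1024)
		batch = (512 * 1024) / page_size;
	batch /= 4;
	if (batch < 1)
		batch = 1;
	return rounddown_pow_of_two(batch + batch / 2) - 1;
}

int main(void)
{
	/* 1GB zone of 4K pages => 262144 managed pages => batch of 31 */
	printf("%d\n", batchsize(262144, 4096));
	return 0;
}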
4179 /* 4179 /*
4180 * pcp->high and pcp->batch values are related and dependent on one another: 4180 * pcp->high and pcp->batch values are related and dependent on one another:
4181 * ->batch must never be higher than ->high. 4181 * ->batch must never be higher than ->high.
4182 * The following function updates them in a safe manner without read side 4182 * The following function updates them in a safe manner without read side
4183 * locking. 4183 * locking.
4184 * 4184 *
4185 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4185 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4186 * those fields changing asynchronously (according to the above rule). 4186 * those fields changing asynchronously (according to the above rule).
4187 * 4187 *
4188 * mutex_is_locked(&pcp_batch_high_lock) is required when calling this function 4188 * mutex_is_locked(&pcp_batch_high_lock) is required when calling this function
4189 * outside of boot time (or some other assurance that no concurrent updaters 4189 * outside of boot time (or some other assurance that no concurrent updaters
4190 * exist). 4190 * exist).
4191 */ 4191 */
4192 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4192 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4193 unsigned long batch) 4193 unsigned long batch)
4194 { 4194 {
4195 /* start with a fail safe value for batch */ 4195 /* start with a fail safe value for batch */
4196 pcp->batch = 1; 4196 pcp->batch = 1;
4197 smp_wmb(); 4197 smp_wmb();
4198 4198
4199 /* Update high, then batch, in order */ 4199 /* Update high, then batch, in order */
4200 pcp->high = high; 4200 pcp->high = high;
4201 smp_wmb(); 4201 smp_wmb();
4202 4202
4203 pcp->batch = batch; 4203 pcp->batch = batch;
4204 } 4204 }
4205 4205
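The store order above means only three combinations ever become visible during an update: the old high with the fail-safe batch of 1, the new high with batch 1, and finally the new pair. A userspace sketch that just prints those intermediate states with invented values (readers must still tolerate the fields changing asynchronously, as the comment above says):

#include <stdio.h>

struct pcp_sketch { unsigned long high, batch; };

/* Mirror of the store order in pageset_update(); the smp_wmb() calls in the
 * kernel force these three states to become visible in this order. */
static void update(struct pcp_sketch *p, unsigned long high, unsigned long batch)
{
	p->batch = 1;		/* fail-safe value first        */
	printf("(high=%lu, batch=%lu)\n", p->high, p->batch);
	p->high = high;		/* then the new high watermark  */
	printf("(high=%lu, batch=%lu)\n", p->high, p->batch);
	p->batch = batch;	/* finally the real batch value */
	printf("(high=%lu, batch=%lu)\n", p->high, p->batch);
}

int main(void)
{
	struct pcp_sketch p = { .high = 186, .batch = 31 };

	/* Shrink the pageset: the visible batch never exceeds the visible high. */
	update(&p, 8, 2);	/* prints (186,1), (8,1), (8,2) */
	return 0;
}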
4206 /* a companion to pageset_set_high() */ 4206 /* a companion to pageset_set_high() */
4207 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4207 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4208 { 4208 {
4209 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4209 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4210 } 4210 }
4211 4211
4212 static void pageset_init(struct per_cpu_pageset *p) 4212 static void pageset_init(struct per_cpu_pageset *p)
4213 { 4213 {
4214 struct per_cpu_pages *pcp; 4214 struct per_cpu_pages *pcp;
4215 int migratetype; 4215 int migratetype;
4216 4216
4217 memset(p, 0, sizeof(*p)); 4217 memset(p, 0, sizeof(*p));
4218 4218
4219 pcp = &p->pcp; 4219 pcp = &p->pcp;
4220 pcp->count = 0; 4220 pcp->count = 0;
4221 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4221 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4222 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4222 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4223 } 4223 }
4224 4224
4225 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4225 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4226 { 4226 {
4227 pageset_init(p); 4227 pageset_init(p);
4228 pageset_set_batch(p, batch); 4228 pageset_set_batch(p, batch);
4229 } 4229 }
4230 4230
4231 /* 4231 /*
4232 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4232 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4233 * to the value high for the pageset p. 4233 * to the value high for the pageset p.
4234 */ 4234 */
4235 static void pageset_set_high(struct per_cpu_pageset *p, 4235 static void pageset_set_high(struct per_cpu_pageset *p,
4236 unsigned long high) 4236 unsigned long high)
4237 { 4237 {
4238 unsigned long batch = max(1UL, high / 4); 4238 unsigned long batch = max(1UL, high / 4);
4239 if ((high / 4) > (PAGE_SHIFT * 8)) 4239 if ((high / 4) > (PAGE_SHIFT * 8))
4240 batch = PAGE_SHIFT * 8; 4240 batch = PAGE_SHIFT * 8;
4241 4241
4242 pageset_update(&p->pcp, high, batch); 4242 pageset_update(&p->pcp, high, batch);
4243 } 4243 }
4244 4244
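So batch follows high / 4 but never drops below 1 and never exceeds PAGE_SHIFT * 8, which is 96 with the 4K pages assumed here. A small sketch of the clamp:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4K pages */

/* Same clamp as pageset_set_high(): batch = high/4, at least 1, at most 96. */
static unsigned long batch_for_high(unsigned long high)
{
	unsigned long batch = high / 4 ? high / 4 : 1;

	if (high / 4 > PAGE_SHIFT * 8)
		batch = PAGE_SHIFT * 8;
	return batch;
}

int main(void)
{
	printf("%lu %lu %lu\n",
	       batch_for_high(2),	/* 1  */
	       batch_for_high(200),	/* 50 */
	       batch_for_high(1000));	/* 96 */
	return 0;
}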
4245 static void pageset_set_high_and_batch(struct zone *zone, 4245 static void pageset_set_high_and_batch(struct zone *zone,
4246 struct per_cpu_pageset *pcp) 4246 struct per_cpu_pageset *pcp)
4247 { 4247 {
4248 if (percpu_pagelist_fraction) 4248 if (percpu_pagelist_fraction)
4249 pageset_set_high(pcp, 4249 pageset_set_high(pcp,
4250 (zone->managed_pages / 4250 (zone->managed_pages /
4251 percpu_pagelist_fraction)); 4251 percpu_pagelist_fraction));
4252 else 4252 else
4253 pageset_set_batch(pcp, zone_batchsize(zone)); 4253 pageset_set_batch(pcp, zone_batchsize(zone));
4254 } 4254 }
4255 4255
4256 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4256 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4257 { 4257 {
4258 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4258 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4259 4259
4260 pageset_init(pcp); 4260 pageset_init(pcp);
4261 pageset_set_high_and_batch(zone, pcp); 4261 pageset_set_high_and_batch(zone, pcp);
4262 } 4262 }
4263 4263
4264 static void __meminit setup_zone_pageset(struct zone *zone) 4264 static void __meminit setup_zone_pageset(struct zone *zone)
4265 { 4265 {
4266 int cpu; 4266 int cpu;
4267 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4267 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4268 for_each_possible_cpu(cpu) 4268 for_each_possible_cpu(cpu)
4269 zone_pageset_init(zone, cpu); 4269 zone_pageset_init(zone, cpu);
4270 } 4270 }
4271 4271
4272 /* 4272 /*
4273 * Allocate per cpu pagesets and initialize them. 4273 * Allocate per cpu pagesets and initialize them.
4274 * Before this call only boot pagesets were available. 4274 * Before this call only boot pagesets were available.
4275 */ 4275 */
4276 void __init setup_per_cpu_pageset(void) 4276 void __init setup_per_cpu_pageset(void)
4277 { 4277 {
4278 struct zone *zone; 4278 struct zone *zone;
4279 4279
4280 for_each_populated_zone(zone) 4280 for_each_populated_zone(zone)
4281 setup_zone_pageset(zone); 4281 setup_zone_pageset(zone);
4282 } 4282 }
4283 4283
4284 static noinline __init_refok 4284 static noinline __init_refok
4285 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4285 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4286 { 4286 {
4287 int i; 4287 int i;
4288 struct pglist_data *pgdat = zone->zone_pgdat; 4288 struct pglist_data *pgdat = zone->zone_pgdat;
4289 size_t alloc_size; 4289 size_t alloc_size;
4290 4290
4291 /* 4291 /*
4292 * The per-page waitqueue mechanism uses hashed waitqueues 4292 * The per-page waitqueue mechanism uses hashed waitqueues
4293 * per zone. 4293 * per zone.
4294 */ 4294 */
4295 zone->wait_table_hash_nr_entries = 4295 zone->wait_table_hash_nr_entries =
4296 wait_table_hash_nr_entries(zone_size_pages); 4296 wait_table_hash_nr_entries(zone_size_pages);
4297 zone->wait_table_bits = 4297 zone->wait_table_bits =
4298 wait_table_bits(zone->wait_table_hash_nr_entries); 4298 wait_table_bits(zone->wait_table_hash_nr_entries);
4299 alloc_size = zone->wait_table_hash_nr_entries 4299 alloc_size = zone->wait_table_hash_nr_entries
4300 * sizeof(wait_queue_head_t); 4300 * sizeof(wait_queue_head_t);
4301 4301
4302 if (!slab_is_available()) { 4302 if (!slab_is_available()) {
4303 zone->wait_table = (wait_queue_head_t *) 4303 zone->wait_table = (wait_queue_head_t *)
4304 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4304 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4305 } else { 4305 } else {
4306 /* 4306 /*
4307 * This case means that a zone whose size was 0 gets new memory 4307 * This case means that a zone whose size was 0 gets new memory
4308 * via memory hot-add. 4308 * via memory hot-add.
4309 * But it may be the case that a new node was hot-added. In 4309 * But it may be the case that a new node was hot-added. In
4310 * this case vmalloc() will not be able to use this new node's 4310 * this case vmalloc() will not be able to use this new node's
4311 * memory - this wait_table must be initialized to use this new 4311 * memory - this wait_table must be initialized to use this new
4312 * node itself as well. 4312 * node itself as well.
4313 * To use this new node's memory, further consideration will be 4313 * To use this new node's memory, further consideration will be
4314 * necessary. 4314 * necessary.
4315 */ 4315 */
4316 zone->wait_table = vmalloc(alloc_size); 4316 zone->wait_table = vmalloc(alloc_size);
4317 } 4317 }
4318 if (!zone->wait_table) 4318 if (!zone->wait_table)
4319 return -ENOMEM; 4319 return -ENOMEM;
4320 4320
4321 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4321 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4322 init_waitqueue_head(zone->wait_table + i); 4322 init_waitqueue_head(zone->wait_table + i);
4323 4323
4324 return 0; 4324 return 0;
4325 } 4325 }
4326 4326
4327 static __meminit void zone_pcp_init(struct zone *zone) 4327 static __meminit void zone_pcp_init(struct zone *zone)
4328 { 4328 {
4329 /* 4329 /*
4330 * per cpu subsystem is not up at this point. The following code 4330 * per cpu subsystem is not up at this point. The following code
4331 * relies on the ability of the linker to provide the 4331 * relies on the ability of the linker to provide the
4332 * offset of a (static) per cpu variable into the per cpu area. 4332 * offset of a (static) per cpu variable into the per cpu area.
4333 */ 4333 */
4334 zone->pageset = &boot_pageset; 4334 zone->pageset = &boot_pageset;
4335 4335
4336 if (zone->present_pages) 4336 if (zone->present_pages)
4337 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4337 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4338 zone->name, zone->present_pages, 4338 zone->name, zone->present_pages,
4339 zone_batchsize(zone)); 4339 zone_batchsize(zone));
4340 } 4340 }
4341 4341
4342 int __meminit init_currently_empty_zone(struct zone *zone, 4342 int __meminit init_currently_empty_zone(struct zone *zone,
4343 unsigned long zone_start_pfn, 4343 unsigned long zone_start_pfn,
4344 unsigned long size, 4344 unsigned long size,
4345 enum memmap_context context) 4345 enum memmap_context context)
4346 { 4346 {
4347 struct pglist_data *pgdat = zone->zone_pgdat; 4347 struct pglist_data *pgdat = zone->zone_pgdat;
4348 int ret; 4348 int ret;
4349 ret = zone_wait_table_init(zone, size); 4349 ret = zone_wait_table_init(zone, size);
4350 if (ret) 4350 if (ret)
4351 return ret; 4351 return ret;
4352 pgdat->nr_zones = zone_idx(zone) + 1; 4352 pgdat->nr_zones = zone_idx(zone) + 1;
4353 4353
4354 zone->zone_start_pfn = zone_start_pfn; 4354 zone->zone_start_pfn = zone_start_pfn;
4355 4355
4356 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4356 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4357 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4357 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4358 pgdat->node_id, 4358 pgdat->node_id,
4359 (unsigned long)zone_idx(zone), 4359 (unsigned long)zone_idx(zone),
4360 zone_start_pfn, (zone_start_pfn + size)); 4360 zone_start_pfn, (zone_start_pfn + size));
4361 4361
4362 zone_init_free_lists(zone); 4362 zone_init_free_lists(zone);
4363 4363
4364 return 0; 4364 return 0;
4365 } 4365 }
4366 4366
4367 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4367 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4368 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4368 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4369 /* 4369 /*
4370 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4370 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4371 * Architectures may implement their own version but if add_active_range() 4371 * Architectures may implement their own version but if add_active_range()
4372 * was used and there are no special requirements, this is a convenient 4372 * was used and there are no special requirements, this is a convenient
4373 * alternative 4373 * alternative
4374 */ 4374 */
4375 int __meminit __early_pfn_to_nid(unsigned long pfn) 4375 int __meminit __early_pfn_to_nid(unsigned long pfn)
4376 { 4376 {
4377 unsigned long start_pfn, end_pfn; 4377 unsigned long start_pfn, end_pfn;
4378 int nid; 4378 int nid;
4379 /* 4379 /*
4380 * NOTE: The following SMP-unsafe globals are only used early in boot 4380 * NOTE: The following SMP-unsafe globals are only used early in boot
4381 * when the kernel is running single-threaded. 4381 * when the kernel is running single-threaded.
4382 */ 4382 */
4383 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4383 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4384 static int __meminitdata last_nid; 4384 static int __meminitdata last_nid;
4385 4385
4386 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4386 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4387 return last_nid; 4387 return last_nid;
4388 4388
4389 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4389 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4390 if (nid != -1) { 4390 if (nid != -1) {
4391 last_start_pfn = start_pfn; 4391 last_start_pfn = start_pfn;
4392 last_end_pfn = end_pfn; 4392 last_end_pfn = end_pfn;
4393 last_nid = nid; 4393 last_nid = nid;
4394 } 4394 }
4395 4395
4396 return nid; 4396 return nid;
4397 } 4397 }
4398 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4398 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4399 4399
4400 int __meminit early_pfn_to_nid(unsigned long pfn) 4400 int __meminit early_pfn_to_nid(unsigned long pfn)
4401 { 4401 {
4402 int nid; 4402 int nid;
4403 4403
4404 nid = __early_pfn_to_nid(pfn); 4404 nid = __early_pfn_to_nid(pfn);
4405 if (nid >= 0) 4405 if (nid >= 0)
4406 return nid; 4406 return nid;
4407 /* just returns 0 */ 4407 /* just returns 0 */
4408 return 0; 4408 return 0;
4409 } 4409 }
4410 4410
4411 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4411 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4412 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4412 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4413 { 4413 {
4414 int nid; 4414 int nid;
4415 4415
4416 nid = __early_pfn_to_nid(pfn); 4416 nid = __early_pfn_to_nid(pfn);
4417 if (nid >= 0 && nid != node) 4417 if (nid >= 0 && nid != node)
4418 return false; 4418 return false;
4419 return true; 4419 return true;
4420 } 4420 }
4421 #endif 4421 #endif
4422 4422
4423 /** 4423 /**
4424 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4424 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4425 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4425 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4426 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4426 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4427 * 4427 *
4428 * If an architecture guarantees that all ranges registered with 4428 * If an architecture guarantees that all ranges registered with
4429 * add_active_ranges() contain no holes and may be freed, this 4429 * add_active_ranges() contain no holes and may be freed, this
4430 * function may be used instead of calling free_bootmem() manually. 4430 * function may be used instead of calling free_bootmem() manually.
4431 */ 4431 */
4432 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4432 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4433 { 4433 {
4434 unsigned long start_pfn, end_pfn; 4434 unsigned long start_pfn, end_pfn;
4435 int i, this_nid; 4435 int i, this_nid;
4436 4436
4437 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4437 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4438 start_pfn = min(start_pfn, max_low_pfn); 4438 start_pfn = min(start_pfn, max_low_pfn);
4439 end_pfn = min(end_pfn, max_low_pfn); 4439 end_pfn = min(end_pfn, max_low_pfn);
4440 4440
4441 if (start_pfn < end_pfn) 4441 if (start_pfn < end_pfn)
4442 free_bootmem_node(NODE_DATA(this_nid), 4442 free_bootmem_node(NODE_DATA(this_nid),
4443 PFN_PHYS(start_pfn), 4443 PFN_PHYS(start_pfn),
4444 (end_pfn - start_pfn) << PAGE_SHIFT); 4444 (end_pfn - start_pfn) << PAGE_SHIFT);
4445 } 4445 }
4446 } 4446 }
4447 4447
4448 /** 4448 /**
4449 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4449 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4450 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4450 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4451 * 4451 *
4452 * If an architecture guarantees that all ranges registered with 4452 * If an architecture guarantees that all ranges registered with
4453 * add_active_ranges() contain no holes and may be freed, this 4453 * add_active_ranges() contain no holes and may be freed, this
4454 * function may be used instead of calling memory_present() manually. 4454 * function may be used instead of calling memory_present() manually.
4455 */ 4455 */
4456 void __init sparse_memory_present_with_active_regions(int nid) 4456 void __init sparse_memory_present_with_active_regions(int nid)
4457 { 4457 {
4458 unsigned long start_pfn, end_pfn; 4458 unsigned long start_pfn, end_pfn;
4459 int i, this_nid; 4459 int i, this_nid;
4460 4460
4461 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4461 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4462 memory_present(this_nid, start_pfn, end_pfn); 4462 memory_present(this_nid, start_pfn, end_pfn);
4463 } 4463 }
4464 4464
4465 /** 4465 /**
4466 * get_pfn_range_for_nid - Return the start and end page frames for a node 4466 * get_pfn_range_for_nid - Return the start and end page frames for a node
4467 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4467 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4468 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4468 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4469 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4469 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4470 * 4470 *
4471 * It returns the start and end page frame of a node based on information 4471 * It returns the start and end page frame of a node based on information
4472 * provided by an arch calling add_active_range(). If called for a node 4472 * provided by an arch calling add_active_range(). If called for a node
4473 * with no available memory, a warning is printed and the start and end 4473 * with no available memory, a warning is printed and the start and end
4474 * PFNs will be 0. 4474 * PFNs will be 0.
4475 */ 4475 */
4476 void __meminit get_pfn_range_for_nid(unsigned int nid, 4476 void __meminit get_pfn_range_for_nid(unsigned int nid,
4477 unsigned long *start_pfn, unsigned long *end_pfn) 4477 unsigned long *start_pfn, unsigned long *end_pfn)
4478 { 4478 {
4479 unsigned long this_start_pfn, this_end_pfn; 4479 unsigned long this_start_pfn, this_end_pfn;
4480 int i; 4480 int i;
4481 4481
4482 *start_pfn = -1UL; 4482 *start_pfn = -1UL;
4483 *end_pfn = 0; 4483 *end_pfn = 0;
4484 4484
4485 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4485 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4486 *start_pfn = min(*start_pfn, this_start_pfn); 4486 *start_pfn = min(*start_pfn, this_start_pfn);
4487 *end_pfn = max(*end_pfn, this_end_pfn); 4487 *end_pfn = max(*end_pfn, this_end_pfn);
4488 } 4488 }
4489 4489
4490 if (*start_pfn == -1UL) 4490 if (*start_pfn == -1UL)
4491 *start_pfn = 0; 4491 *start_pfn = 0;
4492 } 4492 }
4493 4493
4494 /* 4494 /*
4495 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4495 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4496 * assumption is made that zones within a node are ordered in monotonically 4496 * assumption is made that zones within a node are ordered in monotonically
4497 * increasing memory addresses so that the "highest" populated zone is used 4497 * increasing memory addresses so that the "highest" populated zone is used
4498 */ 4498 */
4499 static void __init find_usable_zone_for_movable(void) 4499 static void __init find_usable_zone_for_movable(void)
4500 { 4500 {
4501 int zone_index; 4501 int zone_index;
4502 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4502 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4503 if (zone_index == ZONE_MOVABLE) 4503 if (zone_index == ZONE_MOVABLE)
4504 continue; 4504 continue;
4505 4505
4506 if (arch_zone_highest_possible_pfn[zone_index] > 4506 if (arch_zone_highest_possible_pfn[zone_index] >
4507 arch_zone_lowest_possible_pfn[zone_index]) 4507 arch_zone_lowest_possible_pfn[zone_index])
4508 break; 4508 break;
4509 } 4509 }
4510 4510
4511 VM_BUG_ON(zone_index == -1); 4511 VM_BUG_ON(zone_index == -1);
4512 movable_zone = zone_index; 4512 movable_zone = zone_index;
4513 } 4513 }
4514 4514
4515 /* 4515 /*
4516 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4516 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4517 * because it is sized independent of architecture. Unlike the other zones, 4517 * because it is sized independent of architecture. Unlike the other zones,
4518 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4518 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4519 * in each node depending on the size of each node and how evenly kernelcore 4519 * in each node depending on the size of each node and how evenly kernelcore
4520 * is distributed. This helper function adjusts the zone ranges 4520 * is distributed. This helper function adjusts the zone ranges
4521 * provided by the architecture for a given node by using the end of the 4521 * provided by the architecture for a given node by using the end of the
4522 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4522 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4523 * zones within a node are in order of monotonically increasing memory addresses 4523 * zones within a node are in order of monotonically increasing memory addresses
4524 */ 4524 */
4525 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4525 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4526 unsigned long zone_type, 4526 unsigned long zone_type,
4527 unsigned long node_start_pfn, 4527 unsigned long node_start_pfn,
4528 unsigned long node_end_pfn, 4528 unsigned long node_end_pfn,
4529 unsigned long *zone_start_pfn, 4529 unsigned long *zone_start_pfn,
4530 unsigned long *zone_end_pfn) 4530 unsigned long *zone_end_pfn)
4531 { 4531 {
4532 /* Only adjust if ZONE_MOVABLE is on this node */ 4532 /* Only adjust if ZONE_MOVABLE is on this node */
4533 if (zone_movable_pfn[nid]) { 4533 if (zone_movable_pfn[nid]) {
4534 /* Size ZONE_MOVABLE */ 4534 /* Size ZONE_MOVABLE */
4535 if (zone_type == ZONE_MOVABLE) { 4535 if (zone_type == ZONE_MOVABLE) {
4536 *zone_start_pfn = zone_movable_pfn[nid]; 4536 *zone_start_pfn = zone_movable_pfn[nid];
4537 *zone_end_pfn = min(node_end_pfn, 4537 *zone_end_pfn = min(node_end_pfn,
4538 arch_zone_highest_possible_pfn[movable_zone]); 4538 arch_zone_highest_possible_pfn[movable_zone]);
4539 4539
4540 /* Adjust for ZONE_MOVABLE starting within this range */ 4540 /* Adjust for ZONE_MOVABLE starting within this range */
4541 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4541 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4542 *zone_end_pfn > zone_movable_pfn[nid]) { 4542 *zone_end_pfn > zone_movable_pfn[nid]) {
4543 *zone_end_pfn = zone_movable_pfn[nid]; 4543 *zone_end_pfn = zone_movable_pfn[nid];
4544 4544
4545 /* Check if this whole range is within ZONE_MOVABLE */ 4545 /* Check if this whole range is within ZONE_MOVABLE */
4546 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4546 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4547 *zone_start_pfn = *zone_end_pfn; 4547 *zone_start_pfn = *zone_end_pfn;
4548 } 4548 }
4549 } 4549 }
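adjust_zone_range_for_zone_movable() clips a zone's PFN range against zone_movable_pfn[nid]: ZONE_MOVABLE itself is sized from that boundary up, a zone straddling the boundary is truncated at it, and a zone lying entirely above it becomes empty. A minimal userspace sketch of the same overlap handling follows; the helper name and sample PFNs are invented for illustration and this is not kernel code.

/*
 * Illustrative sketch of the overlap cases handled above.
 */
#include <stdio.h>

static void clip_against_movable(unsigned long movable_start,
                                 unsigned long *zone_start,
                                 unsigned long *zone_end)
{
        if (*zone_start < movable_start && *zone_end > movable_start)
                *zone_end = movable_start;      /* zone straddles the boundary */
        else if (*zone_start >= movable_start)
                *zone_start = *zone_end;        /* zone is entirely movable: empty it */
}

int main(void)
{
        unsigned long start = 0x1000, end = 0x8000;

        clip_against_movable(0x4000, &start, &end);
        printf("clipped zone: [%#lx, %#lx)\n", start, end);     /* [0x1000, 0x4000) */
        return 0;
}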
4550 4550
4551 /* 4551 /*
4552 * Return the number of pages a zone spans in a node, including holes 4552 * Return the number of pages a zone spans in a node, including holes
4553 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4553 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4554 */ 4554 */
4555 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4555 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4556 unsigned long zone_type, 4556 unsigned long zone_type,
4557 unsigned long node_start_pfn, 4557 unsigned long node_start_pfn,
4558 unsigned long node_end_pfn, 4558 unsigned long node_end_pfn,
4559 unsigned long *ignored) 4559 unsigned long *ignored)
4560 { 4560 {
4561 unsigned long zone_start_pfn, zone_end_pfn; 4561 unsigned long zone_start_pfn, zone_end_pfn;
4562 4562
4563 /* Get the start and end of the zone */ 4563 /* Get the start and end of the zone */
4564 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4564 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4565 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4565 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4566 adjust_zone_range_for_zone_movable(nid, zone_type, 4566 adjust_zone_range_for_zone_movable(nid, zone_type,
4567 node_start_pfn, node_end_pfn, 4567 node_start_pfn, node_end_pfn,
4568 &zone_start_pfn, &zone_end_pfn); 4568 &zone_start_pfn, &zone_end_pfn);
4569 4569
4570 /* Check that this node has pages within the zone's required range */ 4570 /* Check that this node has pages within the zone's required range */
4571 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4571 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4572 return 0; 4572 return 0;
4573 4573
4574 /* Move the zone boundaries inside the node if necessary */ 4574 /* Move the zone boundaries inside the node if necessary */
4575 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4575 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4576 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4576 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4577 4577
4578 /* Return the spanned pages */ 4578 /* Return the spanned pages */
4579 return zone_end_pfn - zone_start_pfn; 4579 return zone_end_pfn - zone_start_pfn;
4580 } 4580 }
4581 4581
4582 /* 4582 /*
4583 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4583 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4584 * then all holes in the requested range will be accounted for. 4584 * then all holes in the requested range will be accounted for.
4585 */ 4585 */
4586 unsigned long __meminit __absent_pages_in_range(int nid, 4586 unsigned long __meminit __absent_pages_in_range(int nid,
4587 unsigned long range_start_pfn, 4587 unsigned long range_start_pfn,
4588 unsigned long range_end_pfn) 4588 unsigned long range_end_pfn)
4589 { 4589 {
4590 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4590 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4591 unsigned long start_pfn, end_pfn; 4591 unsigned long start_pfn, end_pfn;
4592 int i; 4592 int i;
4593 4593
4594 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4594 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4595 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4595 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4596 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4596 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4597 nr_absent -= end_pfn - start_pfn; 4597 nr_absent -= end_pfn - start_pfn;
4598 } 4598 }
4599 return nr_absent; 4599 return nr_absent;
4600 } 4600 }
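__absent_pages_in_range() assumes the whole requested range is a hole and then subtracts every piece of present memory after clamping it into the range. A self-contained userspace sketch of that accounting; the pfn_range type and the sample ranges are invented stand-ins for memblock regions.

/*
 * Hole counting sketch: [0, 300) with memory at [0, 100) and [200, 300)
 * leaves a 100-page hole.
 */
#include <stdio.h>

struct pfn_range { unsigned long start, end; };  /* stand-in for memblock regions */

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

static unsigned long absent_pages(const struct pfn_range *mem, int n,
                                  unsigned long range_start, unsigned long range_end)
{
        unsigned long nr_absent = range_end - range_start;

        for (int i = 0; i < n; i++) {
                unsigned long s = clamp_ul(mem[i].start, range_start, range_end);
                unsigned long e = clamp_ul(mem[i].end, range_start, range_end);

                nr_absent -= e - s;     /* present pages are not holes */
        }
        return nr_absent;
}

int main(void)
{
        struct pfn_range mem[] = { { 0, 100 }, { 200, 300 } };

        printf("absent pages: %lu\n", absent_pages(mem, 2, 0, 300));
        return 0;
}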
4601 4601
4602 /** 4602 /**
4603 * absent_pages_in_range - Return number of page frames in holes within a range 4603 * absent_pages_in_range - Return number of page frames in holes within a range
4604 * @start_pfn: The start PFN to start searching for holes 4604 * @start_pfn: The start PFN to start searching for holes
4605 * @end_pfn: The end PFN to stop searching for holes 4605 * @end_pfn: The end PFN to stop searching for holes
4606 * 4606 *
4607 * It returns the number of page frames in memory holes within a range. 4607 * It returns the number of page frames in memory holes within a range.
4608 */ 4608 */
4609 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4609 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4610 unsigned long end_pfn) 4610 unsigned long end_pfn)
4611 { 4611 {
4612 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4612 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4613 } 4613 }
4614 4614
4615 /* Return the number of page frames in holes in a zone on a node */ 4615 /* Return the number of page frames in holes in a zone on a node */
4616 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4616 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4617 unsigned long zone_type, 4617 unsigned long zone_type,
4618 unsigned long node_start_pfn, 4618 unsigned long node_start_pfn,
4619 unsigned long node_end_pfn, 4619 unsigned long node_end_pfn,
4620 unsigned long *ignored) 4620 unsigned long *ignored)
4621 { 4621 {
4622 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4622 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4623 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4623 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4624 unsigned long zone_start_pfn, zone_end_pfn; 4624 unsigned long zone_start_pfn, zone_end_pfn;
4625 4625
4626 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4626 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4627 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4627 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4628 4628
4629 adjust_zone_range_for_zone_movable(nid, zone_type, 4629 adjust_zone_range_for_zone_movable(nid, zone_type,
4630 node_start_pfn, node_end_pfn, 4630 node_start_pfn, node_end_pfn,
4631 &zone_start_pfn, &zone_end_pfn); 4631 &zone_start_pfn, &zone_end_pfn);
4632 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4632 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4633 } 4633 }
4634 4634
4635 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4635 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4636 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4636 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4637 unsigned long zone_type, 4637 unsigned long zone_type,
4638 unsigned long node_start_pfn, 4638 unsigned long node_start_pfn,
4639 unsigned long node_end_pfn, 4639 unsigned long node_end_pfn,
4640 unsigned long *zones_size) 4640 unsigned long *zones_size)
4641 { 4641 {
4642 return zones_size[zone_type]; 4642 return zones_size[zone_type];
4643 } 4643 }
4644 4644
4645 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4645 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4646 unsigned long zone_type, 4646 unsigned long zone_type,
4647 unsigned long node_start_pfn, 4647 unsigned long node_start_pfn,
4648 unsigned long node_end_pfn, 4648 unsigned long node_end_pfn,
4649 unsigned long *zholes_size) 4649 unsigned long *zholes_size)
4650 { 4650 {
4651 if (!zholes_size) 4651 if (!zholes_size)
4652 return 0; 4652 return 0;
4653 4653
4654 return zholes_size[zone_type]; 4654 return zholes_size[zone_type];
4655 } 4655 }
4656 4656
4657 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4657 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4658 4658
4659 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4659 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4660 unsigned long node_start_pfn, 4660 unsigned long node_start_pfn,
4661 unsigned long node_end_pfn, 4661 unsigned long node_end_pfn,
4662 unsigned long *zones_size, 4662 unsigned long *zones_size,
4663 unsigned long *zholes_size) 4663 unsigned long *zholes_size)
4664 { 4664 {
4665 unsigned long realtotalpages, totalpages = 0; 4665 unsigned long realtotalpages, totalpages = 0;
4666 enum zone_type i; 4666 enum zone_type i;
4667 4667
4668 for (i = 0; i < MAX_NR_ZONES; i++) 4668 for (i = 0; i < MAX_NR_ZONES; i++)
4669 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4669 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4670 node_start_pfn, 4670 node_start_pfn,
4671 node_end_pfn, 4671 node_end_pfn,
4672 zones_size); 4672 zones_size);
4673 pgdat->node_spanned_pages = totalpages; 4673 pgdat->node_spanned_pages = totalpages;
4674 4674
4675 realtotalpages = totalpages; 4675 realtotalpages = totalpages;
4676 for (i = 0; i < MAX_NR_ZONES; i++) 4676 for (i = 0; i < MAX_NR_ZONES; i++)
4677 realtotalpages -= 4677 realtotalpages -=
4678 zone_absent_pages_in_node(pgdat->node_id, i, 4678 zone_absent_pages_in_node(pgdat->node_id, i,
4679 node_start_pfn, node_end_pfn, 4679 node_start_pfn, node_end_pfn,
4680 zholes_size); 4680 zholes_size);
4681 pgdat->node_present_pages = realtotalpages; 4681 pgdat->node_present_pages = realtotalpages;
4682 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4682 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4683 realtotalpages); 4683 realtotalpages);
4684 } 4684 }
4685 4685
4686 #ifndef CONFIG_SPARSEMEM 4686 #ifndef CONFIG_SPARSEMEM
4687 /* 4687 /*
4688 * Calculate the size of the zone->blockflags rounded to an unsigned long 4688 * Calculate the size of the zone->blockflags rounded to an unsigned long
4689 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4689 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4690 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock and, finally, 4690 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock and, finally,
4691 * round what is now in bits up to the nearest long in bits, then return it in 4691 * round what is now in bits up to the nearest long in bits, then return it in
4692 * bytes. 4692 * bytes.
4693 */ 4693 */
4694 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4694 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4695 { 4695 {
4696 unsigned long usemapsize; 4696 unsigned long usemapsize;
4697 4697
4698 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4698 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4699 usemapsize = roundup(zonesize, pageblock_nr_pages); 4699 usemapsize = roundup(zonesize, pageblock_nr_pages);
4700 usemapsize = usemapsize >> pageblock_order; 4700 usemapsize = usemapsize >> pageblock_order;
4701 usemapsize *= NR_PAGEBLOCK_BITS; 4701 usemapsize *= NR_PAGEBLOCK_BITS;
4702 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4702 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4703 4703
4704 return usemapsize / 8; 4704 return usemapsize / 8;
4705 } 4705 }
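The arithmetic in usemap_size() is easier to follow with concrete numbers. The sketch below repeats it in userspace with assumed constants (4K pages, order-9 pageblocks, 4 pageblock bits); the real values depend on the kernel configuration.

/*
 * usemap_size() worked example: 262144 pfns -> 512 pageblocks -> 2048 bits
 * -> 256 bytes. Constants below are assumptions for the example.
 */
#include <stdio.h>

#define PAGEBLOCK_ORDER         9
#define PAGEBLOCK_NR_PAGES      (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS       4

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

static unsigned long usemap_size_example(unsigned long zone_start_pfn, unsigned long zonesize)
{
        unsigned long usemapsize;

        zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);  /* account for a misaligned start */
        usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);
        usemapsize >>= PAGEBLOCK_ORDER;                         /* number of pageblocks */
        usemapsize *= NR_PAGEBLOCK_BITS;                        /* bits needed */
        usemapsize = roundup_ul(usemapsize, 8 * sizeof(unsigned long));

        return usemapsize / 8;                                  /* bytes */
}

int main(void)
{
        /* a 1GiB zone with 4K pages spans 262144 pfns */
        printf("usemap bytes: %lu\n", usemap_size_example(0, 262144));
        return 0;
}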
4706 4706
4707 static void __init setup_usemap(struct pglist_data *pgdat, 4707 static void __init setup_usemap(struct pglist_data *pgdat,
4708 struct zone *zone, 4708 struct zone *zone,
4709 unsigned long zone_start_pfn, 4709 unsigned long zone_start_pfn,
4710 unsigned long zonesize) 4710 unsigned long zonesize)
4711 { 4711 {
4712 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4712 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4713 zone->pageblock_flags = NULL; 4713 zone->pageblock_flags = NULL;
4714 if (usemapsize) 4714 if (usemapsize)
4715 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4715 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4716 usemapsize); 4716 usemapsize);
4717 } 4717 }
4718 #else 4718 #else
4719 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4719 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4720 unsigned long zone_start_pfn, unsigned long zonesize) {} 4720 unsigned long zone_start_pfn, unsigned long zonesize) {}
4721 #endif /* CONFIG_SPARSEMEM */ 4721 #endif /* CONFIG_SPARSEMEM */
4722 4722
4723 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4723 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4724 4724
4725 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4725 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4726 void __paginginit set_pageblock_order(void) 4726 void __paginginit set_pageblock_order(void)
4727 { 4727 {
4728 unsigned int order; 4728 unsigned int order;
4729 4729
4730 /* Check that pageblock_nr_pages has not already been set up */ 4730 /* Check that pageblock_nr_pages has not already been set up */
4731 if (pageblock_order) 4731 if (pageblock_order)
4732 return; 4732 return;
4733 4733
4734 if (HPAGE_SHIFT > PAGE_SHIFT) 4734 if (HPAGE_SHIFT > PAGE_SHIFT)
4735 order = HUGETLB_PAGE_ORDER; 4735 order = HUGETLB_PAGE_ORDER;
4736 else 4736 else
4737 order = MAX_ORDER - 1; 4737 order = MAX_ORDER - 1;
4738 4738
4739 /* 4739 /*
4740 * Assume the largest contiguous order of interest is a huge page. 4740 * Assume the largest contiguous order of interest is a huge page.
4741 * This value may be variable depending on boot parameters on IA64 and 4741 * This value may be variable depending on boot parameters on IA64 and
4742 * powerpc. 4742 * powerpc.
4743 */ 4743 */
4744 pageblock_order = order; 4744 pageblock_order = order;
4745 } 4745 }
4746 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4746 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4747 4747
4748 /* 4748 /*
4749 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4749 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4750 * is unused as pageblock_order is set at compile-time. See 4750 * is unused as pageblock_order is set at compile-time. See
4751 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4751 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4752 * the kernel config 4752 * the kernel config
4753 */ 4753 */
4754 void __paginginit set_pageblock_order(void) 4754 void __paginginit set_pageblock_order(void)
4755 { 4755 {
4756 } 4756 }
4757 4757
4758 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4758 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4759 4759
4760 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4760 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4761 unsigned long present_pages) 4761 unsigned long present_pages)
4762 { 4762 {
4763 unsigned long pages = spanned_pages; 4763 unsigned long pages = spanned_pages;
4764 4764
4765 /* 4765 /*
4766 * Provide a more accurate estimation if there are holes within 4766 * Provide a more accurate estimation if there are holes within
4767 * the zone and SPARSEMEM is in use. If there are holes within the 4767 * the zone and SPARSEMEM is in use. If there are holes within the
4768 * zone, each populated memory region may cost us one or two extra 4768 * zone, each populated memory region may cost us one or two extra
4769 * memmap pages due to alignment because memmap pages for each 4769 * memmap pages due to alignment because memmap pages for each
4770 * populated region may not be naturally aligned on a page boundary. 4770 * populated region may not be naturally aligned on a page boundary.
4771 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4771 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4772 */ 4772 */
4773 if (spanned_pages > present_pages + (present_pages >> 4) && 4773 if (spanned_pages > present_pages + (present_pages >> 4) &&
4774 IS_ENABLED(CONFIG_SPARSEMEM)) 4774 IS_ENABLED(CONFIG_SPARSEMEM))
4775 pages = present_pages; 4775 pages = present_pages;
4776 4776
4777 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4777 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4778 } 4778 }
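calc_memmap_size() normally sizes the memmap from spanned pages, but with SPARSEMEM and a zone whose holes exceed present_pages/16 it falls back to present pages. A userspace sketch of the heuristic; sizeof(struct page) is assumed to be 64 bytes purely for the example.

/*
 * memmap sizing sketch: a hole-ridden zone is estimated from present pages.
 */
#include <stdio.h>

#define PAGE_SIZE        4096UL
#define PAGE_SHIFT       12
#define STRUCT_PAGE_SIZE 64UL   /* assumed value for illustration */

static unsigned long page_align(unsigned long x)
{
        return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

static unsigned long memmap_pages(unsigned long spanned, unsigned long present, int sparsemem)
{
        unsigned long pages = spanned;

        if (sparsemem && spanned > present + (present >> 4))
                pages = present;        /* zone is mostly holes: use present pages */

        return page_align(pages * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;
}

int main(void)
{
        /* 1M spanned pfns of which only 512K are present */
        printf("memmap pages: %lu\n", memmap_pages(1UL << 20, 1UL << 19, 1));
        return 0;
}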
4779 4779
4780 /* 4780 /*
4781 * Set up the zone data structures: 4781 * Set up the zone data structures:
4782 * - mark all pages reserved 4782 * - mark all pages reserved
4783 * - mark all memory queues empty 4783 * - mark all memory queues empty
4784 * - clear the memory bitmaps 4784 * - clear the memory bitmaps
4785 * 4785 *
4786 * NOTE: pgdat should get zeroed by caller. 4786 * NOTE: pgdat should get zeroed by caller.
4787 */ 4787 */
4788 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4788 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4789 unsigned long node_start_pfn, unsigned long node_end_pfn, 4789 unsigned long node_start_pfn, unsigned long node_end_pfn,
4790 unsigned long *zones_size, unsigned long *zholes_size) 4790 unsigned long *zones_size, unsigned long *zholes_size)
4791 { 4791 {
4792 enum zone_type j; 4792 enum zone_type j;
4793 int nid = pgdat->node_id; 4793 int nid = pgdat->node_id;
4794 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4794 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4795 int ret; 4795 int ret;
4796 4796
4797 pgdat_resize_init(pgdat); 4797 pgdat_resize_init(pgdat);
4798 #ifdef CONFIG_NUMA_BALANCING 4798 #ifdef CONFIG_NUMA_BALANCING
4799 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4799 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4800 pgdat->numabalancing_migrate_nr_pages = 0; 4800 pgdat->numabalancing_migrate_nr_pages = 0;
4801 pgdat->numabalancing_migrate_next_window = jiffies; 4801 pgdat->numabalancing_migrate_next_window = jiffies;
4802 #endif 4802 #endif
4803 init_waitqueue_head(&pgdat->kswapd_wait); 4803 init_waitqueue_head(&pgdat->kswapd_wait);
4804 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4804 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4805 pgdat_page_cgroup_init(pgdat); 4805 pgdat_page_cgroup_init(pgdat);
4806 4806
4807 for (j = 0; j < MAX_NR_ZONES; j++) { 4807 for (j = 0; j < MAX_NR_ZONES; j++) {
4808 struct zone *zone = pgdat->node_zones + j; 4808 struct zone *zone = pgdat->node_zones + j;
4809 unsigned long size, realsize, freesize, memmap_pages; 4809 unsigned long size, realsize, freesize, memmap_pages;
4810 4810
4811 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4811 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4812 node_end_pfn, zones_size); 4812 node_end_pfn, zones_size);
4813 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4813 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4814 node_start_pfn, 4814 node_start_pfn,
4815 node_end_pfn, 4815 node_end_pfn,
4816 zholes_size); 4816 zholes_size);
4817 4817
4818 /* 4818 /*
4819 * Adjust freesize so that it accounts for how much memory 4819 * Adjust freesize so that it accounts for how much memory
4820 * is used by this zone for memmap. This affects the watermark 4820 * is used by this zone for memmap. This affects the watermark
4821 * and per-cpu initialisations 4821 * and per-cpu initialisations
4822 */ 4822 */
4823 memmap_pages = calc_memmap_size(size, realsize); 4823 memmap_pages = calc_memmap_size(size, realsize);
4824 if (freesize >= memmap_pages) { 4824 if (freesize >= memmap_pages) {
4825 freesize -= memmap_pages; 4825 freesize -= memmap_pages;
4826 if (memmap_pages) 4826 if (memmap_pages)
4827 printk(KERN_DEBUG 4827 printk(KERN_DEBUG
4828 " %s zone: %lu pages used for memmap\n", 4828 " %s zone: %lu pages used for memmap\n",
4829 zone_names[j], memmap_pages); 4829 zone_names[j], memmap_pages);
4830 } else 4830 } else
4831 printk(KERN_WARNING 4831 printk(KERN_WARNING
4832 " %s zone: %lu pages exceeds freesize %lu\n", 4832 " %s zone: %lu pages exceeds freesize %lu\n",
4833 zone_names[j], memmap_pages, freesize); 4833 zone_names[j], memmap_pages, freesize);
4834 4834
4835 /* Account for reserved pages */ 4835 /* Account for reserved pages */
4836 if (j == 0 && freesize > dma_reserve) { 4836 if (j == 0 && freesize > dma_reserve) {
4837 freesize -= dma_reserve; 4837 freesize -= dma_reserve;
4838 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4838 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4839 zone_names[0], dma_reserve); 4839 zone_names[0], dma_reserve);
4840 } 4840 }
4841 4841
4842 if (!is_highmem_idx(j)) 4842 if (!is_highmem_idx(j))
4843 nr_kernel_pages += freesize; 4843 nr_kernel_pages += freesize;
4844 /* Charge for highmem memmap if there are enough kernel pages */ 4844 /* Charge for highmem memmap if there are enough kernel pages */
4845 else if (nr_kernel_pages > memmap_pages * 2) 4845 else if (nr_kernel_pages > memmap_pages * 2)
4846 nr_kernel_pages -= memmap_pages; 4846 nr_kernel_pages -= memmap_pages;
4847 nr_all_pages += freesize; 4847 nr_all_pages += freesize;
4848 4848
4849 zone->spanned_pages = size; 4849 zone->spanned_pages = size;
4850 zone->present_pages = realsize; 4850 zone->present_pages = realsize;
4851 /* 4851 /*
4852 * Set an approximate value for lowmem here, it will be adjusted 4852 * Set an approximate value for lowmem here, it will be adjusted
4853 * when the bootmem allocator frees pages into the buddy system. 4853 * when the bootmem allocator frees pages into the buddy system.
4854 * And all highmem pages will be managed by the buddy system. 4854 * And all highmem pages will be managed by the buddy system.
4855 */ 4855 */
4856 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4856 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4857 #ifdef CONFIG_NUMA 4857 #ifdef CONFIG_NUMA
4858 zone->node = nid; 4858 zone->node = nid;
4859 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4859 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4860 / 100; 4860 / 100;
4861 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4861 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4862 #endif 4862 #endif
4863 zone->name = zone_names[j]; 4863 zone->name = zone_names[j];
4864 spin_lock_init(&zone->lock); 4864 spin_lock_init(&zone->lock);
4865 spin_lock_init(&zone->lru_lock); 4865 spin_lock_init(&zone->lru_lock);
4866 zone_seqlock_init(zone); 4866 zone_seqlock_init(zone);
4867 zone->zone_pgdat = pgdat; 4867 zone->zone_pgdat = pgdat;
4868 zone_pcp_init(zone); 4868 zone_pcp_init(zone);
4869 4869
4870 /* For bootup, initialized properly in watermark setup */ 4870 /* For bootup, initialized properly in watermark setup */
4871 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4871 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4872 4872
4873 lruvec_init(&zone->lruvec); 4873 lruvec_init(&zone->lruvec);
4874 if (!size) 4874 if (!size)
4875 continue; 4875 continue;
4876 4876
4877 set_pageblock_order(); 4877 set_pageblock_order();
4878 setup_usemap(pgdat, zone, zone_start_pfn, size); 4878 setup_usemap(pgdat, zone, zone_start_pfn, size);
4879 ret = init_currently_empty_zone(zone, zone_start_pfn, 4879 ret = init_currently_empty_zone(zone, zone_start_pfn,
4880 size, MEMMAP_EARLY); 4880 size, MEMMAP_EARLY);
4881 BUG_ON(ret); 4881 BUG_ON(ret);
4882 memmap_init(size, nid, j, zone_start_pfn); 4882 memmap_init(size, nid, j, zone_start_pfn);
4883 zone_start_pfn += size; 4883 zone_start_pfn += size;
4884 } 4884 }
4885 } 4885 }
4886 4886
4887 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4887 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4888 { 4888 {
4889 /* Skip empty nodes */ 4889 /* Skip empty nodes */
4890 if (!pgdat->node_spanned_pages) 4890 if (!pgdat->node_spanned_pages)
4891 return; 4891 return;
4892 4892
4893 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4893 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4894 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4894 /* ia64 gets its own node_mem_map, before this, without bootmem */
4895 if (!pgdat->node_mem_map) { 4895 if (!pgdat->node_mem_map) {
4896 unsigned long size, start, end; 4896 unsigned long size, start, end;
4897 struct page *map; 4897 struct page *map;
4898 4898
4899 /* 4899 /*
4900 * The zone's endpoints aren't required to be MAX_ORDER 4900 * The zone's endpoints aren't required to be MAX_ORDER
4901 * aligned but the node_mem_map endpoints must be in order 4901 * aligned but the node_mem_map endpoints must be in order
4902 * for the buddy allocator to function correctly. 4902 * for the buddy allocator to function correctly.
4903 */ 4903 */
4904 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4904 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4905 end = pgdat_end_pfn(pgdat); 4905 end = pgdat_end_pfn(pgdat);
4906 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4906 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4907 size = (end - start) * sizeof(struct page); 4907 size = (end - start) * sizeof(struct page);
4908 map = alloc_remap(pgdat->node_id, size); 4908 map = alloc_remap(pgdat->node_id, size);
4909 if (!map) 4909 if (!map)
4910 map = alloc_bootmem_node_nopanic(pgdat, size); 4910 map = alloc_bootmem_node_nopanic(pgdat, size);
4911 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4911 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4912 } 4912 }
4913 #ifndef CONFIG_NEED_MULTIPLE_NODES 4913 #ifndef CONFIG_NEED_MULTIPLE_NODES
4914 /* 4914 /*
4915 * With no DISCONTIG, the global mem_map is just set as node 0's 4915 * With no DISCONTIG, the global mem_map is just set as node 0's
4916 */ 4916 */
4917 if (pgdat == NODE_DATA(0)) { 4917 if (pgdat == NODE_DATA(0)) {
4918 mem_map = NODE_DATA(0)->node_mem_map; 4918 mem_map = NODE_DATA(0)->node_mem_map;
4919 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4919 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4920 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4920 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4921 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4921 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4922 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4922 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4923 } 4923 }
4924 #endif 4924 #endif
4925 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4925 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4926 } 4926 }
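alloc_node_mem_map() widens the node's PFN range to MAX_ORDER_NR_PAGES boundaries before sizing node_mem_map, so every potential buddy of a page in the node has a struct page behind it. The sketch below shows just that rounding, with invented constants and PFNs.

/*
 * Endpoint rounding sketch: round the first pfn down and the last pfn up to
 * MAX_ORDER-sized blocks. MAX_ORDER_NR_PAGES here is an assumed value.
 */
#include <stdio.h>

#define MAX_ORDER_NR_PAGES      (1UL << 10)     /* e.g. 1024-page buddy blocks */

int main(void)
{
        unsigned long node_start_pfn = 1234567, node_end_pfn = 2345678;
        unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
        unsigned long end = (node_end_pfn + MAX_ORDER_NR_PAGES - 1) &
                            ~(MAX_ORDER_NR_PAGES - 1);

        printf("mem_map covers pfns [%lu, %lu), %lu entries\n",
               start, end, end - start);
        return 0;
}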
4927 4927
4928 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4928 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4929 unsigned long node_start_pfn, unsigned long *zholes_size) 4929 unsigned long node_start_pfn, unsigned long *zholes_size)
4930 { 4930 {
4931 pg_data_t *pgdat = NODE_DATA(nid); 4931 pg_data_t *pgdat = NODE_DATA(nid);
4932 unsigned long start_pfn = 0; 4932 unsigned long start_pfn = 0;
4933 unsigned long end_pfn = 0; 4933 unsigned long end_pfn = 0;
4934 4934
4935 /* pg_data_t should be reset to zero when it's allocated */ 4935 /* pg_data_t should be reset to zero when it's allocated */
4936 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4936 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4937 4937
4938 pgdat->node_id = nid; 4938 pgdat->node_id = nid;
4939 pgdat->node_start_pfn = node_start_pfn; 4939 pgdat->node_start_pfn = node_start_pfn;
4940 if (node_state(nid, N_MEMORY)) 4940 if (node_state(nid, N_MEMORY))
4941 init_zone_allows_reclaim(nid); 4941 init_zone_allows_reclaim(nid);
4942 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4942 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4943 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4943 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4944 #endif 4944 #endif
4945 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4945 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4946 zones_size, zholes_size); 4946 zones_size, zholes_size);
4947 4947
4948 alloc_node_mem_map(pgdat); 4948 alloc_node_mem_map(pgdat);
4949 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4949 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4950 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4950 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4951 nid, (unsigned long)pgdat, 4951 nid, (unsigned long)pgdat,
4952 (unsigned long)pgdat->node_mem_map); 4952 (unsigned long)pgdat->node_mem_map);
4953 #endif 4953 #endif
4954 4954
4955 free_area_init_core(pgdat, start_pfn, end_pfn, 4955 free_area_init_core(pgdat, start_pfn, end_pfn,
4956 zones_size, zholes_size); 4956 zones_size, zholes_size);
4957 } 4957 }
4958 4958
4959 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4959 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4960 4960
4961 #if MAX_NUMNODES > 1 4961 #if MAX_NUMNODES > 1
4962 /* 4962 /*
4963 * Figure out the number of possible node ids. 4963 * Figure out the number of possible node ids.
4964 */ 4964 */
4965 void __init setup_nr_node_ids(void) 4965 void __init setup_nr_node_ids(void)
4966 { 4966 {
4967 unsigned int node; 4967 unsigned int node;
4968 unsigned int highest = 0; 4968 unsigned int highest = 0;
4969 4969
4970 for_each_node_mask(node, node_possible_map) 4970 for_each_node_mask(node, node_possible_map)
4971 highest = node; 4971 highest = node;
4972 nr_node_ids = highest + 1; 4972 nr_node_ids = highest + 1;
4973 } 4973 }
4974 #endif 4974 #endif
4975 4975
4976 /** 4976 /**
4977 * node_map_pfn_alignment - determine the maximum internode alignment 4977 * node_map_pfn_alignment - determine the maximum internode alignment
4978 * 4978 *
4979 * This function should be called after node map is populated and sorted. 4979 * This function should be called after node map is populated and sorted.
4980 * It calculates the maximum power of two alignment which can distinguish 4980 * It calculates the maximum power of two alignment which can distinguish
4981 * all the nodes. 4981 * all the nodes.
4982 * 4982 *
4983 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4983 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4984 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4984 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4985 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4985 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4986 * shifted, 1GiB is enough and this function will indicate so. 4986 * shifted, 1GiB is enough and this function will indicate so.
4987 * 4987 *
4988 * This is used to test whether pfn -> nid mapping of the chosen memory 4988 * This is used to test whether pfn -> nid mapping of the chosen memory
4989 * model has fine enough granularity to avoid incorrect mapping for the 4989 * model has fine enough granularity to avoid incorrect mapping for the
4990 * populated node map. 4990 * populated node map.
4991 * 4991 *
4992 * Returns the determined alignment in PFNs. 0 if there is no alignment 4992 * Returns the determined alignment in PFNs. 0 if there is no alignment
4993 * requirement (single node). 4993 * requirement (single node).
4994 */ 4994 */
4995 unsigned long __init node_map_pfn_alignment(void) 4995 unsigned long __init node_map_pfn_alignment(void)
4996 { 4996 {
4997 unsigned long accl_mask = 0, last_end = 0; 4997 unsigned long accl_mask = 0, last_end = 0;
4998 unsigned long start, end, mask; 4998 unsigned long start, end, mask;
4999 int last_nid = -1; 4999 int last_nid = -1;
5000 int i, nid; 5000 int i, nid;
5001 5001
5002 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 5002 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
5003 if (!start || last_nid < 0 || last_nid == nid) { 5003 if (!start || last_nid < 0 || last_nid == nid) {
5004 last_nid = nid; 5004 last_nid = nid;
5005 last_end = end; 5005 last_end = end;
5006 continue; 5006 continue;
5007 } 5007 }
5008 5008
5009 /* 5009 /*
5010 * Start with a mask granular enough to pin-point to the 5010 * Start with a mask granular enough to pin-point to the
5011 * start pfn and tick off bits one-by-one until it becomes 5011 * start pfn and tick off bits one-by-one until it becomes
5012 * too coarse to separate the current node from the last. 5012 * too coarse to separate the current node from the last.
5013 */ 5013 */
5014 mask = ~((1 << __ffs(start)) - 1); 5014 mask = ~((1 << __ffs(start)) - 1);
5015 while (mask && last_end <= (start & (mask << 1))) 5015 while (mask && last_end <= (start & (mask << 1)))
5016 mask <<= 1; 5016 mask <<= 1;
5017 5017
5018 /* accumulate all internode masks */ 5018 /* accumulate all internode masks */
5019 accl_mask |= mask; 5019 accl_mask |= mask;
5020 } 5020 }
5021 5021
5022 /* convert mask to number of pages */ 5022 /* convert mask to number of pages */
5023 return ~accl_mask + 1; 5023 return ~accl_mask + 1;
5024 } 5024 }
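The mask manipulation in node_map_pfn_alignment() starts from the finest alignment that pins down a region's start PFN and coarsens it while the previous node's end still lands inside the same aligned block. A small userspace reproduction of that loop with made-up PFNs (the kernel's __ffs() is stood in for by a compiler builtin):

/*
 * Internode mask sketch: previous node ends at pfn 0x40000, next starts at
 * 0x50000, so 0x40000-pfn alignment is enough to tell them apart.
 */
#include <stdio.h>

static unsigned long internode_mask(unsigned long last_end, unsigned long start)
{
        /* __builtin_ctzl() plays the role of __ffs(); start must be non-zero */
        unsigned long mask = ~((1UL << __builtin_ctzl(start)) - 1);

        while (mask && last_end <= (start & (mask << 1)))
                mask <<= 1;     /* coarsen while the nodes stay separable */
        return mask;
}

int main(void)
{
        unsigned long mask = internode_mask(0x40000, 0x50000);

        printf("alignment: %lu pfns\n", ~mask + 1);
        return 0;
}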
5025 5025
5026 /* Find the lowest pfn for a node */ 5026 /* Find the lowest pfn for a node */
5027 static unsigned long __init find_min_pfn_for_node(int nid) 5027 static unsigned long __init find_min_pfn_for_node(int nid)
5028 { 5028 {
5029 unsigned long min_pfn = ULONG_MAX; 5029 unsigned long min_pfn = ULONG_MAX;
5030 unsigned long start_pfn; 5030 unsigned long start_pfn;
5031 int i; 5031 int i;
5032 5032
5033 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 5033 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
5034 min_pfn = min(min_pfn, start_pfn); 5034 min_pfn = min(min_pfn, start_pfn);
5035 5035
5036 if (min_pfn == ULONG_MAX) { 5036 if (min_pfn == ULONG_MAX) {
5037 printk(KERN_WARNING 5037 printk(KERN_WARNING
5038 "Could not find start_pfn for node %d\n", nid); 5038 "Could not find start_pfn for node %d\n", nid);
5039 return 0; 5039 return 0;
5040 } 5040 }
5041 5041
5042 return min_pfn; 5042 return min_pfn;
5043 } 5043 }
5044 5044
5045 /** 5045 /**
5046 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5046 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5047 * 5047 *
5048 * It returns the minimum PFN based on information provided via 5048 * It returns the minimum PFN based on information provided via
5049 * add_active_range(). 5049 * add_active_range().
5050 */ 5050 */
5051 unsigned long __init find_min_pfn_with_active_regions(void) 5051 unsigned long __init find_min_pfn_with_active_regions(void)
5052 { 5052 {
5053 return find_min_pfn_for_node(MAX_NUMNODES); 5053 return find_min_pfn_for_node(MAX_NUMNODES);
5054 } 5054 }
5055 5055
5056 /* 5056 /*
5057 * early_calculate_totalpages() 5057 * early_calculate_totalpages()
5058 * Sum pages in active regions for movable zone. 5058 * Sum pages in active regions for movable zone.
5059 * Populate N_MEMORY for calculating usable_nodes. 5059 * Populate N_MEMORY for calculating usable_nodes.
5060 */ 5060 */
5061 static unsigned long __init early_calculate_totalpages(void) 5061 static unsigned long __init early_calculate_totalpages(void)
5062 { 5062 {
5063 unsigned long totalpages = 0; 5063 unsigned long totalpages = 0;
5064 unsigned long start_pfn, end_pfn; 5064 unsigned long start_pfn, end_pfn;
5065 int i, nid; 5065 int i, nid;
5066 5066
5067 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5067 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5068 unsigned long pages = end_pfn - start_pfn; 5068 unsigned long pages = end_pfn - start_pfn;
5069 5069
5070 totalpages += pages; 5070 totalpages += pages;
5071 if (pages) 5071 if (pages)
5072 node_set_state(nid, N_MEMORY); 5072 node_set_state(nid, N_MEMORY);
5073 } 5073 }
5074 return totalpages; 5074 return totalpages;
5075 } 5075 }
5076 5076
5077 /* 5077 /*
5078 * Find the PFN the Movable zone begins in each node. Kernel memory 5078 * Find the PFN the Movable zone begins in each node. Kernel memory
5079 * is spread evenly between nodes as long as the nodes have enough 5079 * is spread evenly between nodes as long as the nodes have enough
5080 * memory. When they don't, some nodes will have more kernelcore than 5080 * memory. When they don't, some nodes will have more kernelcore than
5081 * others 5081 * others
5082 */ 5082 */
5083 static void __init find_zone_movable_pfns_for_nodes(void) 5083 static void __init find_zone_movable_pfns_for_nodes(void)
5084 { 5084 {
5085 int i, nid; 5085 int i, nid;
5086 unsigned long usable_startpfn; 5086 unsigned long usable_startpfn;
5087 unsigned long kernelcore_node, kernelcore_remaining; 5087 unsigned long kernelcore_node, kernelcore_remaining;
5088 /* save the state before borrowing the nodemask */ 5088 /* save the state before borrowing the nodemask */
5089 nodemask_t saved_node_state = node_states[N_MEMORY]; 5089 nodemask_t saved_node_state = node_states[N_MEMORY];
5090 unsigned long totalpages = early_calculate_totalpages(); 5090 unsigned long totalpages = early_calculate_totalpages();
5091 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5091 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5092 5092
5093 /* 5093 /*
5094 * If movablecore was specified, calculate what size of 5094 * If movablecore was specified, calculate what size of
5095 * kernelcore that corresponds so that memory usable for 5095 * kernelcore that corresponds so that memory usable for
5096 * any allocation type is evenly spread. If both kernelcore 5096 * any allocation type is evenly spread. If both kernelcore
5097 * and movablecore are specified, then the value of kernelcore 5097 * and movablecore are specified, then the value of kernelcore
5098 * will be used for required_kernelcore if it's greater than 5098 * will be used for required_kernelcore if it's greater than
5099 * what movablecore would have allowed. 5099 * what movablecore would have allowed.
5100 */ 5100 */
5101 if (required_movablecore) { 5101 if (required_movablecore) {
5102 unsigned long corepages; 5102 unsigned long corepages;
5103 5103
5104 /* 5104 /*
5105 * Round-up so that ZONE_MOVABLE is at least as large as what 5105 * Round-up so that ZONE_MOVABLE is at least as large as what
5106 * was requested by the user 5106 * was requested by the user
5107 */ 5107 */
5108 required_movablecore = 5108 required_movablecore =
5109 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5109 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5110 corepages = totalpages - required_movablecore; 5110 corepages = totalpages - required_movablecore;
5111 5111
5112 required_kernelcore = max(required_kernelcore, corepages); 5112 required_kernelcore = max(required_kernelcore, corepages);
5113 } 5113 }
5114 5114
5115 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5115 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5116 if (!required_kernelcore) 5116 if (!required_kernelcore)
5117 goto out; 5117 goto out;
5118 5118
5119 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5119 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5120 find_usable_zone_for_movable(); 5120 find_usable_zone_for_movable();
5121 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5121 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5122 5122
5123 restart: 5123 restart:
5124 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5124 /* Spread kernelcore memory as evenly as possible throughout nodes */
5125 kernelcore_node = required_kernelcore / usable_nodes; 5125 kernelcore_node = required_kernelcore / usable_nodes;
5126 for_each_node_state(nid, N_MEMORY) { 5126 for_each_node_state(nid, N_MEMORY) {
5127 unsigned long start_pfn, end_pfn; 5127 unsigned long start_pfn, end_pfn;
5128 5128
5129 /* 5129 /*
5130 * Recalculate kernelcore_node if the division per node 5130 * Recalculate kernelcore_node if the division per node
5131 * now exceeds what is necessary to satisfy the requested 5131 * now exceeds what is necessary to satisfy the requested
5132 * amount of memory for the kernel 5132 * amount of memory for the kernel
5133 */ 5133 */
5134 if (required_kernelcore < kernelcore_node) 5134 if (required_kernelcore < kernelcore_node)
5135 kernelcore_node = required_kernelcore / usable_nodes; 5135 kernelcore_node = required_kernelcore / usable_nodes;
5136 5136
5137 /* 5137 /*
5138 * As the map is walked, we track how much memory is usable 5138 * As the map is walked, we track how much memory is usable
5139 * by the kernel using kernelcore_remaining. When it is 5139 * by the kernel using kernelcore_remaining. When it is
5140 * 0, the rest of the node is usable by ZONE_MOVABLE 5140 * 0, the rest of the node is usable by ZONE_MOVABLE
5141 */ 5141 */
5142 kernelcore_remaining = kernelcore_node; 5142 kernelcore_remaining = kernelcore_node;
5143 5143
5144 /* Go through each range of PFNs within this node */ 5144 /* Go through each range of PFNs within this node */
5145 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5145 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5146 unsigned long size_pages; 5146 unsigned long size_pages;
5147 5147
5148 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5148 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5149 if (start_pfn >= end_pfn) 5149 if (start_pfn >= end_pfn)
5150 continue; 5150 continue;
5151 5151
5152 /* Account for what is only usable for kernelcore */ 5152 /* Account for what is only usable for kernelcore */
5153 if (start_pfn < usable_startpfn) { 5153 if (start_pfn < usable_startpfn) {
5154 unsigned long kernel_pages; 5154 unsigned long kernel_pages;
5155 kernel_pages = min(end_pfn, usable_startpfn) 5155 kernel_pages = min(end_pfn, usable_startpfn)
5156 - start_pfn; 5156 - start_pfn;
5157 5157
5158 kernelcore_remaining -= min(kernel_pages, 5158 kernelcore_remaining -= min(kernel_pages,
5159 kernelcore_remaining); 5159 kernelcore_remaining);
5160 required_kernelcore -= min(kernel_pages, 5160 required_kernelcore -= min(kernel_pages,
5161 required_kernelcore); 5161 required_kernelcore);
5162 5162
5163 /* Continue if range is now fully accounted */ 5163 /* Continue if range is now fully accounted */
5164 if (end_pfn <= usable_startpfn) { 5164 if (end_pfn <= usable_startpfn) {
5165 5165
5166 /* 5166 /*
5167 * Push zone_movable_pfn to the end so 5167 * Push zone_movable_pfn to the end so
5168 * that if we have to rebalance 5168 * that if we have to rebalance
5169 * kernelcore across nodes, we will 5169 * kernelcore across nodes, we will
5170 * not double account here 5170 * not double account here
5171 */ 5171 */
5172 zone_movable_pfn[nid] = end_pfn; 5172 zone_movable_pfn[nid] = end_pfn;
5173 continue; 5173 continue;
5174 } 5174 }
5175 start_pfn = usable_startpfn; 5175 start_pfn = usable_startpfn;
5176 } 5176 }
5177 5177
5178 /* 5178 /*
5179 * The usable PFN range for ZONE_MOVABLE is from 5179 * The usable PFN range for ZONE_MOVABLE is from
5180 * start_pfn->end_pfn. Calculate size_pages as the 5180 * start_pfn->end_pfn. Calculate size_pages as the
5181 * number of pages used as kernelcore 5181 * number of pages used as kernelcore
5182 */ 5182 */
5183 size_pages = end_pfn - start_pfn; 5183 size_pages = end_pfn - start_pfn;
5184 if (size_pages > kernelcore_remaining) 5184 if (size_pages > kernelcore_remaining)
5185 size_pages = kernelcore_remaining; 5185 size_pages = kernelcore_remaining;
5186 zone_movable_pfn[nid] = start_pfn + size_pages; 5186 zone_movable_pfn[nid] = start_pfn + size_pages;
5187 5187
5188 /* 5188 /*
5189 * Some kernelcore has been met, update counts and 5189 * Some kernelcore has been met, update counts and
5190 * break if the kernelcore for this node has been 5190 * break if the kernelcore for this node has been
5191 * satisfied 5191 * satisfied
5192 */ 5192 */
5193 required_kernelcore -= min(required_kernelcore, 5193 required_kernelcore -= min(required_kernelcore,
5194 size_pages); 5194 size_pages);
5195 kernelcore_remaining -= size_pages; 5195 kernelcore_remaining -= size_pages;
5196 if (!kernelcore_remaining) 5196 if (!kernelcore_remaining)
5197 break; 5197 break;
5198 } 5198 }
5199 } 5199 }
5200 5200
5201 /* 5201 /*
5202 * If there is still required_kernelcore, we do another pass with one 5202 * If there is still required_kernelcore, we do another pass with one
5203 * less node in the count. This will push zone_movable_pfn[nid] further 5203 * less node in the count. This will push zone_movable_pfn[nid] further
5204 * along on the nodes that still have memory until kernelcore is 5204 * along on the nodes that still have memory until kernelcore is
5205 * satisfied 5205 * satisfied
5206 */ 5206 */
5207 usable_nodes--; 5207 usable_nodes--;
5208 if (usable_nodes && required_kernelcore > usable_nodes) 5208 if (usable_nodes && required_kernelcore > usable_nodes)
5209 goto restart; 5209 goto restart;
5210 5210
5211 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5211 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5212 for (nid = 0; nid < MAX_NUMNODES; nid++) 5212 for (nid = 0; nid < MAX_NUMNODES; nid++)
5213 zone_movable_pfn[nid] = 5213 zone_movable_pfn[nid] =
5214 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5214 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5215 5215
5216 out: 5216 out:
5217 /* restore the node_state */ 5217 /* restore the node_state */
5218 node_states[N_MEMORY] = saved_node_state; 5218 node_states[N_MEMORY] = saved_node_state;
5219 } 5219 }
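The pass structure above is easier to see with two uneven nodes. The toy model below (plain userspace C, deliberately simplified to per-node page counts instead of PFN ranges, so it is only an approximation of the real loop) divides the remaining kernelcore evenly, lets a small node contribute everything it has, and reruns with one node fewer until the requirement is met.

/*
 * Kernelcore spreading toy model: nodes of 1000 and 3000 pages with a
 * kernelcore requirement of 2400 pages. All values are invented.
 */
#include <stdio.h>

int main(void)
{
        unsigned long node_pages[] = { 1000, 3000 };
        unsigned long required_kernelcore = 2400;
        int usable_nodes = 2;

        while (required_kernelcore && usable_nodes) {
                unsigned long kernelcore_node = required_kernelcore / usable_nodes;
                unsigned long satisfied = 0;

                for (int nid = 0; nid < 2; nid++) {
                        unsigned long take = node_pages[nid] < kernelcore_node ?
                                             node_pages[nid] : kernelcore_node;

                        node_pages[nid] -= take;        /* taken as kernelcore, no longer available */
                        satisfied += take;
                }
                required_kernelcore -= satisfied;
                usable_nodes--;         /* mimic the "restart with one less node" rule */
                printf("pass done, still required: %lu\n", required_kernelcore);
        }
        return 0;
}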
5220 5220
5221 /* Any regular or high memory on that node ? */ 5221 /* Any regular or high memory on that node ? */
5222 static void check_for_memory(pg_data_t *pgdat, int nid) 5222 static void check_for_memory(pg_data_t *pgdat, int nid)
5223 { 5223 {
5224 enum zone_type zone_type; 5224 enum zone_type zone_type;
5225 5225
5226 if (N_MEMORY == N_NORMAL_MEMORY) 5226 if (N_MEMORY == N_NORMAL_MEMORY)
5227 return; 5227 return;
5228 5228
5229 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5229 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5230 struct zone *zone = &pgdat->node_zones[zone_type]; 5230 struct zone *zone = &pgdat->node_zones[zone_type];
5231 if (zone->present_pages) { 5231 if (zone->present_pages) {
5232 node_set_state(nid, N_HIGH_MEMORY); 5232 node_set_state(nid, N_HIGH_MEMORY);
5233 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5233 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5234 zone_type <= ZONE_NORMAL) 5234 zone_type <= ZONE_NORMAL)
5235 node_set_state(nid, N_NORMAL_MEMORY); 5235 node_set_state(nid, N_NORMAL_MEMORY);
5236 break; 5236 break;
5237 } 5237 }
5238 } 5238 }
5239 } 5239 }
5240 5240
5241 /** 5241 /**
5242 * free_area_init_nodes - Initialise all pg_data_t and zone data 5242 * free_area_init_nodes - Initialise all pg_data_t and zone data
5243 * @max_zone_pfn: an array of max PFNs for each zone 5243 * @max_zone_pfn: an array of max PFNs for each zone
5244 * 5244 *
5245 * This will call free_area_init_node() for each active node in the system. 5245 * This will call free_area_init_node() for each active node in the system.
5246 * Using the page ranges provided by add_active_range(), the size of each 5246 * Using the page ranges provided by add_active_range(), the size of each
5247 * zone in each node and of its holes is calculated. If the maximum PFNs 5247 * zone in each node and of its holes is calculated. If the maximum PFNs
5248 * of two adjacent zones match, it is assumed that the zone is empty. 5248 * of two adjacent zones match, it is assumed that the zone is empty.
5249 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5249 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5250 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5250 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5251 * starts where the previous one ended. For example, ZONE_DMA32 starts 5251 * starts where the previous one ended. For example, ZONE_DMA32 starts
5252 * at arch_max_dma_pfn. 5252 * at arch_max_dma_pfn.
5253 */ 5253 */
5254 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5254 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5255 { 5255 {
5256 unsigned long start_pfn, end_pfn; 5256 unsigned long start_pfn, end_pfn;
5257 int i, nid; 5257 int i, nid;
5258 5258
5259 /* Record where the zone boundaries are */ 5259 /* Record where the zone boundaries are */
5260 memset(arch_zone_lowest_possible_pfn, 0, 5260 memset(arch_zone_lowest_possible_pfn, 0,
5261 sizeof(arch_zone_lowest_possible_pfn)); 5261 sizeof(arch_zone_lowest_possible_pfn));
5262 memset(arch_zone_highest_possible_pfn, 0, 5262 memset(arch_zone_highest_possible_pfn, 0,
5263 sizeof(arch_zone_highest_possible_pfn)); 5263 sizeof(arch_zone_highest_possible_pfn));
5264 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5264 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5265 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5265 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5266 for (i = 1; i < MAX_NR_ZONES; i++) { 5266 for (i = 1; i < MAX_NR_ZONES; i++) {
5267 if (i == ZONE_MOVABLE) 5267 if (i == ZONE_MOVABLE)
5268 continue; 5268 continue;
5269 arch_zone_lowest_possible_pfn[i] = 5269 arch_zone_lowest_possible_pfn[i] =
5270 arch_zone_highest_possible_pfn[i-1]; 5270 arch_zone_highest_possible_pfn[i-1];
5271 arch_zone_highest_possible_pfn[i] = 5271 arch_zone_highest_possible_pfn[i] =
5272 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5272 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5273 } 5273 }
5274 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5274 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5275 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5275 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5276 5276
5277 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5277 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5278 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5278 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5279 find_zone_movable_pfns_for_nodes(); 5279 find_zone_movable_pfns_for_nodes();
5280 5280
5281 /* Print out the zone ranges */ 5281 /* Print out the zone ranges */
5282 printk("Zone ranges:\n"); 5282 printk("Zone ranges:\n");
5283 for (i = 0; i < MAX_NR_ZONES; i++) { 5283 for (i = 0; i < MAX_NR_ZONES; i++) {
5284 if (i == ZONE_MOVABLE) 5284 if (i == ZONE_MOVABLE)
5285 continue; 5285 continue;
5286 printk(KERN_CONT " %-8s ", zone_names[i]); 5286 printk(KERN_CONT " %-8s ", zone_names[i]);
5287 if (arch_zone_lowest_possible_pfn[i] == 5287 if (arch_zone_lowest_possible_pfn[i] ==
5288 arch_zone_highest_possible_pfn[i]) 5288 arch_zone_highest_possible_pfn[i])
5289 printk(KERN_CONT "empty\n"); 5289 printk(KERN_CONT "empty\n");
5290 else 5290 else
5291 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5291 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5292 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5292 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5293 (arch_zone_highest_possible_pfn[i] 5293 (arch_zone_highest_possible_pfn[i]
5294 << PAGE_SHIFT) - 1); 5294 << PAGE_SHIFT) - 1);
5295 } 5295 }
5296 5296
5297 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5297 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5298 printk("Movable zone start for each node\n"); 5298 printk("Movable zone start for each node\n");
5299 for (i = 0; i < MAX_NUMNODES; i++) { 5299 for (i = 0; i < MAX_NUMNODES; i++) {
5300 if (zone_movable_pfn[i]) 5300 if (zone_movable_pfn[i])
5301 printk(" Node %d: %#010lx\n", i, 5301 printk(" Node %d: %#010lx\n", i,
5302 zone_movable_pfn[i] << PAGE_SHIFT); 5302 zone_movable_pfn[i] << PAGE_SHIFT);
5303 } 5303 }
5304 5304
5305 /* Print out the early node map */ 5305 /* Print out the early node map */
5306 printk("Early memory node ranges\n"); 5306 printk("Early memory node ranges\n");
5307 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5307 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5308 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5308 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5309 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5309 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5310 5310
5311 /* Initialise every node */ 5311 /* Initialise every node */
5312 mminit_verify_pageflags_layout(); 5312 mminit_verify_pageflags_layout();
5313 setup_nr_node_ids(); 5313 setup_nr_node_ids();
5314 for_each_online_node(nid) { 5314 for_each_online_node(nid) {
5315 pg_data_t *pgdat = NODE_DATA(nid); 5315 pg_data_t *pgdat = NODE_DATA(nid);
5316 free_area_init_node(nid, NULL, 5316 free_area_init_node(nid, NULL,
5317 find_min_pfn_for_node(nid), NULL); 5317 find_min_pfn_for_node(nid), NULL);
5318 5318
5319 /* Any memory on that node */ 5319 /* Any memory on that node */
5320 if (pgdat->node_present_pages) 5320 if (pgdat->node_present_pages)
5321 node_set_state(nid, N_MEMORY); 5321 node_set_state(nid, N_MEMORY);
5322 check_for_memory(pgdat, nid); 5322 check_for_memory(pgdat, nid);
5323 } 5323 }
5324 } 5324 }
5325 5325
5326 static int __init cmdline_parse_core(char *p, unsigned long *core) 5326 static int __init cmdline_parse_core(char *p, unsigned long *core)
5327 { 5327 {
5328 unsigned long long coremem; 5328 unsigned long long coremem;
5329 if (!p) 5329 if (!p)
5330 return -EINVAL; 5330 return -EINVAL;
5331 5331
5332 coremem = memparse(p, &p); 5332 coremem = memparse(p, &p);
5333 *core = coremem >> PAGE_SHIFT; 5333 *core = coremem >> PAGE_SHIFT;
5334 5334
5335 /* Paranoid check that UL is enough for the coremem value */ 5335 /* Paranoid check that UL is enough for the coremem value */
5336 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5336 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5337 5337
5338 return 0; 5338 return 0;
5339 } 5339 }
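cmdline_parse_core() relies on memparse() to turn a string such as "512M" into a byte count before shifting it down to pages. The userspace stand-in below only approximates that behaviour (it handles the common K/M/G suffixes) and exists just to make the kernelcore=/movablecore= arithmetic concrete; it is not the kernel's parser.

/*
 * Size-suffix parsing sketch: "512M" -> 131072 pages with 4K pages.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12   /* assume 4K pages for the example */

static unsigned long long parse_size(const char *p)
{
        char *end;
        unsigned long long v = strtoull(p, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10;   break;
        default: break;
        }
        return v;
}

int main(void)
{
        const char *arg = "512M";       /* e.g. kernelcore=512M */

        printf("%s -> %llu pages\n", arg, parse_size(arg) >> PAGE_SHIFT);
        return 0;
}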
5340 5340
5341 /* 5341 /*
5342 * kernelcore=size sets the amount of memory for use for allocations that 5342 * kernelcore=size sets the amount of memory for use for allocations that
5343 * cannot be reclaimed or migrated. 5343 * cannot be reclaimed or migrated.
5344 */ 5344 */
5345 static int __init cmdline_parse_kernelcore(char *p) 5345 static int __init cmdline_parse_kernelcore(char *p)
5346 { 5346 {
5347 return cmdline_parse_core(p, &required_kernelcore); 5347 return cmdline_parse_core(p, &required_kernelcore);
5348 } 5348 }
5349 5349
5350 /* 5350 /*
5351 * movablecore=size sets the amount of memory for use for allocations that 5351 * movablecore=size sets the amount of memory for use for allocations that
5352 * can be reclaimed or migrated. 5352 * can be reclaimed or migrated.
5353 */ 5353 */
5354 static int __init cmdline_parse_movablecore(char *p) 5354 static int __init cmdline_parse_movablecore(char *p)
5355 { 5355 {
5356 return cmdline_parse_core(p, &required_movablecore); 5356 return cmdline_parse_core(p, &required_movablecore);
5357 } 5357 }
5358 5358
5359 early_param("kernelcore", cmdline_parse_kernelcore); 5359 early_param("kernelcore", cmdline_parse_kernelcore);
5360 early_param("movablecore", cmdline_parse_movablecore); 5360 early_param("movablecore", cmdline_parse_movablecore);
5361 5361
5362 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5362 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5363 5363
5364 void adjust_managed_page_count(struct page *page, long count) 5364 void adjust_managed_page_count(struct page *page, long count)
5365 { 5365 {
5366 spin_lock(&managed_page_count_lock); 5366 spin_lock(&managed_page_count_lock);
5367 page_zone(page)->managed_pages += count; 5367 page_zone(page)->managed_pages += count;
5368 totalram_pages += count; 5368 totalram_pages += count;
5369 #ifdef CONFIG_HIGHMEM 5369 #ifdef CONFIG_HIGHMEM
5370 if (PageHighMem(page)) 5370 if (PageHighMem(page))
5371 totalhigh_pages += count; 5371 totalhigh_pages += count;
5372 #endif 5372 #endif
5373 spin_unlock(&managed_page_count_lock); 5373 spin_unlock(&managed_page_count_lock);
5374 } 5374 }
5375 EXPORT_SYMBOL(adjust_managed_page_count); 5375 EXPORT_SYMBOL(adjust_managed_page_count);
5376 5376
5377 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5377 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5378 { 5378 {
5379 void *pos; 5379 void *pos;
5380 unsigned long pages = 0; 5380 unsigned long pages = 0;
5381 5381
5382 start = (void *)PAGE_ALIGN((unsigned long)start); 5382 start = (void *)PAGE_ALIGN((unsigned long)start);
5383 end = (void *)((unsigned long)end & PAGE_MASK); 5383 end = (void *)((unsigned long)end & PAGE_MASK);
5384 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5384 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5385 if ((unsigned int)poison <= 0xFF) 5385 if ((unsigned int)poison <= 0xFF)
5386 memset(pos, poison, PAGE_SIZE); 5386 memset(pos, poison, PAGE_SIZE);
5387 free_reserved_page(virt_to_page(pos)); 5387 free_reserved_page(virt_to_page(pos));
5388 } 5388 }
5389 5389
5390 if (pages && s) 5390 if (pages && s)
5391 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5391 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5392 s, pages << (PAGE_SHIFT - 10), start, end); 5392 s, pages << (PAGE_SHIFT - 10), start, end);
5393 5393
5394 return pages; 5394 return pages;
5395 } 5395 }
5396 EXPORT_SYMBOL(free_reserved_area); 5396 EXPORT_SYMBOL(free_reserved_area);
5397 5397
5398 #ifdef CONFIG_HIGHMEM 5398 #ifdef CONFIG_HIGHMEM
5399 void free_highmem_page(struct page *page) 5399 void free_highmem_page(struct page *page)
5400 { 5400 {
5401 __free_reserved_page(page); 5401 __free_reserved_page(page);
5402 totalram_pages++; 5402 totalram_pages++;
5403 page_zone(page)->managed_pages++; 5403 page_zone(page)->managed_pages++;
5404 totalhigh_pages++; 5404 totalhigh_pages++;
5405 } 5405 }
5406 #endif 5406 #endif
5407 5407
5408 5408
5409 void __init mem_init_print_info(const char *str) 5409 void __init mem_init_print_info(const char *str)
5410 { 5410 {
5411 unsigned long physpages, codesize, datasize, rosize, bss_size; 5411 unsigned long physpages, codesize, datasize, rosize, bss_size;
5412 unsigned long init_code_size, init_data_size; 5412 unsigned long init_code_size, init_data_size;
5413 5413
5414 physpages = get_num_physpages(); 5414 physpages = get_num_physpages();
5415 codesize = _etext - _stext; 5415 codesize = _etext - _stext;
5416 datasize = _edata - _sdata; 5416 datasize = _edata - _sdata;
5417 rosize = __end_rodata - __start_rodata; 5417 rosize = __end_rodata - __start_rodata;
5418 bss_size = __bss_stop - __bss_start; 5418 bss_size = __bss_stop - __bss_start;
5419 init_data_size = __init_end - __init_begin; 5419 init_data_size = __init_end - __init_begin;
5420 init_code_size = _einittext - _sinittext; 5420 init_code_size = _einittext - _sinittext;
5421 5421
5422 /* 5422 /*
5423 * Detect special cases and adjust section sizes accordingly: 5423 * Detect special cases and adjust section sizes accordingly:
5424 * 1) .init.* may be embedded into .data sections 5424 * 1) .init.* may be embedded into .data sections
5425 * 2) .init.text.* may be out of [__init_begin, __init_end], 5425 * 2) .init.text.* may be out of [__init_begin, __init_end],
5426 * please refer to arch/tile/kernel/vmlinux.lds.S. 5426 * please refer to arch/tile/kernel/vmlinux.lds.S.
5427 * 3) .rodata.* may be embedded into .text or .data sections. 5427 * 3) .rodata.* may be embedded into .text or .data sections.
5428 */ 5428 */
5429 #define adj_init_size(start, end, size, pos, adj) \ 5429 #define adj_init_size(start, end, size, pos, adj) \
5430 do { \ 5430 do { \
5431 if (start <= pos && pos < end && size > adj) \ 5431 if (start <= pos && pos < end && size > adj) \
5432 size -= adj; \ 5432 size -= adj; \
5433 } while (0) 5433 } while (0)
5434 5434
5435 adj_init_size(__init_begin, __init_end, init_data_size, 5435 adj_init_size(__init_begin, __init_end, init_data_size,
5436 _sinittext, init_code_size); 5436 _sinittext, init_code_size);
5437 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5437 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5438 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5438 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5439 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5439 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5440 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5440 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5441 5441
5442 #undef adj_init_size 5442 #undef adj_init_size
5443 5443
5444 printk("Memory: %luK/%luK available " 5444 printk("Memory: %luK/%luK available "
5445 "(%luK kernel code, %luK rwdata, %luK rodata, " 5445 "(%luK kernel code, %luK rwdata, %luK rodata, "
5446 "%luK init, %luK bss, %luK reserved" 5446 "%luK init, %luK bss, %luK reserved"
5447 #ifdef CONFIG_HIGHMEM 5447 #ifdef CONFIG_HIGHMEM
5448 ", %luK highmem" 5448 ", %luK highmem"
5449 #endif 5449 #endif
5450 "%s%s)\n", 5450 "%s%s)\n",
5451 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5451 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5452 codesize >> 10, datasize >> 10, rosize >> 10, 5452 codesize >> 10, datasize >> 10, rosize >> 10,
5453 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5453 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5454 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5454 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5455 #ifdef CONFIG_HIGHMEM 5455 #ifdef CONFIG_HIGHMEM
5456 totalhigh_pages << (PAGE_SHIFT-10), 5456 totalhigh_pages << (PAGE_SHIFT-10),
5457 #endif 5457 #endif
5458 str ? ", " : "", str ? str : ""); 5458 str ? ", " : "", str ? str : "");
5459 } 5459 }
5460 5460
5461 /** 5461 /**
5462 * set_dma_reserve - set the specified number of pages reserved in the first zone 5462 * set_dma_reserve - set the specified number of pages reserved in the first zone
5463 * @new_dma_reserve: The number of pages to mark reserved 5463 * @new_dma_reserve: The number of pages to mark reserved
5464 * 5464 *
5465 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5465 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5466 * In the DMA zone, a significant percentage may be consumed by kernel image 5466 * In the DMA zone, a significant percentage may be consumed by kernel image
5467 * and other unfreeable allocations which can skew the watermarks badly. This 5467 * and other unfreeable allocations which can skew the watermarks badly. This
5468 * function may optionally be used to account for unfreeable pages in the 5468 * function may optionally be used to account for unfreeable pages in the
5469 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5469 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5470 * smaller per-cpu batchsize. 5470 * smaller per-cpu batchsize.
5471 */ 5471 */
5472 void __init set_dma_reserve(unsigned long new_dma_reserve) 5472 void __init set_dma_reserve(unsigned long new_dma_reserve)
5473 { 5473 {
5474 dma_reserve = new_dma_reserve; 5474 dma_reserve = new_dma_reserve;
5475 } 5475 }
5476 5476
5477 void __init free_area_init(unsigned long *zones_size) 5477 void __init free_area_init(unsigned long *zones_size)
5478 { 5478 {
5479 free_area_init_node(0, zones_size, 5479 free_area_init_node(0, zones_size,
5480 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5480 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5481 } 5481 }
5482 5482
5483 static int page_alloc_cpu_notify(struct notifier_block *self, 5483 static int page_alloc_cpu_notify(struct notifier_block *self,
5484 unsigned long action, void *hcpu) 5484 unsigned long action, void *hcpu)
5485 { 5485 {
5486 int cpu = (unsigned long)hcpu; 5486 int cpu = (unsigned long)hcpu;
5487 5487
5488 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5488 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5489 lru_add_drain_cpu(cpu); 5489 lru_add_drain_cpu(cpu);
5490 drain_pages(cpu); 5490 drain_pages(cpu);
5491 5491
5492 /* 5492 /*
5493 * Spill the event counters of the dead processor 5493 * Spill the event counters of the dead processor
5494 * into the current processor's event counters. 5494 * into the current processor's event counters.
5495 * This artificially elevates the count of the current 5495 * This artificially elevates the count of the current
5496 * processor. 5496 * processor.
5497 */ 5497 */
5498 vm_events_fold_cpu(cpu); 5498 vm_events_fold_cpu(cpu);
5499 5499
5500 /* 5500 /*
5501 * Zero the differential counters of the dead processor 5501 * Zero the differential counters of the dead processor
5502 * so that the vm statistics are consistent. 5502 * so that the vm statistics are consistent.
5503 * 5503 *
5504 * This is only okay since the processor is dead and cannot 5504 * This is only okay since the processor is dead and cannot
5505 * race with what we are doing. 5505 * race with what we are doing.
5506 */ 5506 */
5507 cpu_vm_stats_fold(cpu); 5507 cpu_vm_stats_fold(cpu);
5508 } 5508 }
5509 return NOTIFY_OK; 5509 return NOTIFY_OK;
5510 } 5510 }
5511 5511
5512 void __init page_alloc_init(void) 5512 void __init page_alloc_init(void)
5513 { 5513 {
5514 hotcpu_notifier(page_alloc_cpu_notify, 0); 5514 hotcpu_notifier(page_alloc_cpu_notify, 0);
5515 } 5515 }
5516 5516
5517 /* 5517 /*
5518 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 5518 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
5519 * or min_free_kbytes changes. 5519 * or min_free_kbytes changes.
5520 */ 5520 */
5521 static void calculate_totalreserve_pages(void) 5521 static void calculate_totalreserve_pages(void)
5522 { 5522 {
5523 struct pglist_data *pgdat; 5523 struct pglist_data *pgdat;
5524 unsigned long reserve_pages = 0; 5524 unsigned long reserve_pages = 0;
5525 enum zone_type i, j; 5525 enum zone_type i, j;
5526 5526
5527 for_each_online_pgdat(pgdat) { 5527 for_each_online_pgdat(pgdat) {
5528 for (i = 0; i < MAX_NR_ZONES; i++) { 5528 for (i = 0; i < MAX_NR_ZONES; i++) {
5529 struct zone *zone = pgdat->node_zones + i; 5529 struct zone *zone = pgdat->node_zones + i;
5530 unsigned long max = 0; 5530 unsigned long max = 0;
5531 5531
5532 /* Find valid and maximum lowmem_reserve in the zone */ 5532 /* Find valid and maximum lowmem_reserve in the zone */
5533 for (j = i; j < MAX_NR_ZONES; j++) { 5533 for (j = i; j < MAX_NR_ZONES; j++) {
5534 if (zone->lowmem_reserve[j] > max) 5534 if (zone->lowmem_reserve[j] > max)
5535 max = zone->lowmem_reserve[j]; 5535 max = zone->lowmem_reserve[j];
5536 } 5536 }
5537 5537
5538 /* we treat the high watermark as reserved pages. */ 5538 /* we treat the high watermark as reserved pages. */
5539 max += high_wmark_pages(zone); 5539 max += high_wmark_pages(zone);
5540 5540
5541 if (max > zone->managed_pages) 5541 if (max > zone->managed_pages)
5542 max = zone->managed_pages; 5542 max = zone->managed_pages;
5543 reserve_pages += max; 5543 reserve_pages += max;
5544 /* 5544 /*
5545 * Lowmem reserves are not available to 5545 * Lowmem reserves are not available to
5546 * GFP_HIGHUSER page cache allocations and 5546 * GFP_HIGHUSER page cache allocations and
5547 * kswapd tries to balance zones to their high 5547 * kswapd tries to balance zones to their high
5548 * watermark. As a result, neither should be 5548 * watermark. As a result, neither should be
5549 * regarded as dirtyable memory, to prevent a 5549 * regarded as dirtyable memory, to prevent a
5550 * situation where reclaim has to clean pages 5550 * situation where reclaim has to clean pages
5551 * in order to balance the zones. 5551 * in order to balance the zones.
5552 */ 5552 */
5553 zone->dirty_balance_reserve = max; 5553 zone->dirty_balance_reserve = max;
5554 } 5554 }
5555 } 5555 }
5556 dirty_balance_reserve = reserve_pages; 5556 dirty_balance_reserve = reserve_pages;
5557 totalreserve_pages = reserve_pages; 5557 totalreserve_pages = reserve_pages;
5558 } 5558 }
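
A minimal userspace sketch of the accounting above, using invented zone values rather than real pglist_data/zone structures: for each zone it takes the largest lowmem_reserve entry, adds the high watermark, caps the result at managed_pages and sums everything into totalreserve_pages.

#include <stdio.h>

#define NR_TOY_ZONES 3

struct toy_zone {
        unsigned long managed_pages;
        unsigned long lowmem_reserve[NR_TOY_ZONES];
        unsigned long high_wmark;
};

int main(void)
{
        /* Hypothetical DMA/Normal/HighMem-like zones with arbitrary numbers. */
        struct toy_zone zones[NR_TOY_ZONES] = {
                { .managed_pages =   4000, .lowmem_reserve = { 0, 1900, 3800 }, .high_wmark =  48 },
                { .managed_pages = 200000, .lowmem_reserve = { 0,    0, 6100 }, .high_wmark = 512 },
                { .managed_pages =  50000, .lowmem_reserve = { 0,    0,    0 }, .high_wmark = 128 },
        };
        unsigned long reserve_pages = 0;

        for (int i = 0; i < NR_TOY_ZONES; i++) {
                unsigned long max = 0;

                /* Find the largest lowmem_reserve for this zone... */
                for (int j = i; j < NR_TOY_ZONES; j++)
                        if (zones[i].lowmem_reserve[j] > max)
                                max = zones[i].lowmem_reserve[j];

                /* ...treat the high watermark as reserved as well... */
                max += zones[i].high_wmark;

                /* ...but never account more than the zone actually manages. */
                if (max > zones[i].managed_pages)
                        max = zones[i].managed_pages;
                reserve_pages += max;
        }
        printf("totalreserve_pages = %lu\n", reserve_pages);
        return 0;
}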
5559 5559
5560 /* 5560 /*
5561 * setup_per_zone_lowmem_reserve - called whenever 5561 * setup_per_zone_lowmem_reserve - called whenever
5562 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 5562 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
5563 * has a correct pages reserved value, so an adequate number of 5563 * has a correct pages reserved value, so an adequate number of
5564 * pages are left in the zone after a successful __alloc_pages(). 5564 * pages are left in the zone after a successful __alloc_pages().
5565 */ 5565 */
5566 static void setup_per_zone_lowmem_reserve(void) 5566 static void setup_per_zone_lowmem_reserve(void)
5567 { 5567 {
5568 struct pglist_data *pgdat; 5568 struct pglist_data *pgdat;
5569 enum zone_type j, idx; 5569 enum zone_type j, idx;
5570 5570
5571 for_each_online_pgdat(pgdat) { 5571 for_each_online_pgdat(pgdat) {
5572 for (j = 0; j < MAX_NR_ZONES; j++) { 5572 for (j = 0; j < MAX_NR_ZONES; j++) {
5573 struct zone *zone = pgdat->node_zones + j; 5573 struct zone *zone = pgdat->node_zones + j;
5574 unsigned long managed_pages = zone->managed_pages; 5574 unsigned long managed_pages = zone->managed_pages;
5575 5575
5576 zone->lowmem_reserve[j] = 0; 5576 zone->lowmem_reserve[j] = 0;
5577 5577
5578 idx = j; 5578 idx = j;
5579 while (idx) { 5579 while (idx) {
5580 struct zone *lower_zone; 5580 struct zone *lower_zone;
5581 5581
5582 idx--; 5582 idx--;
5583 5583
5584 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5584 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5585 sysctl_lowmem_reserve_ratio[idx] = 1; 5585 sysctl_lowmem_reserve_ratio[idx] = 1;
5586 5586
5587 lower_zone = pgdat->node_zones + idx; 5587 lower_zone = pgdat->node_zones + idx;
5588 lower_zone->lowmem_reserve[j] = managed_pages / 5588 lower_zone->lowmem_reserve[j] = managed_pages /
5589 sysctl_lowmem_reserve_ratio[idx]; 5589 sysctl_lowmem_reserve_ratio[idx];
5590 managed_pages += lower_zone->managed_pages; 5590 managed_pages += lower_zone->managed_pages;
5591 } 5591 }
5592 } 5592 }
5593 } 5593 }
5594 5594
5595 /* update totalreserve_pages */ 5595 /* update totalreserve_pages */
5596 calculate_totalreserve_pages(); 5596 calculate_totalreserve_pages();
5597 } 5597 }
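
To make the descending idx loop concrete: each lower zone's lowmem_reserve[j] is the number of pages managed by the zones above it (accumulated as the loop walks down), divided by sysctl_lowmem_reserve_ratio for that lower zone. A standalone sketch with invented zone sizes and assumed ratios (256/256/32 are typical defaults, not values taken from this file):

#include <stdio.h>

#define NR_TOY_ZONES 3  /* pretend: 0 = DMA, 1 = DMA32, 2 = Normal */

int main(void)
{
        unsigned long managed[NR_TOY_ZONES] = { 4000, 250000, 1000000 };
        long ratio[NR_TOY_ZONES] = { 256, 256, 32 };    /* assumed ratios */
        unsigned long reserve[NR_TOY_ZONES][NR_TOY_ZONES] = { { 0 } };

        for (int j = 0; j < NR_TOY_ZONES; j++) {
                unsigned long managed_pages = managed[j];
                int idx = j;

                reserve[j][j] = 0;
                while (idx) {
                        idx--;
                        if (ratio[idx] < 1)
                                ratio[idx] = 1;
                        /* zone idx reserves this many pages against zone-j allocations */
                        reserve[idx][j] = managed_pages / ratio[idx];
                        managed_pages += managed[idx];
                }
        }

        for (int i = 0; i < NR_TOY_ZONES; i++)
                for (int j = i; j < NR_TOY_ZONES; j++)
                        printf("zone %d lowmem_reserve[%d] = %lu\n",
                               i, j, reserve[i][j]);
        return 0;
}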
5598 5598
5599 static void __setup_per_zone_wmarks(void) 5599 static void __setup_per_zone_wmarks(void)
5600 { 5600 {
5601 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5601 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5602 unsigned long lowmem_pages = 0; 5602 unsigned long lowmem_pages = 0;
5603 struct zone *zone; 5603 struct zone *zone;
5604 unsigned long flags; 5604 unsigned long flags;
5605 5605
5606 /* Calculate total number of !ZONE_HIGHMEM pages */ 5606 /* Calculate total number of !ZONE_HIGHMEM pages */
5607 for_each_zone(zone) { 5607 for_each_zone(zone) {
5608 if (!is_highmem(zone)) 5608 if (!is_highmem(zone))
5609 lowmem_pages += zone->managed_pages; 5609 lowmem_pages += zone->managed_pages;
5610 } 5610 }
5611 5611
5612 for_each_zone(zone) { 5612 for_each_zone(zone) {
5613 u64 tmp; 5613 u64 tmp;
5614 5614
5615 spin_lock_irqsave(&zone->lock, flags); 5615 spin_lock_irqsave(&zone->lock, flags);
5616 tmp = (u64)pages_min * zone->managed_pages; 5616 tmp = (u64)pages_min * zone->managed_pages;
5617 do_div(tmp, lowmem_pages); 5617 do_div(tmp, lowmem_pages);
5618 if (is_highmem(zone)) { 5618 if (is_highmem(zone)) {
5619 /* 5619 /*
5620 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5620 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5621 * need highmem pages, so cap pages_min to a small 5621 * need highmem pages, so cap pages_min to a small
5622 * value here. 5622 * value here.
5623 * 5623 *
5624 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5624 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5625 * deltas control async page reclaim, and so should 5625 * deltas control async page reclaim, and so should
5626 * not be capped for highmem. 5626 * not be capped for highmem.
5627 */ 5627 */
5628 unsigned long min_pages; 5628 unsigned long min_pages;
5629 5629
5630 min_pages = zone->managed_pages / 1024; 5630 min_pages = zone->managed_pages / 1024;
5631 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5631 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5632 zone->watermark[WMARK_MIN] = min_pages; 5632 zone->watermark[WMARK_MIN] = min_pages;
5633 } else { 5633 } else {
5634 /* 5634 /*
5635 * If it's a lowmem zone, reserve a number of pages 5635 * If it's a lowmem zone, reserve a number of pages
5636 * proportionate to the zone's size. 5636 * proportionate to the zone's size.
5637 */ 5637 */
5638 zone->watermark[WMARK_MIN] = tmp; 5638 zone->watermark[WMARK_MIN] = tmp;
5639 } 5639 }
5640 5640
5641 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5641 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5642 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5642 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5643 5643
5644 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5644 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5645 high_wmark_pages(zone) - 5645 high_wmark_pages(zone) -
5646 low_wmark_pages(zone) - 5646 low_wmark_pages(zone) -
5647 zone_page_state(zone, NR_ALLOC_BATCH)); 5647 zone_page_state(zone, NR_ALLOC_BATCH));
5648 5648
5649 setup_zone_migrate_reserve(zone); 5649 setup_zone_migrate_reserve(zone);
5650 spin_unlock_irqrestore(&zone->lock, flags); 5650 spin_unlock_irqrestore(&zone->lock, flags);
5651 } 5651 }
5652 5652
5653 /* update totalreserve_pages */ 5653 /* update totalreserve_pages */
5654 calculate_totalreserve_pages(); 5654 calculate_totalreserve_pages();
5655 } 5655 }
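
The arithmetic above is easy to reproduce by hand: each zone's share is tmp = pages_min * managed_pages / lowmem_pages; lowmem zones use that directly as WMARK_MIN, highmem zones clamp WMARK_MIN to a small value instead, and both then get WMARK_LOW = WMARK_MIN + tmp/4 and WMARK_HIGH = WMARK_MIN + tmp/2. A hedged userspace illustration with invented sizes; it assumes 4K pages, min_free_kbytes = 4096 and SWAP_CLUSTER_MAX = 32, none of which come from the code above:

#include <stdio.h>

/* Mirror of clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 32 is an assumption. */
static unsigned long clampul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        /* Assumed: min_free_kbytes = 4096, PAGE_SHIFT = 12. */
        unsigned long pages_min = 4096 >> (12 - 10);
        unsigned long managed[2] = { 1000000, 200000 };  /* lowmem, highmem */
        int is_highmem[2] = { 0, 1 };
        unsigned long lowmem_pages = managed[0];         /* only !highmem zones count */

        for (int z = 0; z < 2; z++) {
                unsigned long long tmp = (unsigned long long)pages_min * managed[z];
                unsigned long wmark_min, wmark_low, wmark_high;

                tmp /= lowmem_pages;
                if (is_highmem[z])
                        wmark_min = clampul(managed[z] / 1024, 32, 128);
                else
                        wmark_min = (unsigned long)tmp;

                wmark_low  = wmark_min + (unsigned long)(tmp >> 2);
                wmark_high = wmark_min + (unsigned long)(tmp >> 1);
                printf("zone %d: min=%lu low=%lu high=%lu\n",
                       z, wmark_min, wmark_low, wmark_high);
        }
        return 0;
}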
5656 5656
5657 /** 5657 /**
5658 * setup_per_zone_wmarks - called when min_free_kbytes changes 5658 * setup_per_zone_wmarks - called when min_free_kbytes changes
5659 * or when memory is hot-{added|removed} 5659 * or when memory is hot-{added|removed}
5660 * 5660 *
5661 * Ensures that the watermark[min,low,high] values for each zone are set 5661 * Ensures that the watermark[min,low,high] values for each zone are set
5662 * correctly with respect to min_free_kbytes. 5662 * correctly with respect to min_free_kbytes.
5663 */ 5663 */
5664 void setup_per_zone_wmarks(void) 5664 void setup_per_zone_wmarks(void)
5665 { 5665 {
5666 mutex_lock(&zonelists_mutex); 5666 mutex_lock(&zonelists_mutex);
5667 __setup_per_zone_wmarks(); 5667 __setup_per_zone_wmarks();
5668 mutex_unlock(&zonelists_mutex); 5668 mutex_unlock(&zonelists_mutex);
5669 } 5669 }
5670 5670
5671 /* 5671 /*
5672 * The inactive anon list should be small enough that the VM never has to 5672 * The inactive anon list should be small enough that the VM never has to
5673 * do too much work, but large enough that each inactive page has a chance 5673 * do too much work, but large enough that each inactive page has a chance
5674 * to be referenced again before it is swapped out. 5674 * to be referenced again before it is swapped out.
5675 * 5675 *
5676 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5676 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5677 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5677 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5678 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5678 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5679 * the anonymous pages are kept on the inactive list. 5679 * the anonymous pages are kept on the inactive list.
5680 * 5680 *
5681 * total target max 5681 * total target max
5682 * memory ratio inactive anon 5682 * memory ratio inactive anon
5683 * ------------------------------------- 5683 * -------------------------------------
5684 * 10MB 1 5MB 5684 * 10MB 1 5MB
5685 * 100MB 1 50MB 5685 * 100MB 1 50MB
5686 * 1GB 3 250MB 5686 * 1GB 3 250MB
5687 * 10GB 10 0.9GB 5687 * 10GB 10 0.9GB
5688 * 100GB 31 3GB 5688 * 100GB 31 3GB
5689 * 1TB 101 10GB 5689 * 1TB 101 10GB
5690 * 10TB 320 32GB 5690 * 10TB 320 32GB
5691 */ 5691 */
5692 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5692 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5693 { 5693 {
5694 unsigned int gb, ratio; 5694 unsigned int gb, ratio;
5695 5695
5696 /* Zone size in gigabytes */ 5696 /* Zone size in gigabytes */
5697 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5697 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5698 if (gb) 5698 if (gb)
5699 ratio = int_sqrt(10 * gb); 5699 ratio = int_sqrt(10 * gb);
5700 else 5700 else
5701 ratio = 1; 5701 ratio = 1;
5702 5702
5703 zone->inactive_ratio = ratio; 5703 zone->inactive_ratio = ratio;
5704 } 5704 }
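
The table in the comment block above falls straight out of ratio = int_sqrt(10 * gb), with a floor of 1 for zones below a gigabyte. A quick standalone check, with a naive integer square root standing in for the kernel's int_sqrt():

#include <stdio.h>

/* Integer square root: largest x with x * x <= n. */
static unsigned long int_sqrt(unsigned long n)
{
        unsigned long x = 0;

        while ((x + 1) * (x + 1) <= n)
                x++;
        return x;
}

int main(void)
{
        /* Zone sizes in gigabytes, as in the table above. */
        unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };

        for (unsigned i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
                unsigned long gb = sizes_gb[i];
                unsigned long ratio = gb ? int_sqrt(10 * gb) : 1;

                printf("%6luGB -> inactive_ratio %lu\n", gb, ratio);
        }
        return 0;
}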
5705 5705
5706 static void __meminit setup_per_zone_inactive_ratio(void) 5706 static void __meminit setup_per_zone_inactive_ratio(void)
5707 { 5707 {
5708 struct zone *zone; 5708 struct zone *zone;
5709 5709
5710 for_each_zone(zone) 5710 for_each_zone(zone)
5711 calculate_zone_inactive_ratio(zone); 5711 calculate_zone_inactive_ratio(zone);
5712 } 5712 }
5713 5713
5714 /* 5714 /*
5715 * Initialise min_free_kbytes. 5715 * Initialise min_free_kbytes.
5716 * 5716 *
5717 * For small machines we want it small (128k min). For large machines 5717 * For small machines we want it small (128k min). For large machines
5718 * we want it large (64MB max). But it is not linear, because network 5718 * we want it large (64MB max). But it is not linear, because network
5719 * bandwidth does not increase linearly with machine size. We use 5719 * bandwidth does not increase linearly with machine size. We use
5720 * 5720 *
5721 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5721 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5722 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5722 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5723 * 5723 *
5724 * which yields 5724 * which yields
5725 * 5725 *
5726 * 16MB: 512k 5726 * 16MB: 512k
5727 * 32MB: 724k 5727 * 32MB: 724k
5728 * 64MB: 1024k 5728 * 64MB: 1024k
5729 * 128MB: 1448k 5729 * 128MB: 1448k
5730 * 256MB: 2048k 5730 * 256MB: 2048k
5731 * 512MB: 2896k 5731 * 512MB: 2896k
5732 * 1024MB: 4096k 5732 * 1024MB: 4096k
5733 * 2048MB: 5792k 5733 * 2048MB: 5792k
5734 * 4096MB: 8192k 5734 * 4096MB: 8192k
5735 * 8192MB: 11584k 5735 * 8192MB: 11584k
5736 * 16384MB: 16384k 5736 * 16384MB: 16384k
5737 */ 5737 */
5738 int __meminit init_per_zone_wmark_min(void) 5738 int __meminit init_per_zone_wmark_min(void)
5739 { 5739 {
5740 unsigned long lowmem_kbytes; 5740 unsigned long lowmem_kbytes;
5741 int new_min_free_kbytes; 5741 int new_min_free_kbytes;
5742 5742
5743 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5743 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5744 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5744 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5745 5745
5746 if (new_min_free_kbytes > user_min_free_kbytes) { 5746 if (new_min_free_kbytes > user_min_free_kbytes) {
5747 min_free_kbytes = new_min_free_kbytes; 5747 min_free_kbytes = new_min_free_kbytes;
5748 if (min_free_kbytes < 128) 5748 if (min_free_kbytes < 128)
5749 min_free_kbytes = 128; 5749 min_free_kbytes = 128;
5750 if (min_free_kbytes > 65536) 5750 if (min_free_kbytes > 65536)
5751 min_free_kbytes = 65536; 5751 min_free_kbytes = 65536;
5752 } else { 5752 } else {
5753 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5753 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5754 new_min_free_kbytes, user_min_free_kbytes); 5754 new_min_free_kbytes, user_min_free_kbytes);
5755 } 5755 }
5756 setup_per_zone_wmarks(); 5756 setup_per_zone_wmarks();
5757 refresh_zone_stat_thresholds(); 5757 refresh_zone_stat_thresholds();
5758 setup_per_zone_lowmem_reserve(); 5758 setup_per_zone_lowmem_reserve();
5759 setup_per_zone_inactive_ratio(); 5759 setup_per_zone_inactive_ratio();
5760 return 0; 5760 return 0;
5761 } 5761 }
5762 module_init(init_per_zone_wmark_min) 5762 module_init(init_per_zone_wmark_min)
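
Likewise, the min_free_kbytes table documented above init_per_zone_wmark_min() follows from int_sqrt(lowmem_kbytes * 16) clamped to [128, 65536]. A standalone spot-check over a few of the listed sizes, again with a naive integer square root in place of the kernel helper:

#include <stdio.h>

static unsigned long int_sqrt(unsigned long long n)
{
        unsigned long long x = 0;

        while ((x + 1) * (x + 1) <= n)
                x++;
        return (unsigned long)x;
}

int main(void)
{
        /* A few lowmem sizes from the table above, in megabytes. */
        unsigned long sizes_mb[] = { 16, 64, 256, 1024, 4096, 16384 };

        for (unsigned i = 0; i < sizeof(sizes_mb) / sizeof(sizes_mb[0]); i++) {
                unsigned long lowmem_kbytes = sizes_mb[i] * 1024;
                unsigned long min_free = int_sqrt((unsigned long long)lowmem_kbytes * 16);

                /* Same clamp as the function above. */
                if (min_free < 128)
                        min_free = 128;
                if (min_free > 65536)
                        min_free = 65536;
                printf("%6luMB lowmem -> min_free_kbytes %luk\n",
                       sizes_mb[i], min_free);
        }
        return 0;
}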
5763 5763
5764 /* 5764 /*
5765 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5765 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5766 * that we can call two helper functions whenever min_free_kbytes 5766 * that we can call two helper functions whenever min_free_kbytes
5767 * changes. 5767 * changes.
5768 */ 5768 */
5769 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5769 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5770 void __user *buffer, size_t *length, loff_t *ppos) 5770 void __user *buffer, size_t *length, loff_t *ppos)
5771 { 5771 {
5772 int rc; 5772 int rc;
5773 5773
5774 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5774 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5775 if (rc) 5775 if (rc)
5776 return rc; 5776 return rc;
5777 5777
5778 if (write) { 5778 if (write) {
5779 user_min_free_kbytes = min_free_kbytes; 5779 user_min_free_kbytes = min_free_kbytes;
5780 setup_per_zone_wmarks(); 5780 setup_per_zone_wmarks();
5781 } 5781 }
5782 return 0; 5782 return 0;
5783 } 5783 }
5784 5784
5785 #ifdef CONFIG_NUMA 5785 #ifdef CONFIG_NUMA
5786 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5786 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5787 void __user *buffer, size_t *length, loff_t *ppos) 5787 void __user *buffer, size_t *length, loff_t *ppos)
5788 { 5788 {
5789 struct zone *zone; 5789 struct zone *zone;
5790 int rc; 5790 int rc;
5791 5791
5792 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5792 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5793 if (rc) 5793 if (rc)
5794 return rc; 5794 return rc;
5795 5795
5796 for_each_zone(zone) 5796 for_each_zone(zone)
5797 zone->min_unmapped_pages = (zone->managed_pages * 5797 zone->min_unmapped_pages = (zone->managed_pages *
5798 sysctl_min_unmapped_ratio) / 100; 5798 sysctl_min_unmapped_ratio) / 100;
5799 return 0; 5799 return 0;
5800 } 5800 }
5801 5801
5802 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5802 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5803 void __user *buffer, size_t *length, loff_t *ppos) 5803 void __user *buffer, size_t *length, loff_t *ppos)
5804 { 5804 {
5805 struct zone *zone; 5805 struct zone *zone;
5806 int rc; 5806 int rc;
5807 5807
5808 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5808 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5809 if (rc) 5809 if (rc)
5810 return rc; 5810 return rc;
5811 5811
5812 for_each_zone(zone) 5812 for_each_zone(zone)
5813 zone->min_slab_pages = (zone->managed_pages * 5813 zone->min_slab_pages = (zone->managed_pages *
5814 sysctl_min_slab_ratio) / 100; 5814 sysctl_min_slab_ratio) / 100;
5815 return 0; 5815 return 0;
5816 } 5816 }
5817 #endif 5817 #endif
5818 5818
5819 /* 5819 /*
5820 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5820 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5821 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5821 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5822 * whenever sysctl_lowmem_reserve_ratio changes. 5822 * whenever sysctl_lowmem_reserve_ratio changes.
5823 * 5823 *
5824 * The reserve ratio obviously has absolutely no relation with the 5824 * The reserve ratio obviously has absolutely no relation with the
5825 * minimum watermarks. The lowmem reserve ratio can only make sense 5825 * minimum watermarks. The lowmem reserve ratio can only make sense
5826 * as a function of the boot time zone sizes. 5826 * as a function of the boot time zone sizes.
5827 */ 5827 */
5828 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5828 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5829 void __user *buffer, size_t *length, loff_t *ppos) 5829 void __user *buffer, size_t *length, loff_t *ppos)
5830 { 5830 {
5831 proc_dointvec_minmax(table, write, buffer, length, ppos); 5831 proc_dointvec_minmax(table, write, buffer, length, ppos);
5832 setup_per_zone_lowmem_reserve(); 5832 setup_per_zone_lowmem_reserve();
5833 return 0; 5833 return 0;
5834 } 5834 }
5835 5835
5836 /* 5836 /*
5837 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5837 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5838 * cpu. It is the fraction of total pages in each zone that a hot per-cpu 5838 * cpu. It is the fraction of total pages in each zone that a hot per-cpu
5839 * pagelist can have before it gets flushed back to the buddy allocator. 5839 * pagelist can have before it gets flushed back to the buddy allocator.
5840 */ 5840 */
5841 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5841 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5842 void __user *buffer, size_t *length, loff_t *ppos) 5842 void __user *buffer, size_t *length, loff_t *ppos)
5843 { 5843 {
5844 struct zone *zone; 5844 struct zone *zone;
5845 int old_percpu_pagelist_fraction; 5845 int old_percpu_pagelist_fraction;
5846 int ret; 5846 int ret;
5847 5847
5848 mutex_lock(&pcp_batch_high_lock); 5848 mutex_lock(&pcp_batch_high_lock);
5849 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 5849 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5850 5850
5851 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5851 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5852 if (!write || ret < 0) 5852 if (!write || ret < 0)
5853 goto out; 5853 goto out;
5854 5854
5855 /* Sanity checking to avoid pcp imbalance */ 5855 /* Sanity checking to avoid pcp imbalance */
5856 if (percpu_pagelist_fraction && 5856 if (percpu_pagelist_fraction &&
5857 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 5857 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5858 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 5858 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5859 ret = -EINVAL; 5859 ret = -EINVAL;
5860 goto out; 5860 goto out;
5861 } 5861 }
5862 5862
5863 /* No change? */ 5863 /* No change? */
5864 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 5864 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5865 goto out; 5865 goto out;
5866 5866
5867 for_each_populated_zone(zone) { 5867 for_each_populated_zone(zone) {
5868 unsigned int cpu; 5868 unsigned int cpu;
5869 5869
5870 for_each_possible_cpu(cpu) 5870 for_each_possible_cpu(cpu)
5871 pageset_set_high_and_batch(zone, 5871 pageset_set_high_and_batch(zone,
5872 per_cpu_ptr(zone->pageset, cpu)); 5872 per_cpu_ptr(zone->pageset, cpu));
5873 } 5873 }
5874 out: 5874 out:
5875 mutex_unlock(&pcp_batch_high_lock); 5875 mutex_unlock(&pcp_batch_high_lock);
5876 return ret; 5876 return ret;
5877 } 5877 }
5878 5878
5879 int hashdist = HASHDIST_DEFAULT; 5879 int hashdist = HASHDIST_DEFAULT;
5880 5880
5881 #ifdef CONFIG_NUMA 5881 #ifdef CONFIG_NUMA
5882 static int __init set_hashdist(char *str) 5882 static int __init set_hashdist(char *str)
5883 { 5883 {
5884 if (!str) 5884 if (!str)
5885 return 0; 5885 return 0;
5886 hashdist = simple_strtoul(str, &str, 0); 5886 hashdist = simple_strtoul(str, &str, 0);
5887 return 1; 5887 return 1;
5888 } 5888 }
5889 __setup("hashdist=", set_hashdist); 5889 __setup("hashdist=", set_hashdist);
5890 #endif 5890 #endif
5891 5891
5892 /* 5892 /*
5893 * allocate a large system hash table from bootmem 5893 * allocate a large system hash table from bootmem
5894 * - it is assumed that the hash table must contain an exact power-of-2 5894 * - it is assumed that the hash table must contain an exact power-of-2
5895 * quantity of entries 5895 * quantity of entries
5896 * - limit is the number of hash buckets, not the total allocation size 5896 * - limit is the number of hash buckets, not the total allocation size
5897 */ 5897 */
5898 void *__init alloc_large_system_hash(const char *tablename, 5898 void *__init alloc_large_system_hash(const char *tablename,
5899 unsigned long bucketsize, 5899 unsigned long bucketsize,
5900 unsigned long numentries, 5900 unsigned long numentries,
5901 int scale, 5901 int scale,
5902 int flags, 5902 int flags,
5903 unsigned int *_hash_shift, 5903 unsigned int *_hash_shift,
5904 unsigned int *_hash_mask, 5904 unsigned int *_hash_mask,
5905 unsigned long low_limit, 5905 unsigned long low_limit,
5906 unsigned long high_limit) 5906 unsigned long high_limit)
5907 { 5907 {
5908 unsigned long long max = high_limit; 5908 unsigned long long max = high_limit;
5909 unsigned long log2qty, size; 5909 unsigned long log2qty, size;
5910 void *table = NULL; 5910 void *table = NULL;
5911 5911
5912 /* allow the kernel cmdline to have a say */ 5912 /* allow the kernel cmdline to have a say */
5913 if (!numentries) { 5913 if (!numentries) {
5914 /* round applicable memory size up to nearest megabyte */ 5914 /* round applicable memory size up to nearest megabyte */
5915 numentries = nr_kernel_pages; 5915 numentries = nr_kernel_pages;
5916 5916
5917 /* It isn't necessary when PAGE_SIZE >= 1MB */ 5917 /* It isn't necessary when PAGE_SIZE >= 1MB */
5918 if (PAGE_SHIFT < 20) 5918 if (PAGE_SHIFT < 20)
5919 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5919 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5920 5920
5921 /* limit to 1 bucket per 2^scale bytes of low memory */ 5921 /* limit to 1 bucket per 2^scale bytes of low memory */
5922 if (scale > PAGE_SHIFT) 5922 if (scale > PAGE_SHIFT)
5923 numentries >>= (scale - PAGE_SHIFT); 5923 numentries >>= (scale - PAGE_SHIFT);
5924 else 5924 else
5925 numentries <<= (PAGE_SHIFT - scale); 5925 numentries <<= (PAGE_SHIFT - scale);
5926 5926
5927 /* Make sure we've got at least a 0-order allocation.. */ 5927 /* Make sure we've got at least a 0-order allocation.. */
5928 if (unlikely(flags & HASH_SMALL)) { 5928 if (unlikely(flags & HASH_SMALL)) {
5929 /* Makes no sense without HASH_EARLY */ 5929 /* Makes no sense without HASH_EARLY */
5930 WARN_ON(!(flags & HASH_EARLY)); 5930 WARN_ON(!(flags & HASH_EARLY));
5931 if (!(numentries >> *_hash_shift)) { 5931 if (!(numentries >> *_hash_shift)) {
5932 numentries = 1UL << *_hash_shift; 5932 numentries = 1UL << *_hash_shift;
5933 BUG_ON(!numentries); 5933 BUG_ON(!numentries);
5934 } 5934 }
5935 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5935 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5936 numentries = PAGE_SIZE / bucketsize; 5936 numentries = PAGE_SIZE / bucketsize;
5937 } 5937 }
5938 numentries = roundup_pow_of_two(numentries); 5938 numentries = roundup_pow_of_two(numentries);
5939 5939
5940 /* limit allocation size to 1/16 total memory by default */ 5940 /* limit allocation size to 1/16 total memory by default */
5941 if (max == 0) { 5941 if (max == 0) {
5942 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5942 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5943 do_div(max, bucketsize); 5943 do_div(max, bucketsize);
5944 } 5944 }
5945 max = min(max, 0x80000000ULL); 5945 max = min(max, 0x80000000ULL);
5946 5946
5947 if (numentries < low_limit) 5947 if (numentries < low_limit)
5948 numentries = low_limit; 5948 numentries = low_limit;
5949 if (numentries > max) 5949 if (numentries > max)
5950 numentries = max; 5950 numentries = max;
5951 5951
5952 log2qty = ilog2(numentries); 5952 log2qty = ilog2(numentries);
5953 5953
5954 do { 5954 do {
5955 size = bucketsize << log2qty; 5955 size = bucketsize << log2qty;
5956 if (flags & HASH_EARLY) 5956 if (flags & HASH_EARLY)
5957 table = alloc_bootmem_nopanic(size); 5957 table = alloc_bootmem_nopanic(size);
5958 else if (hashdist) 5958 else if (hashdist)
5959 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5959 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5960 else { 5960 else {
5961 /* 5961 /*
5962 * If bucketsize is not a power-of-two, we may free 5962 * If bucketsize is not a power-of-two, we may free
5963 * some pages at the end of the hash table, which 5963 * some pages at the end of the hash table, which
5964 * alloc_pages_exact() automatically does 5964 * alloc_pages_exact() automatically does
5965 */ 5965 */
5966 if (get_order(size) < MAX_ORDER) { 5966 if (get_order(size) < MAX_ORDER) {
5967 table = alloc_pages_exact(size, GFP_ATOMIC); 5967 table = alloc_pages_exact(size, GFP_ATOMIC);
5968 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5968 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5969 } 5969 }
5970 } 5970 }
5971 } while (!table && size > PAGE_SIZE && --log2qty); 5971 } while (!table && size > PAGE_SIZE && --log2qty);
5972 5972
5973 if (!table) 5973 if (!table)
5974 panic("Failed to allocate %s hash table\n", tablename); 5974 panic("Failed to allocate %s hash table\n", tablename);
5975 5975
5976 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5976 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5977 tablename, 5977 tablename,
5978 (1UL << log2qty), 5978 (1UL << log2qty),
5979 ilog2(size) - PAGE_SHIFT, 5979 ilog2(size) - PAGE_SHIFT,
5980 size); 5980 size);
5981 5981
5982 if (_hash_shift) 5982 if (_hash_shift)
5983 *_hash_shift = log2qty; 5983 *_hash_shift = log2qty;
5984 if (_hash_mask) 5984 if (_hash_mask)
5985 *_hash_mask = (1 << log2qty) - 1; 5985 *_hash_mask = (1 << log2qty) - 1;
5986 5986
5987 return table; 5987 return table;
5988 } 5988 }
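
Before any allocation happens, the sizing above is just integer arithmetic: scale the number of lowmem pages to one bucket per 2^scale bytes, round up to a power of two, and clamp between low_limit and roughly 1/16 of all memory divided by the bucket size. A userspace sketch of only that arithmetic, with invented machine and table parameters; it deliberately ignores the HASH_SMALL/HASH_EARLY paths and the bootmem/vmalloc fallbacks:

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
        unsigned long p = 1;

        while (p < n)
                p <<= 1;
        return p;
}

static unsigned long ilog2_ul(unsigned long n)
{
        unsigned long log = 0;

        while (n >>= 1)
                log++;
        return log;
}

int main(void)
{
        /* Assumed machine: 4K pages, 2,000,000 lowmem pages (~7.6GB). */
        const unsigned long page_shift = 12;
        const unsigned long page_size = 1UL << page_shift;
        unsigned long nr_kernel_pages = 2000000;
        unsigned long nr_all_pages = 2000000;

        /* Assumed table parameters: 8-byte buckets, 1 bucket per 2^14 bytes. */
        unsigned long bucketsize = 8;
        unsigned long scale = 14;
        unsigned long low_limit = 16;

        unsigned long numentries = nr_kernel_pages;
        unsigned long long max;
        unsigned long log2qty, size;

        /* round applicable memory size up to the nearest megabyte */
        if (page_shift < 20) {
                unsigned long per_mb = (1UL << 20) / page_size;

                numentries = (numentries + per_mb - 1) / per_mb * per_mb;
        }

        /* limit to 1 bucket per 2^scale bytes of low memory */
        if (scale > page_shift)
                numentries >>= (scale - page_shift);
        else
                numentries <<= (page_shift - scale);

        numentries = roundup_pow_of_two(numentries);

        /* limit allocation size to 1/16 of total memory */
        max = ((unsigned long long)nr_all_pages << page_shift) >> 4;
        max /= bucketsize;
        if (max > 0x80000000ULL)
                max = 0x80000000ULL;

        if (numentries < low_limit)
                numentries = low_limit;
        if (numentries > (unsigned long)max)
                numentries = (unsigned long)max;

        log2qty = ilog2_ul(numentries);
        size = bucketsize << log2qty;
        printf("hash table entries: %lu (%lu bytes total)\n", numentries, size);
        return 0;
}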
5989 5989
5990 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5990 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5991 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5991 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5992 unsigned long pfn) 5992 unsigned long pfn)
5993 { 5993 {
5994 #ifdef CONFIG_SPARSEMEM 5994 #ifdef CONFIG_SPARSEMEM
5995 return __pfn_to_section(pfn)->pageblock_flags; 5995 return __pfn_to_section(pfn)->pageblock_flags;
5996 #else 5996 #else
5997 return zone->pageblock_flags; 5997 return zone->pageblock_flags;
5998 #endif /* CONFIG_SPARSEMEM */ 5998 #endif /* CONFIG_SPARSEMEM */
5999 } 5999 }
6000 6000
6001 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 6001 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
6002 { 6002 {
6003 #ifdef CONFIG_SPARSEMEM 6003 #ifdef CONFIG_SPARSEMEM
6004 pfn &= (PAGES_PER_SECTION-1); 6004 pfn &= (PAGES_PER_SECTION-1);
6005 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6005 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
6006 #else 6006 #else
6007 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 6007 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
6008 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6008 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
6009 #endif /* CONFIG_SPARSEMEM */ 6009 #endif /* CONFIG_SPARSEMEM */
6010 } 6010 }
6011 6011
6012 /** 6012 /**
6013 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 6013 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
6014 * @page: The page within the block of interest 6014 * @page: The page within the block of interest
6015 * @pfn: The target page frame number 6015 * @pfn: The target page frame number
6016 * @end_bitidx: The last bit of interest 6016 * @end_bitidx: The last bit of interest
6017 * returns pageblock_bits flags 6017 * returns pageblock_bits flags
6018 */ 6018 */
6019 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, 6019 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
6020 unsigned long end_bitidx, 6020 unsigned long end_bitidx,
6021 unsigned long mask) 6021 unsigned long mask)
6022 { 6022 {
6023 struct zone *zone; 6023 struct zone *zone;
6024 unsigned long *bitmap; 6024 unsigned long *bitmap;
6025 unsigned long bitidx, word_bitidx; 6025 unsigned long bitidx, word_bitidx;
6026 unsigned long word; 6026 unsigned long word;
6027 6027
6028 zone = page_zone(page); 6028 zone = page_zone(page);
6029 bitmap = get_pageblock_bitmap(zone, pfn); 6029 bitmap = get_pageblock_bitmap(zone, pfn);
6030 bitidx = pfn_to_bitidx(zone, pfn); 6030 bitidx = pfn_to_bitidx(zone, pfn);
6031 word_bitidx = bitidx / BITS_PER_LONG; 6031 word_bitidx = bitidx / BITS_PER_LONG;
6032 bitidx &= (BITS_PER_LONG-1); 6032 bitidx &= (BITS_PER_LONG-1);
6033 6033
6034 word = bitmap[word_bitidx]; 6034 word = bitmap[word_bitidx];
6035 bitidx += end_bitidx; 6035 bitidx += end_bitidx;
6036 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 6036 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6037 } 6037 }
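
Each pageblock owns NR_PAGEBLOCK_BITS (4) bits in the bitmap, and the shift in the return statement above extracts such a group from a single long, counting positions from the most-significant end. The round-trip below reuses the same shift expressions as the get/set helpers on one word, with assumed values (3-bit groups, 4 bits per block) purely for illustration:

#include <stdio.h>

#define BITS_PER_LONG   (8 * (int)sizeof(unsigned long))

/* Same shift scheme as the get/set helpers above, reduced to one word
 * and without the cmpxchg retry loop. */
static unsigned long get_group(unsigned long word, unsigned long bitidx,
                               unsigned long end_bitidx, unsigned long mask)
{
        bitidx += end_bitidx;
        return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}

static unsigned long set_group(unsigned long word, unsigned long flags,
                               unsigned long bitidx,
                               unsigned long end_bitidx, unsigned long mask)
{
        bitidx += end_bitidx;
        mask <<= (BITS_PER_LONG - bitidx - 1);
        flags <<= (BITS_PER_LONG - bitidx - 1);
        return (word & ~mask) | flags;
}

int main(void)
{
        /* Assumed layout: 4 bits per pageblock, a 3-bit group is read/written,
         * so end_bitidx = 2 and mask = 0x7. */
        unsigned long end_bitidx = 2, mask = 0x7;
        unsigned long word = 0;

        /* Pretend blocks 0..3 live in this word (bitidx = block * 4). */
        for (unsigned long block = 0; block < 4; block++)
                word = set_group(word, block + 1, block * 4, end_bitidx, mask);

        for (unsigned long block = 0; block < 4; block++)
                printf("block %lu -> stored value %lu\n", block,
                       get_group(word, block * 4, end_bitidx, mask));
        return 0;
}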
6038 6038
6039 /** 6039 /**
6040 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 6040 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6041 * @page: The page within the block of interest 6041 * @page: The page within the block of interest
6042 * @pfn: The target page frame number 6042 * @pfn: The target page frame number
6043 * @end_bitidx: The last bit of interest 6043 * @end_bitidx: The last bit of interest
6044 * @flags: The flags to set 6044 * @flags: The flags to set
6045 */ 6045 */
6046 void set_pfnblock_flags_mask(struct page *page, unsigned long flags, 6046 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6047 unsigned long pfn, 6047 unsigned long pfn,
6048 unsigned long end_bitidx, 6048 unsigned long end_bitidx,
6049 unsigned long mask) 6049 unsigned long mask)
6050 { 6050 {
6051 struct zone *zone; 6051 struct zone *zone;
6052 unsigned long *bitmap; 6052 unsigned long *bitmap;
6053 unsigned long bitidx, word_bitidx; 6053 unsigned long bitidx, word_bitidx;
6054 unsigned long old_word, word; 6054 unsigned long old_word, word;
6055 6055
6056 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6056 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6057 6057
6058 zone = page_zone(page); 6058 zone = page_zone(page);
6059 bitmap = get_pageblock_bitmap(zone, pfn); 6059 bitmap = get_pageblock_bitmap(zone, pfn);
6060 bitidx = pfn_to_bitidx(zone, pfn); 6060 bitidx = pfn_to_bitidx(zone, pfn);
6061 word_bitidx = bitidx / BITS_PER_LONG; 6061 word_bitidx = bitidx / BITS_PER_LONG;
6062 bitidx &= (BITS_PER_LONG-1); 6062 bitidx &= (BITS_PER_LONG-1);
6063 6063
6064 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6064 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
6065 6065
6066 bitidx += end_bitidx; 6066 bitidx += end_bitidx;
6067 mask <<= (BITS_PER_LONG - bitidx - 1); 6067 mask <<= (BITS_PER_LONG - bitidx - 1);
6068 flags <<= (BITS_PER_LONG - bitidx - 1); 6068 flags <<= (BITS_PER_LONG - bitidx - 1);
6069 6069
6070 word = ACCESS_ONCE(bitmap[word_bitidx]); 6070 word = ACCESS_ONCE(bitmap[word_bitidx]);
6071 for (;;) { 6071 for (;;) {
6072 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6072 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6073 if (word == old_word) 6073 if (word == old_word)
6074 break; 6074 break;
6075 word = old_word; 6075 word = old_word;
6076 } 6076 }
6077 } 6077 }
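
The for (;;) loop above is the usual lock-free read-modify-write: if another CPU updated the word between the read and the cmpxchg, retry with the freshly observed value. A userspace analogue using C11 atomics, offered only as an analogy (cmpxchg() and ACCESS_ONCE() are kernel primitives, not what is shown here):

#include <stdatomic.h>
#include <stdio.h>

/* Atomically replace the bits selected by mask with flags, mirroring the
 * retry loop in set_pfnblock_flags_mask(). */
static void set_bits_atomic(_Atomic unsigned long *word_p,
                            unsigned long flags, unsigned long mask)
{
        unsigned long word = atomic_load(word_p);

        for (;;) {
                unsigned long new_word = (word & ~mask) | flags;

                /* On failure, word is reloaded with the current value. */
                if (atomic_compare_exchange_weak(word_p, &word, new_word))
                        break;
        }
}

int main(void)
{
        _Atomic unsigned long word = 0xff00ff00UL;

        set_bits_atomic(&word, 0x00aa0000UL, 0x00ff0000UL);
        printf("word = %#lx\n", atomic_load(&word));    /* prints 0xffaaff00 */
        return 0;
}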
6078 6078
6079 /* 6079 /*
6080 * This function checks whether pageblock includes unmovable pages or not. 6080 * This function checks whether pageblock includes unmovable pages or not.
6081 * If @count is not zero, it is okay to include fewer than @count unmovable pages 6081 * If @count is not zero, it is okay to include fewer than @count unmovable pages
6082 * 6082 *
6083 * PageLRU check without isolation or lru_lock could race so that 6083 * PageLRU check without isolation or lru_lock could race so that
6084 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 6084 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
6085 * expect this function to be exact. 6085 * expect this function to be exact.
6086 */ 6086 */
6087 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 6087 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6088 bool skip_hwpoisoned_pages) 6088 bool skip_hwpoisoned_pages)
6089 { 6089 {
6090 unsigned long pfn, iter, found; 6090 unsigned long pfn, iter, found;
6091 int mt; 6091 int mt;
6092 6092
6093 /* 6093 /*
6094 * To avoid noisy data, lru_add_drain_all() should be called first. 6094 * To avoid noisy data, lru_add_drain_all() should be called first.
6095 * A ZONE_MOVABLE zone never contains unmovable pages. 6095 * A ZONE_MOVABLE zone never contains unmovable pages.
6096 */ 6096 */
6097 if (zone_idx(zone) == ZONE_MOVABLE) 6097 if (zone_idx(zone) == ZONE_MOVABLE)
6098 return false; 6098 return false;
6099 mt = get_pageblock_migratetype(page); 6099 mt = get_pageblock_migratetype(page);
6100 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 6100 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
6101 return false; 6101 return false;
6102 6102
6103 pfn = page_to_pfn(page); 6103 pfn = page_to_pfn(page);
6104 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 6104 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6105 unsigned long check = pfn + iter; 6105 unsigned long check = pfn + iter;
6106 6106
6107 if (!pfn_valid_within(check)) 6107 if (!pfn_valid_within(check))
6108 continue; 6108 continue;
6109 6109
6110 page = pfn_to_page(check); 6110 page = pfn_to_page(check);
6111 6111
6112 /* 6112 /*
6113 * Hugepages are not in LRU lists, but they're movable. 6113 * Hugepages are not in LRU lists, but they're movable.
6114 * We need not scan over tail pages because we don't 6114 * We need not scan over tail pages because we don't
6115 * handle each tail page individually in migration. 6115 * handle each tail page individually in migration.
6116 */ 6116 */
6117 if (PageHuge(page)) { 6117 if (PageHuge(page)) {
6118 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6118 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6119 continue; 6119 continue;
6120 } 6120 }
6121 6121
6122 /* 6122 /*
6123 * We can't use page_count without pinning the page 6123 * We can't use page_count without pinning the page
6124 * because another CPU can free the compound page. 6124 * because another CPU can free the compound page.
6125 * This check already skips compound tails of THP 6125 * This check already skips compound tails of THP
6126 * because their page->_count is zero at all times. 6126 * because their page->_count is zero at all times.
6127 */ 6127 */
6128 if (!atomic_read(&page->_count)) { 6128 if (!atomic_read(&page->_count)) {
6129 if (PageBuddy(page)) 6129 if (PageBuddy(page))
6130 iter += (1 << page_order(page)) - 1; 6130 iter += (1 << page_order(page)) - 1;
6131 continue; 6131 continue;
6132 } 6132 }
6133 6133
6134 /* 6134 /*
6135 * The HWPoisoned page may not be in the buddy system, and 6135 * The HWPoisoned page may not be in the buddy system, and
6136 * page_count() is not 0. 6136 * page_count() is not 0.
6137 */ 6137 */
6138 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6138 if (skip_hwpoisoned_pages && PageHWPoison(page))
6139 continue; 6139 continue;
6140 6140
6141 if (!PageLRU(page)) 6141 if (!PageLRU(page))
6142 found++; 6142 found++;
6143 /* 6143 /*
6144 * If there are RECLAIMABLE pages, we need to check them. 6144 * If there are RECLAIMABLE pages, we need to check them.
6145 * But for now, memory offline itself doesn't call shrink_slab() 6145 * But for now, memory offline itself doesn't call shrink_slab()
6146 * and this still needs to be fixed. 6146 * and this still needs to be fixed.
6147 */ 6147 */
6148 /* 6148 /*
6149 * If the page is not RAM, page_count() should be 0. 6149 * If the page is not RAM, page_count() should be 0.
6150 * We don't need further checks. This is a _used_, not-movable page. 6150 * We don't need further checks. This is a _used_, not-movable page.
6151 * 6151 *
6152 * The problematic thing here is PG_reserved pages. PG_reserved 6152 * The problematic thing here is PG_reserved pages. PG_reserved
6153 * is set on both a memory hole page and a _used_ kernel 6153 * is set on both a memory hole page and a _used_ kernel
6154 * page at boot. 6154 * page at boot.
6155 */ 6155 */
6156 if (found > count) 6156 if (found > count)
6157 return true; 6157 return true;
6158 } 6158 }
6159 return false; 6159 return false;
6160 } 6160 }
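
Two of the continue paths above step iter past an entire block instead of one pfn at a time: round_up(iter + 1, 1 << compound_order) - 1 lands on the last pfn of the aligned block a huge page spans, and iter += (1 << page_order) - 1 does the same for a free buddy block. A toy loop demonstrating just that jump arithmetic (the orders and positions are made up):

#include <stdio.h>

static unsigned long round_up_pow2(unsigned long x, unsigned long align)
{
        return (x + align - 1) & ~(align - 1);
}

int main(void)
{
        unsigned long pageblock_nr_pages = 512; /* assumed: 2MB blocks, 4K pages */
        unsigned long iter;

        for (iter = 0; iter < pageblock_nr_pages; iter++) {
                if (iter == 5) {
                        /* Pretend pfn+5 lands inside an order-3 huge page:
                         * skip to the last pfn of that aligned 8-page block. */
                        unsigned long order = 3;

                        printf("huge page at iter %lu, ", iter);
                        iter = round_up_pow2(iter + 1, 1UL << order) - 1;
                        printf("resuming after iter %lu\n", iter);
                        continue;
                }
                if (iter == 64) {
                        /* Pretend pfn+64 starts a free order-5 buddy block. */
                        unsigned long order = 5;

                        printf("buddy block at iter %lu, skipping %lu pages\n",
                               iter, 1UL << order);
                        iter += (1UL << order) - 1;
                        continue;
                }
                /* ...ordinary per-page checks would go here... */
        }
        return 0;
}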
6161 6161
6162 bool is_pageblock_removable_nolock(struct page *page) 6162 bool is_pageblock_removable_nolock(struct page *page)
6163 { 6163 {
6164 struct zone *zone; 6164 struct zone *zone;
6165 unsigned long pfn; 6165 unsigned long pfn;
6166 6166
6167 /* 6167 /*
6168 * We have to be careful here because we are iterating over memory 6168 * We have to be careful here because we are iterating over memory
6169 * sections which are not zone aware so we might end up outside of 6169 * sections which are not zone aware so we might end up outside of
6170 * the zone but still within the section. 6170 * the zone but still within the section.
6171 * We have to take care about the node as well. If the node is offline 6171 * We have to take care about the node as well. If the node is offline
6172 * its NODE_DATA will be NULL - see page_zone. 6172 * its NODE_DATA will be NULL - see page_zone.
6173 */ 6173 */
6174 if (!node_online(page_to_nid(page))) 6174 if (!node_online(page_to_nid(page)))
6175 return false; 6175 return false;
6176 6176
6177 zone = page_zone(page); 6177 zone = page_zone(page);
6178 pfn = page_to_pfn(page); 6178 pfn = page_to_pfn(page);
6179 if (!zone_spans_pfn(zone, pfn)) 6179 if (!zone_spans_pfn(zone, pfn))
6180 return false; 6180 return false;
6181 6181
6182 return !has_unmovable_pages(zone, page, 0, true); 6182 return !has_unmovable_pages(zone, page, 0, true);
6183 } 6183 }
6184 6184
6185 #ifdef CONFIG_CMA 6185 #ifdef CONFIG_CMA
6186 6186
6187 static unsigned long pfn_max_align_down(unsigned long pfn) 6187 static unsigned long pfn_max_align_down(unsigned long pfn)
6188 { 6188 {
6189 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6189 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6190 pageblock_nr_pages) - 1); 6190 pageblock_nr_pages) - 1);
6191 } 6191 }
6192 6192
6193 static unsigned long pfn_max_align_up(unsigned long pfn) 6193 static unsigned long pfn_max_align_up(unsigned long pfn)
6194 { 6194 {
6195 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6195 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6196 pageblock_nr_pages)); 6196 pageblock_nr_pages));
6197 } 6197 }
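
Both helpers align to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages, masking down for the start and rounding up for the end. With the common x86-64 configuration assumed below (4K pages, MAX_ORDER = 11, 2MB pageblocks) that alignment is 1024 pages; the constants are assumptions, not values from this file:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES      1024UL  /* assumed: MAX_ORDER = 11 */
#define PAGEBLOCK_NR_PAGES      512UL   /* assumed: 2MB blocks, 4K pages */

static unsigned long align_nr(void)
{
        return MAX_ORDER_NR_PAGES > PAGEBLOCK_NR_PAGES ?
               MAX_ORDER_NR_PAGES : PAGEBLOCK_NR_PAGES;
}

static unsigned long pfn_max_align_down(unsigned long pfn)
{
        return pfn & ~(align_nr() - 1);
}

static unsigned long pfn_max_align_up(unsigned long pfn)
{
        unsigned long a = align_nr();

        return (pfn + a - 1) & ~(a - 1);        /* ALIGN(pfn, a) */
}

int main(void)
{
        unsigned long start = 123456, end = 130000;

        printf("[%lu, %lu) isolates [%lu, %lu)\n", start, end,
               pfn_max_align_down(start), pfn_max_align_up(end));
        return 0;
}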
6198 6198
6199 /* [start, end) must belong to a single zone. */ 6199 /* [start, end) must belong to a single zone. */
6200 static int __alloc_contig_migrate_range(struct compact_control *cc, 6200 static int __alloc_contig_migrate_range(struct compact_control *cc,
6201 unsigned long start, unsigned long end) 6201 unsigned long start, unsigned long end)
6202 { 6202 {
6203 /* This function is based on compact_zone() from compaction.c. */ 6203 /* This function is based on compact_zone() from compaction.c. */
6204 unsigned long nr_reclaimed; 6204 unsigned long nr_reclaimed;
6205 unsigned long pfn = start; 6205 unsigned long pfn = start;
6206 unsigned int tries = 0; 6206 unsigned int tries = 0;
6207 int ret = 0; 6207 int ret = 0;
6208 6208
6209 migrate_prep(); 6209 migrate_prep();
6210 6210
6211 while (pfn < end || !list_empty(&cc->migratepages)) { 6211 while (pfn < end || !list_empty(&cc->migratepages)) {
6212 if (fatal_signal_pending(current)) { 6212 if (fatal_signal_pending(current)) {
6213 ret = -EINTR; 6213 ret = -EINTR;
6214 break; 6214 break;
6215 } 6215 }
6216 6216
6217 if (list_empty(&cc->migratepages)) { 6217 if (list_empty(&cc->migratepages)) {
6218 cc->nr_migratepages = 0; 6218 cc->nr_migratepages = 0;
6219 pfn = isolate_migratepages_range(cc->zone, cc, 6219 pfn = isolate_migratepages_range(cc->zone, cc,
6220 pfn, end, true); 6220 pfn, end, true);
6221 if (!pfn) { 6221 if (!pfn) {
6222 ret = -EINTR; 6222 ret = -EINTR;
6223 break; 6223 break;
6224 } 6224 }
6225 tries = 0; 6225 tries = 0;
6226 } else if (++tries == 5) { 6226 } else if (++tries == 5) {
6227 ret = ret < 0 ? ret : -EBUSY; 6227 ret = ret < 0 ? ret : -EBUSY;
6228 break; 6228 break;
6229 } 6229 }
6230 6230
6231 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6231 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6232 &cc->migratepages); 6232 &cc->migratepages);
6233 cc->nr_migratepages -= nr_reclaimed; 6233 cc->nr_migratepages -= nr_reclaimed;
6234 6234
6235 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6235 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6236 NULL, 0, cc->mode, MR_CMA); 6236 NULL, 0, cc->mode, MR_CMA);
6237 } 6237 }
6238 if (ret < 0) { 6238 if (ret < 0) {
6239 putback_movable_pages(&cc->migratepages); 6239 putback_movable_pages(&cc->migratepages);
6240 return ret; 6240 return ret;
6241 } 6241 }
6242 return 0; 6242 return 0;
6243 } 6243 }
6244 6244
6245 /** 6245 /**
6246 * alloc_contig_range() -- tries to allocate given range of pages 6246 * alloc_contig_range() -- tries to allocate given range of pages
6247 * @start: start PFN to allocate 6247 * @start: start PFN to allocate
6248 * @end: one-past-the-last PFN to allocate 6248 * @end: one-past-the-last PFN to allocate
6249 * @migratetype: migratetype of the underlying pageblocks (either 6249 * @migratetype: migratetype of the underlying pageblocks (either
6250 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6250 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6251 * in range must have the same migratetype and it must 6251 * in range must have the same migratetype and it must
6252 * be either of the two. 6252 * be either of the two.
6253 * 6253 *
6254 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6254 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6255 * aligned, however it's the caller's responsibility to guarantee that 6255 * aligned, however it's the caller's responsibility to guarantee that
6256 * we are the only thread that changes migrate type of pageblocks the 6256 * we are the only thread that changes migrate type of pageblocks the
6257 * pages fall in. 6257 * pages fall in.
6258 * 6258 *
6259 * The PFN range must belong to a single zone. 6259 * The PFN range must belong to a single zone.
6260 * 6260 *
6261 * Returns zero on success or negative error code. On success all 6261 * Returns zero on success or negative error code. On success all
6262 * pages whose PFN is in [start, end) are allocated for the caller and 6262 * pages whose PFN is in [start, end) are allocated for the caller and
6263 * need to be freed with free_contig_range(). 6263 * need to be freed with free_contig_range().
6264 */ 6264 */
6265 int alloc_contig_range(unsigned long start, unsigned long end, 6265 int alloc_contig_range(unsigned long start, unsigned long end,
6266 unsigned migratetype) 6266 unsigned migratetype)
6267 { 6267 {
6268 unsigned long outer_start, outer_end; 6268 unsigned long outer_start, outer_end;
6269 int ret = 0, order; 6269 int ret = 0, order;
6270 6270
6271 struct compact_control cc = { 6271 struct compact_control cc = {
6272 .nr_migratepages = 0, 6272 .nr_migratepages = 0,
6273 .order = -1, 6273 .order = -1,
6274 .zone = page_zone(pfn_to_page(start)), 6274 .zone = page_zone(pfn_to_page(start)),
6275 .mode = MIGRATE_SYNC, 6275 .mode = MIGRATE_SYNC,
6276 .ignore_skip_hint = true, 6276 .ignore_skip_hint = true,
6277 }; 6277 };
6278 INIT_LIST_HEAD(&cc.migratepages); 6278 INIT_LIST_HEAD(&cc.migratepages);
6279 6279
6280 /* 6280 /*
6281 * What we do here is we mark all pageblocks in range as 6281 * What we do here is we mark all pageblocks in range as
6282 * MIGRATE_ISOLATE. Because pageblock and max order pages may 6282 * MIGRATE_ISOLATE. Because pageblock and max order pages may
6283 * have different sizes, and due to the way the page allocator 6283 * have different sizes, and due to the way the page allocator
6284 * works, we align the range to the biggest of the two pages so 6284 * works, we align the range to the biggest of the two pages so
6285 * that page allocator won't try to merge buddies from 6285 * that page allocator won't try to merge buddies from
6286 * different pageblocks and change MIGRATE_ISOLATE to some 6286 * different pageblocks and change MIGRATE_ISOLATE to some
6287 * other migration type. 6287 * other migration type.
6288 * 6288 *
6289 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6289 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6290 * migrate the pages from an unaligned range (ie. pages that 6290 * migrate the pages from an unaligned range (ie. pages that
6291 * we are interested in). This will put all the pages in 6291 * we are interested in). This will put all the pages in
6292 * range back to page allocator as MIGRATE_ISOLATE. 6292 * range back to page allocator as MIGRATE_ISOLATE.
6293 * 6293 *
6294 * When this is done, we take the pages in range from page 6294 * When this is done, we take the pages in range from page
6295 * allocator removing them from the buddy system. This way 6295 * allocator removing them from the buddy system. This way
6296 * page allocator will never consider using them. 6296 * page allocator will never consider using them.
6297 * 6297 *
6298 * This lets us mark the pageblocks back as 6298 * This lets us mark the pageblocks back as
6299 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6299 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6300 * aligned range but not in the unaligned, original range are 6300 * aligned range but not in the unaligned, original range are
6301 * put back to page allocator so that buddy can use them. 6301 * put back to page allocator so that buddy can use them.
6302 */ 6302 */
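	/*
	 * A worked example of the alignment above, assuming
	 * pfn_max_align_down()/pfn_max_align_up() round out to the larger
	 * of MAX_ORDER_NR_PAGES and pageblock_nr_pages: on a common x86_64
	 * configuration (MAX_ORDER == 11, pageblock_order == 9) that is
	 * 1024 pages, so a request for pfns [1000, 1300) isolates the
	 * aligned range [0, 2048), migrates and allocates only
	 * [1000, 1300), and free pages elsewhere in the aligned range stay
	 * with (or go back to) the page allocator once their original
	 * migratetype is restored.
	 */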
6303 6303
6304 ret = start_isolate_page_range(pfn_max_align_down(start), 6304 ret = start_isolate_page_range(pfn_max_align_down(start),
6305 pfn_max_align_up(end), migratetype, 6305 pfn_max_align_up(end), migratetype,
6306 false); 6306 false);
6307 if (ret) 6307 if (ret)
6308 return ret; 6308 return ret;
6309 6309
6310 ret = __alloc_contig_migrate_range(&cc, start, end); 6310 ret = __alloc_contig_migrate_range(&cc, start, end);
6311 if (ret) 6311 if (ret)
6312 goto done; 6312 goto done;
6313 6313
6314 /* 6314 /*
6315 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 6315 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
6316 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6316 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6317 * more, all pages in [start, end) are free in page allocator. 6317 * more, all pages in [start, end) are free in page allocator.
6318 * What we are going to do is to allocate all pages from 6318 * What we are going to do is to allocate all pages from
6319 * [start, end) (that is remove them from page allocator). 6319 * [start, end) (that is remove them from page allocator).
6320 * 6320 *
6321 * The only problem is that pages at the beginning and at the 6321 * The only problem is that pages at the beginning and at the
6322 * end of the interesting range may not be aligned with pages that 6322 * end of the interesting range may not be aligned with pages that
6323 * page allocator holds, ie. they can be part of higher order 6323 * page allocator holds, ie. they can be part of higher order
6324 * pages. Because of this, we reserve the bigger range and 6324 * pages. Because of this, we reserve the bigger range and
6325 * once this is done free the pages we are not interested in. 6325 * once this is done free the pages we are not interested in.
6326 * 6326 *
6327 * We don't have to hold zone->lock here because the pages are 6327 * We don't have to hold zone->lock here because the pages are
6328 * isolated thus they won't get removed from buddy. 6328 * isolated thus they won't get removed from buddy.
6329 */ 6329 */
6330 6330
6331 lru_add_drain_all(); 6331 lru_add_drain_all();
6332 drain_all_pages(); 6332 drain_all_pages();
6333 6333
6334 order = 0; 6334 order = 0;
6335 outer_start = start; 6335 outer_start = start;
6336 while (!PageBuddy(pfn_to_page(outer_start))) { 6336 while (!PageBuddy(pfn_to_page(outer_start))) {
6337 if (++order >= MAX_ORDER) { 6337 if (++order >= MAX_ORDER) {
6338 ret = -EBUSY; 6338 ret = -EBUSY;
6339 goto done; 6339 goto done;
6340 } 6340 }
6341 outer_start &= ~0UL << order; 6341 outer_start &= ~0UL << order;
6342 } 6342 }
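	/*
	 * Example of the masking above: if start is pfn 0x12345 and the
	 * free buddy covering it is the order-3 page at pfn 0x12340, the
	 * loop clears one more low bit per pass (0x12344, 0x12344,
	 * 0x12340) until pfn_to_page(outer_start) is PageBuddy, i.e.
	 * outer_start lands on the head of the free buddy that contains
	 * start.
	 */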
6343 6343
6344 /* Make sure the range is really isolated. */ 6344 /* Make sure the range is really isolated. */
6345 if (test_pages_isolated(outer_start, end, false)) { 6345 if (test_pages_isolated(outer_start, end, false)) {
6346 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6346 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
6347 outer_start, end); 6347 outer_start, end);
6348 ret = -EBUSY; 6348 ret = -EBUSY;
6349 goto done; 6349 goto done;
6350 } 6350 }
6351 6351
6352 6352
6353 /* Grab isolated pages from freelists. */ 6353 /* Grab isolated pages from freelists. */
6354 outer_end = isolate_freepages_range(&cc, outer_start, end); 6354 outer_end = isolate_freepages_range(&cc, outer_start, end);
6355 if (!outer_end) { 6355 if (!outer_end) {
6356 ret = -EBUSY; 6356 ret = -EBUSY;
6357 goto done; 6357 goto done;
6358 } 6358 }
6359 6359
6360 /* Free head and tail (if any) */ 6360 /* Free head and tail (if any) */
6361 if (start != outer_start) 6361 if (start != outer_start)
6362 free_contig_range(outer_start, start - outer_start); 6362 free_contig_range(outer_start, start - outer_start);
6363 if (end != outer_end) 6363 if (end != outer_end)
6364 free_contig_range(end, outer_end - end); 6364 free_contig_range(end, outer_end - end);
6365 6365
6366 done: 6366 done:
6367 undo_isolate_page_range(pfn_max_align_down(start), 6367 undo_isolate_page_range(pfn_max_align_down(start),
6368 pfn_max_align_up(end), migratetype); 6368 pfn_max_align_up(end), migratetype);
6369 return ret; 6369 return ret;
6370 } 6370 }
6371 6371
6372 void free_contig_range(unsigned long pfn, unsigned nr_pages) 6372 void free_contig_range(unsigned long pfn, unsigned nr_pages)
6373 { 6373 {
6374 unsigned int count = 0; 6374 unsigned int count = 0;
6375 6375
6376 for (; nr_pages--; pfn++) { 6376 for (; nr_pages--; pfn++) {
6377 struct page *page = pfn_to_page(pfn); 6377 struct page *page = pfn_to_page(pfn);
6378 6378
6379 count += page_count(page) != 1; 6379 count += page_count(page) != 1;
6380 __free_page(page); 6380 __free_page(page);
6381 } 6381 }
6382 WARN(count != 0, "%d pages are still in use!\n", count); 6382 WARN(count != 0, "%d pages are still in use!\n", count);
6383 } 6383 }
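/*
 * A minimal, hypothetical caller of the two functions above; base_pfn
 * and count are illustrative names, not taken from this file:
 *
 *	unsigned long base_pfn = 0x40000;	(start of a MIGRATE_CMA area)
 *	unsigned long count = 512;
 *	int err;
 *
 *	err = alloc_contig_range(base_pfn, base_pfn + count, MIGRATE_CMA);
 *	if (err)
 *		return err;			(typically -EBUSY or -EINTR)
 *
 *	(use pfn_to_page(base_pfn) .. pfn_to_page(base_pfn + count - 1))
 *
 *	free_contig_range(base_pfn, count);
 */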
6384 #endif 6384 #endif
6385 6385
6386 #ifdef CONFIG_MEMORY_HOTPLUG 6386 #ifdef CONFIG_MEMORY_HOTPLUG
6387 /* 6387 /*
6388 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6388 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6389 * page high values need to be recalculated. 6389 * page high values need to be recalculated.
6390 */ 6390 */
6391 void __meminit zone_pcp_update(struct zone *zone) 6391 void __meminit zone_pcp_update(struct zone *zone)
6392 { 6392 {
6393 unsigned cpu; 6393 unsigned cpu;
6394 mutex_lock(&pcp_batch_high_lock); 6394 mutex_lock(&pcp_batch_high_lock);
6395 for_each_possible_cpu(cpu) 6395 for_each_possible_cpu(cpu)
6396 pageset_set_high_and_batch(zone, 6396 pageset_set_high_and_batch(zone,
6397 per_cpu_ptr(zone->pageset, cpu)); 6397 per_cpu_ptr(zone->pageset, cpu));
6398 mutex_unlock(&pcp_batch_high_lock); 6398 mutex_unlock(&pcp_batch_high_lock);
6399 } 6399 }
6400 #endif 6400 #endif
6401 6401
6402 void zone_pcp_reset(struct zone *zone) 6402 void zone_pcp_reset(struct zone *zone)
6403 { 6403 {
6404 unsigned long flags; 6404 unsigned long flags;
6405 int cpu; 6405 int cpu;
6406 struct per_cpu_pageset *pset; 6406 struct per_cpu_pageset *pset;
6407 6407
6408 /* avoid races with drain_pages() */ 6408 /* avoid races with drain_pages() */
6409 local_irq_save(flags); 6409 local_irq_save(flags);
6410 if (zone->pageset != &boot_pageset) { 6410 if (zone->pageset != &boot_pageset) {
6411 for_each_online_cpu(cpu) { 6411 for_each_online_cpu(cpu) {
6412 pset = per_cpu_ptr(zone->pageset, cpu); 6412 pset = per_cpu_ptr(zone->pageset, cpu);
6413 drain_zonestat(zone, pset); 6413 drain_zonestat(zone, pset);
6414 } 6414 }
6415 free_percpu(zone->pageset); 6415 free_percpu(zone->pageset);
6416 zone->pageset = &boot_pageset; 6416 zone->pageset = &boot_pageset;
6417 } 6417 }
6418 local_irq_restore(flags); 6418 local_irq_restore(flags);
6419 } 6419 }
6420 6420
6421 #ifdef CONFIG_MEMORY_HOTREMOVE 6421 #ifdef CONFIG_MEMORY_HOTREMOVE
6422 /* 6422 /*
6423 * All pages in the range must be isolated before calling this. 6423 * All pages in the range must be isolated before calling this.
6424 */ 6424 */
6425 void 6425 void
6426 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6426 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6427 { 6427 {
6428 struct page *page; 6428 struct page *page;
6429 struct zone *zone; 6429 struct zone *zone;
6430 unsigned int order, i; 6430 unsigned int order, i;
6431 unsigned long pfn; 6431 unsigned long pfn;
6432 unsigned long flags; 6432 unsigned long flags;
6433 /* find the first valid pfn */ 6433 /* find the first valid pfn */
6434 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6434 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6435 if (pfn_valid(pfn)) 6435 if (pfn_valid(pfn))
6436 break; 6436 break;
6437 if (pfn == end_pfn) 6437 if (pfn == end_pfn)
6438 return; 6438 return;
6439 zone = page_zone(pfn_to_page(pfn)); 6439 zone = page_zone(pfn_to_page(pfn));
6440 spin_lock_irqsave(&zone->lock, flags); 6440 spin_lock_irqsave(&zone->lock, flags);
6441 pfn = start_pfn; 6441 pfn = start_pfn;
6442 while (pfn < end_pfn) { 6442 while (pfn < end_pfn) {
6443 if (!pfn_valid(pfn)) { 6443 if (!pfn_valid(pfn)) {
6444 pfn++; 6444 pfn++;
6445 continue; 6445 continue;
6446 } 6446 }
6447 page = pfn_to_page(pfn); 6447 page = pfn_to_page(pfn);
6448 /* 6448 /*
6449 * The HWPoisoned page may not be in the buddy system, and 6449 * The HWPoisoned page may not be in the buddy system, and
6450 * page_count() is not 0. 6450 * page_count() is not 0.
6451 */ 6451 */
6452 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6452 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6453 pfn++; 6453 pfn++;
6454 SetPageReserved(page); 6454 SetPageReserved(page);
6455 continue; 6455 continue;
6456 } 6456 }
6457 6457
6458 BUG_ON(page_count(page)); 6458 BUG_ON(page_count(page));
6459 BUG_ON(!PageBuddy(page)); 6459 BUG_ON(!PageBuddy(page));
6460 order = page_order(page); 6460 order = page_order(page);
6461 #ifdef CONFIG_DEBUG_VM 6461 #ifdef CONFIG_DEBUG_VM
6462 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6462 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6463 pfn, 1 << order, end_pfn); 6463 pfn, 1 << order, end_pfn);
6464 #endif 6464 #endif
6465 list_del(&page->lru); 6465 list_del(&page->lru);
6466 rmv_page_order(page); 6466 rmv_page_order(page);
6467 zone->free_area[order].nr_free--; 6467 zone->free_area[order].nr_free--;
6468 for (i = 0; i < (1 << order); i++) 6468 for (i = 0; i < (1 << order); i++)
6469 SetPageReserved((page+i)); 6469 SetPageReserved((page+i));
6470 pfn += (1 << order); 6470 pfn += (1 << order);
6471 } 6471 }
6472 spin_unlock_irqrestore(&zone->lock, flags); 6472 spin_unlock_irqrestore(&zone->lock, flags);
6473 } 6473 }
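/*
 * Summary of the walk above: with zone->lock held, every free buddy in
 * [start_pfn, end_pfn) is unlinked from its free list, all 1 << order
 * of its pages are marked PageReserved, and the scan advances by
 * 1 << order pfns; HWPoisoned pages that never entered the buddy
 * system are simply marked reserved and skipped one pfn at a time.
 */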
6474 #endif 6474 #endif
6475 6475
6476 #ifdef CONFIG_MEMORY_FAILURE 6476 #ifdef CONFIG_MEMORY_FAILURE
6477 bool is_free_buddy_page(struct page *page) 6477 bool is_free_buddy_page(struct page *page)
6478 { 6478 {
6479 struct zone *zone = page_zone(page); 6479 struct zone *zone = page_zone(page);
6480 unsigned long pfn = page_to_pfn(page); 6480 unsigned long pfn = page_to_pfn(page);
6481 unsigned long flags; 6481 unsigned long flags;
6482 unsigned int order; 6482 unsigned int order;
6483 6483
6484 spin_lock_irqsave(&zone->lock, flags); 6484 spin_lock_irqsave(&zone->lock, flags);
6485 for (order = 0; order < MAX_ORDER; order++) { 6485 for (order = 0; order < MAX_ORDER; order++) {
6486 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6486 struct page *page_head = page - (pfn & ((1 << order) - 1));
6487 6487
6488 if (PageBuddy(page_head) && page_order(page_head) >= order) 6488 if (PageBuddy(page_head) && page_order(page_head) >= order)
6489 break; 6489 break;
6490 } 6490 }
6491 spin_unlock_irqrestore(&zone->lock, flags); 6491 spin_unlock_irqrestore(&zone->lock, flags);
6492 6492
6493 return order < MAX_ORDER; 6493 return order < MAX_ORDER;
6494 } 6494 }
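/*
 * Example of the head arithmetic above: for pfn 0x1234 checked at
 * order 3, pfn & ((1 << 3) - 1) is 4, so page_head is the page at pfn
 * 0x1230; if that page is a free buddy of order >= 3 it spans
 * [0x1230, 0x1238) and therefore contains pfn 0x1234, so the loop
 * stops and the page is reported as part of a free buddy.
 */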
6495 #endif 6495 #endif
6496 6496
6497 static const struct trace_print_flags pageflag_names[] = { 6497 static const struct trace_print_flags pageflag_names[] = {
6498 {1UL << PG_locked, "locked" }, 6498 {1UL << PG_locked, "locked" },
6499 {1UL << PG_error, "error" }, 6499 {1UL << PG_error, "error" },
6500 {1UL << PG_referenced, "referenced" }, 6500 {1UL << PG_referenced, "referenced" },
6501 {1UL << PG_uptodate, "uptodate" }, 6501 {1UL << PG_uptodate, "uptodate" },
6502 {1UL << PG_dirty, "dirty" }, 6502 {1UL << PG_dirty, "dirty" },
6503 {1UL << PG_lru, "lru" }, 6503 {1UL << PG_lru, "lru" },
6504 {1UL << PG_active, "active" }, 6504 {1UL << PG_active, "active" },
6505 {1UL << PG_slab, "slab" }, 6505 {1UL << PG_slab, "slab" },
6506 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6506 {1UL << PG_owner_priv_1, "owner_priv_1" },
6507 {1UL << PG_arch_1, "arch_1" }, 6507 {1UL << PG_arch_1, "arch_1" },
6508 {1UL << PG_reserved, "reserved" }, 6508 {1UL << PG_reserved, "reserved" },
6509 {1UL << PG_private, "private" }, 6509 {1UL << PG_private, "private" },
6510 {1UL << PG_private_2, "private_2" }, 6510 {1UL << PG_private_2, "private_2" },
6511 {1UL << PG_writeback, "writeback" }, 6511 {1UL << PG_writeback, "writeback" },
6512 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6512 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6513 {1UL << PG_head, "head" }, 6513 {1UL << PG_head, "head" },
6514 {1UL << PG_tail, "tail" }, 6514 {1UL << PG_tail, "tail" },
6515 #else 6515 #else
6516 {1UL << PG_compound, "compound" }, 6516 {1UL << PG_compound, "compound" },
6517 #endif 6517 #endif
6518 {1UL << PG_swapcache, "swapcache" }, 6518 {1UL << PG_swapcache, "swapcache" },
6519 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6519 {1UL << PG_mappedtodisk, "mappedtodisk" },
6520 {1UL << PG_reclaim, "reclaim" }, 6520 {1UL << PG_reclaim, "reclaim" },
6521 {1UL << PG_swapbacked, "swapbacked" }, 6521 {1UL << PG_swapbacked, "swapbacked" },
6522 {1UL << PG_unevictable, "unevictable" }, 6522 {1UL << PG_unevictable, "unevictable" },
6523 #ifdef CONFIG_MMU 6523 #ifdef CONFIG_MMU
6524 {1UL << PG_mlocked, "mlocked" }, 6524 {1UL << PG_mlocked, "mlocked" },
6525 #endif 6525 #endif
6526 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6526 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6527 {1UL << PG_uncached, "uncached" }, 6527 {1UL << PG_uncached, "uncached" },
6528 #endif 6528 #endif
6529 #ifdef CONFIG_MEMORY_FAILURE 6529 #ifdef CONFIG_MEMORY_FAILURE
6530 {1UL << PG_hwpoison, "hwpoison" }, 6530 {1UL << PG_hwpoison, "hwpoison" },
6531 #endif 6531 #endif
6532 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6532 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6533 {1UL << PG_compound_lock, "compound_lock" }, 6533 {1UL << PG_compound_lock, "compound_lock" },
6534 #endif 6534 #endif
6535 }; 6535 };
6536 6536
6537 static void dump_page_flags(unsigned long flags) 6537 static void dump_page_flags(unsigned long flags)
6538 { 6538 {
6539 const char *delim = ""; 6539 const char *delim = "";
6540 unsigned long mask; 6540 unsigned long mask;
6541 int i; 6541 int i;
6542 6542
6543 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6543 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6544 6544
6545 printk(KERN_ALERT "page flags: %#lx(", flags); 6545 printk(KERN_ALERT "page flags: %#lx(", flags);
6546 6546
6547 /* remove zone id */ 6547 /* remove zone id */
6548 flags &= (1UL << NR_PAGEFLAGS) - 1; 6548 flags &= (1UL << NR_PAGEFLAGS) - 1;
6549 6549
6550 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6550 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6551 6551
6552 mask = pageflag_names[i].mask; 6552 mask = pageflag_names[i].mask;
6553 if ((flags & mask) != mask) 6553 if ((flags & mask) != mask)
6554 continue; 6554 continue;
6555 6555
6556 flags &= ~mask; 6556 flags &= ~mask;
6557 printk("%s%s", delim, pageflag_names[i].name); 6557 printk("%s%s", delim, pageflag_names[i].name);
6558 delim = "|"; 6558 delim = "|";
6559 } 6559 }
6560 6560
6561 /* check for left over flags */ 6561 /* check for left over flags */
6562 if (flags) 6562 if (flags)
6563 printk("%s%#lx", delim, flags); 6563 printk("%s%#lx", delim, flags);
6564 6564
6565 printk(")\n"); 6565 printk(")\n");
6566 } 6566 }
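/*
 * Illustrative output of the helper above: for a locked, dirty page on
 * the LRU the decoded part reads "(locked|dirty|lru)" (names are
 * emitted in pageflag_names[] order), preceded by the raw flags value
 * and with any unnamed left-over bits appended inside the parentheses
 * as a hex mask.
 */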
6567 6567
6568 void dump_page(struct page *page) 6568 void dump_page(struct page *page)
6569 { 6569 {
6570 printk(KERN_ALERT 6570 printk(KERN_ALERT
6571 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6571 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6572 page, atomic_read(&page->_count), page_mapcount(page), 6572 page, atomic_read(&page->_count), page_mapcount(page),
6573 page->mapping, page->index); 6573 page->mapping, page->index);
6574 dump_page_flags(page->flags); 6574 dump_page_flags(page->flags);
6575 mem_cgroup_print_bad_page(page); 6575 mem_cgroup_print_bad_page(page);
6576 } 6576 }