Commit dc2786f0c19a779395ef69189dd5e7df2573b29b

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent ee1760b2b4

mm: page_alloc: calculate classzone_idx once from the zonelist ref

commit d8846374a85f4290a473a4e2a64c1ba046c4a0e1 upstream.

There is no need to calculate zone_idx(preferred_zone) multiple times
or use the pgdat to figure it out.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
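
To make the intent concrete, the following is a small standalone C sketch of the pattern the patch applies. It is not the kernel diff itself; every name in it is a simplified stand-in for the real mm structures. The idea is to look up the preferred zone from the zonelist once, derive the classzone index from it once, and pass that index down to the helpers, rather than rederiving zone_idx(preferred_zone) (or going through the pgdat) at each call site. The actual hunks, which thread the precomputed index through the allocator's internal helpers, fall beyond this excerpt of the diff.

/* Illustrative only: simplified stand-ins for the kernel's zone/zonelist. */
#include <stdio.h>

struct zone {
	int idx;		/* stand-in for zone_idx(zone) */
	const char *name;
};

struct zonelist {
	struct zone *zones[4];	/* preferred zone first, as in a zonelist ref */
	int nr;
};

static struct zone *first_zone(const struct zonelist *zl)
{
	return zl->nr ? zl->zones[0] : NULL;
}

/* Old pattern: every check rederives the classzone index from the zone. */
static int zone_ok_old(const struct zone *z, const struct zone *preferred)
{
	int classzone_idx = preferred->idx;	/* recomputed on every call */

	return z->idx <= classzone_idx;
}

/* New pattern: the caller computes classzone_idx once and passes it down. */
static int zone_ok_new(const struct zone *z, int classzone_idx)
{
	return z->idx <= classzone_idx;
}

static void allocate(const struct zonelist *zl)
{
	const struct zone *preferred = first_zone(zl);
	int classzone_idx = preferred->idx;	/* calculated exactly once */
	int i;

	for (i = 0; i < zl->nr; i++) {
		const struct zone *z = zl->zones[i];

		/* Both forms agree; the new one avoids the repeated lookup. */
		if (zone_ok_old(z, preferred) != zone_ok_new(z, classzone_idx))
			printf("mismatch for zone %s\n", z->name);
		else if (zone_ok_new(z, classzone_idx))
			printf("zone %s passes the classzone check\n", z->name);
	}
}

int main(void)
{
	struct zone dma = { 0, "DMA" }, normal = { 1, "Normal" };
	struct zonelist zl = { { &normal, &dma }, 2 };

	allocate(&zl);
	return 0;
}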

Showing 1 changed file (mm/page_alloc.c) with 35 additions and 25 deletions; the inline diff of the file follows.

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 #define MIN_PERCPU_PAGELIST_FRACTION (8) 72 #define MIN_PERCPU_PAGELIST_FRACTION (8)
73 73
74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
75 DEFINE_PER_CPU(int, numa_node); 75 DEFINE_PER_CPU(int, numa_node);
76 EXPORT_PER_CPU_SYMBOL(numa_node); 76 EXPORT_PER_CPU_SYMBOL(numa_node);
77 #endif 77 #endif
78 78
79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
80 /* 80 /*
81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
84 * defined in <linux/topology.h>. 84 * defined in <linux/topology.h>.
85 */ 85 */
86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 87 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
88 #endif 88 #endif
89 89
90 /* 90 /*
91 * Array of node states. 91 * Array of node states.
92 */ 92 */
93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
94 [N_POSSIBLE] = NODE_MASK_ALL, 94 [N_POSSIBLE] = NODE_MASK_ALL,
95 [N_ONLINE] = { { [0] = 1UL } }, 95 [N_ONLINE] = { { [0] = 1UL } },
96 #ifndef CONFIG_NUMA 96 #ifndef CONFIG_NUMA
97 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 97 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
98 #ifdef CONFIG_HIGHMEM 98 #ifdef CONFIG_HIGHMEM
99 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 99 [N_HIGH_MEMORY] = { { [0] = 1UL } },
100 #endif 100 #endif
101 #ifdef CONFIG_MOVABLE_NODE 101 #ifdef CONFIG_MOVABLE_NODE
102 [N_MEMORY] = { { [0] = 1UL } }, 102 [N_MEMORY] = { { [0] = 1UL } },
103 #endif 103 #endif
104 [N_CPU] = { { [0] = 1UL } }, 104 [N_CPU] = { { [0] = 1UL } },
105 #endif /* NUMA */ 105 #endif /* NUMA */
106 }; 106 };
107 EXPORT_SYMBOL(node_states); 107 EXPORT_SYMBOL(node_states);
108 108
109 /* Protect totalram_pages and zone->managed_pages */ 109 /* Protect totalram_pages and zone->managed_pages */
110 static DEFINE_SPINLOCK(managed_page_count_lock); 110 static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112 unsigned long totalram_pages __read_mostly; 112 unsigned long totalram_pages __read_mostly;
113 unsigned long totalreserve_pages __read_mostly; 113 unsigned long totalreserve_pages __read_mostly;
114 /* 114 /*
115 * When calculating the number of globally allowed dirty pages, there 115 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 116 * is a certain number of per-zone reserves that should not be
117 * considered dirtyable memory. This is the sum of those reserves 117 * considered dirtyable memory. This is the sum of those reserves
118 * over all existing zones that contribute dirtyable memory. 118 * over all existing zones that contribute dirtyable memory.
119 */ 119 */
120 unsigned long dirty_balance_reserve __read_mostly; 120 unsigned long dirty_balance_reserve __read_mostly;
121 121
122 int percpu_pagelist_fraction; 122 int percpu_pagelist_fraction;
123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
124 124
125 #ifdef CONFIG_PM_SLEEP 125 #ifdef CONFIG_PM_SLEEP
126 /* 126 /*
127 * The following functions are used by the suspend/hibernate code to temporarily 127 * The following functions are used by the suspend/hibernate code to temporarily
128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
129 * while devices are suspended. To avoid races with the suspend/hibernate code, 129 * while devices are suspended. To avoid races with the suspend/hibernate code,
130 * they should always be called with pm_mutex held (gfp_allowed_mask also should 130 * they should always be called with pm_mutex held (gfp_allowed_mask also should
131 * only be modified with pm_mutex held, unless the suspend/hibernate code is 131 * only be modified with pm_mutex held, unless the suspend/hibernate code is
132 * guaranteed not to run in parallel with that modification). 132 * guaranteed not to run in parallel with that modification).
133 */ 133 */
134 134
135 static gfp_t saved_gfp_mask; 135 static gfp_t saved_gfp_mask;
136 136
137 void pm_restore_gfp_mask(void) 137 void pm_restore_gfp_mask(void)
138 { 138 {
139 WARN_ON(!mutex_is_locked(&pm_mutex)); 139 WARN_ON(!mutex_is_locked(&pm_mutex));
140 if (saved_gfp_mask) { 140 if (saved_gfp_mask) {
141 gfp_allowed_mask = saved_gfp_mask; 141 gfp_allowed_mask = saved_gfp_mask;
142 saved_gfp_mask = 0; 142 saved_gfp_mask = 0;
143 } 143 }
144 } 144 }
145 145
146 void pm_restrict_gfp_mask(void) 146 void pm_restrict_gfp_mask(void)
147 { 147 {
148 WARN_ON(!mutex_is_locked(&pm_mutex)); 148 WARN_ON(!mutex_is_locked(&pm_mutex));
149 WARN_ON(saved_gfp_mask); 149 WARN_ON(saved_gfp_mask);
150 saved_gfp_mask = gfp_allowed_mask; 150 saved_gfp_mask = gfp_allowed_mask;
151 gfp_allowed_mask &= ~GFP_IOFS; 151 gfp_allowed_mask &= ~GFP_IOFS;
152 } 152 }
153 153
154 bool pm_suspended_storage(void) 154 bool pm_suspended_storage(void)
155 { 155 {
156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
157 return false; 157 return false;
158 return true; 158 return true;
159 } 159 }
160 #endif /* CONFIG_PM_SLEEP */ 160 #endif /* CONFIG_PM_SLEEP */
161 161
162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
163 int pageblock_order __read_mostly; 163 int pageblock_order __read_mostly;
164 #endif 164 #endif
165 165
166 static void __free_pages_ok(struct page *page, unsigned int order); 166 static void __free_pages_ok(struct page *page, unsigned int order);
167 167
168 /* 168 /*
169 * results with 256, 32 in the lowmem_reserve sysctl: 169 * results with 256, 32 in the lowmem_reserve sysctl:
170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
171 * 1G machine -> (16M dma, 784M normal, 224M high) 171 * 1G machine -> (16M dma, 784M normal, 224M high)
172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
175 * 175 *
176 * TBD: should special case ZONE_DMA32 machines here - in those we normally 176 * TBD: should special case ZONE_DMA32 machines here - in those we normally
177 * don't need any ZONE_NORMAL reservation 177 * don't need any ZONE_NORMAL reservation
178 */ 178 */
179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
180 #ifdef CONFIG_ZONE_DMA 180 #ifdef CONFIG_ZONE_DMA
181 256, 181 256,
182 #endif 182 #endif
183 #ifdef CONFIG_ZONE_DMA32 183 #ifdef CONFIG_ZONE_DMA32
184 256, 184 256,
185 #endif 185 #endif
186 #ifdef CONFIG_HIGHMEM 186 #ifdef CONFIG_HIGHMEM
187 32, 187 32,
188 #endif 188 #endif
189 32, 189 32,
190 }; 190 };
191 191
192 EXPORT_SYMBOL(totalram_pages); 192 EXPORT_SYMBOL(totalram_pages);
193 193
194 static char * const zone_names[MAX_NR_ZONES] = { 194 static char * const zone_names[MAX_NR_ZONES] = {
195 #ifdef CONFIG_ZONE_DMA 195 #ifdef CONFIG_ZONE_DMA
196 "DMA", 196 "DMA",
197 #endif 197 #endif
198 #ifdef CONFIG_ZONE_DMA32 198 #ifdef CONFIG_ZONE_DMA32
199 "DMA32", 199 "DMA32",
200 #endif 200 #endif
201 "Normal", 201 "Normal",
202 #ifdef CONFIG_HIGHMEM 202 #ifdef CONFIG_HIGHMEM
203 "HighMem", 203 "HighMem",
204 #endif 204 #endif
205 "Movable", 205 "Movable",
206 }; 206 };
207 207
208 int min_free_kbytes = 1024; 208 int min_free_kbytes = 1024;
209 int user_min_free_kbytes; 209 int user_min_free_kbytes;
210 210
211 static unsigned long __meminitdata nr_kernel_pages; 211 static unsigned long __meminitdata nr_kernel_pages;
212 static unsigned long __meminitdata nr_all_pages; 212 static unsigned long __meminitdata nr_all_pages;
213 static unsigned long __meminitdata dma_reserve; 213 static unsigned long __meminitdata dma_reserve;
214 214
215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
218 static unsigned long __initdata required_kernelcore; 218 static unsigned long __initdata required_kernelcore;
219 static unsigned long __initdata required_movablecore; 219 static unsigned long __initdata required_movablecore;
220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
221 221
222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
223 int movable_zone; 223 int movable_zone;
224 EXPORT_SYMBOL(movable_zone); 224 EXPORT_SYMBOL(movable_zone);
225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
226 226
227 #if MAX_NUMNODES > 1 227 #if MAX_NUMNODES > 1
228 int nr_node_ids __read_mostly = MAX_NUMNODES; 228 int nr_node_ids __read_mostly = MAX_NUMNODES;
229 int nr_online_nodes __read_mostly = 1; 229 int nr_online_nodes __read_mostly = 1;
230 EXPORT_SYMBOL(nr_node_ids); 230 EXPORT_SYMBOL(nr_node_ids);
231 EXPORT_SYMBOL(nr_online_nodes); 231 EXPORT_SYMBOL(nr_online_nodes);
232 #endif 232 #endif
233 233
234 int page_group_by_mobility_disabled __read_mostly; 234 int page_group_by_mobility_disabled __read_mostly;
235 235
236 void set_pageblock_migratetype(struct page *page, int migratetype) 236 void set_pageblock_migratetype(struct page *page, int migratetype)
237 { 237 {
238 238
239 if (unlikely(page_group_by_mobility_disabled)) 239 if (unlikely(page_group_by_mobility_disabled))
240 migratetype = MIGRATE_UNMOVABLE; 240 migratetype = MIGRATE_UNMOVABLE;
241 241
242 set_pageblock_flags_group(page, (unsigned long)migratetype, 242 set_pageblock_flags_group(page, (unsigned long)migratetype,
243 PB_migrate, PB_migrate_end); 243 PB_migrate, PB_migrate_end);
244 } 244 }
245 245
246 bool oom_killer_disabled __read_mostly; 246 bool oom_killer_disabled __read_mostly;
247 247
248 #ifdef CONFIG_DEBUG_VM 248 #ifdef CONFIG_DEBUG_VM
249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
250 { 250 {
251 int ret = 0; 251 int ret = 0;
252 unsigned seq; 252 unsigned seq;
253 unsigned long pfn = page_to_pfn(page); 253 unsigned long pfn = page_to_pfn(page);
254 unsigned long sp, start_pfn; 254 unsigned long sp, start_pfn;
255 255
256 do { 256 do {
257 seq = zone_span_seqbegin(zone); 257 seq = zone_span_seqbegin(zone);
258 start_pfn = zone->zone_start_pfn; 258 start_pfn = zone->zone_start_pfn;
259 sp = zone->spanned_pages; 259 sp = zone->spanned_pages;
260 if (!zone_spans_pfn(zone, pfn)) 260 if (!zone_spans_pfn(zone, pfn))
261 ret = 1; 261 ret = 1;
262 } while (zone_span_seqretry(zone, seq)); 262 } while (zone_span_seqretry(zone, seq));
263 263
264 if (ret) 264 if (ret)
265 pr_err("page %lu outside zone [ %lu - %lu ]\n", 265 pr_err("page %lu outside zone [ %lu - %lu ]\n",
266 pfn, start_pfn, start_pfn + sp); 266 pfn, start_pfn, start_pfn + sp);
267 267
268 return ret; 268 return ret;
269 } 269 }
270 270
271 static int page_is_consistent(struct zone *zone, struct page *page) 271 static int page_is_consistent(struct zone *zone, struct page *page)
272 { 272 {
273 if (!pfn_valid_within(page_to_pfn(page))) 273 if (!pfn_valid_within(page_to_pfn(page)))
274 return 0; 274 return 0;
275 if (zone != page_zone(page)) 275 if (zone != page_zone(page))
276 return 0; 276 return 0;
277 277
278 return 1; 278 return 1;
279 } 279 }
280 /* 280 /*
281 * Temporary debugging check for pages not lying within a given zone. 281 * Temporary debugging check for pages not lying within a given zone.
282 */ 282 */
283 static int bad_range(struct zone *zone, struct page *page) 283 static int bad_range(struct zone *zone, struct page *page)
284 { 284 {
285 if (page_outside_zone_boundaries(zone, page)) 285 if (page_outside_zone_boundaries(zone, page))
286 return 1; 286 return 1;
287 if (!page_is_consistent(zone, page)) 287 if (!page_is_consistent(zone, page))
288 return 1; 288 return 1;
289 289
290 return 0; 290 return 0;
291 } 291 }
292 #else 292 #else
293 static inline int bad_range(struct zone *zone, struct page *page) 293 static inline int bad_range(struct zone *zone, struct page *page)
294 { 294 {
295 return 0; 295 return 0;
296 } 296 }
297 #endif 297 #endif
298 298
299 static void bad_page(struct page *page) 299 static void bad_page(struct page *page)
300 { 300 {
301 static unsigned long resume; 301 static unsigned long resume;
302 static unsigned long nr_shown; 302 static unsigned long nr_shown;
303 static unsigned long nr_unshown; 303 static unsigned long nr_unshown;
304 304
305 /* Don't complain about poisoned pages */ 305 /* Don't complain about poisoned pages */
306 if (PageHWPoison(page)) { 306 if (PageHWPoison(page)) {
307 page_mapcount_reset(page); /* remove PageBuddy */ 307 page_mapcount_reset(page); /* remove PageBuddy */
308 return; 308 return;
309 } 309 }
310 310
311 /* 311 /*
312 * Allow a burst of 60 reports, then keep quiet for that minute; 312 * Allow a burst of 60 reports, then keep quiet for that minute;
313 * or allow a steady drip of one report per second. 313 * or allow a steady drip of one report per second.
314 */ 314 */
315 if (nr_shown == 60) { 315 if (nr_shown == 60) {
316 if (time_before(jiffies, resume)) { 316 if (time_before(jiffies, resume)) {
317 nr_unshown++; 317 nr_unshown++;
318 goto out; 318 goto out;
319 } 319 }
320 if (nr_unshown) { 320 if (nr_unshown) {
321 printk(KERN_ALERT 321 printk(KERN_ALERT
322 "BUG: Bad page state: %lu messages suppressed\n", 322 "BUG: Bad page state: %lu messages suppressed\n",
323 nr_unshown); 323 nr_unshown);
324 nr_unshown = 0; 324 nr_unshown = 0;
325 } 325 }
326 nr_shown = 0; 326 nr_shown = 0;
327 } 327 }
328 if (nr_shown++ == 0) 328 if (nr_shown++ == 0)
329 resume = jiffies + 60 * HZ; 329 resume = jiffies + 60 * HZ;
330 330
331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
332 current->comm, page_to_pfn(page)); 332 current->comm, page_to_pfn(page));
333 dump_page(page); 333 dump_page(page);
334 334
335 print_modules(); 335 print_modules();
336 dump_stack(); 336 dump_stack();
337 out: 337 out:
338 /* Leave bad fields for debug, except PageBuddy could make trouble */ 338 /* Leave bad fields for debug, except PageBuddy could make trouble */
339 page_mapcount_reset(page); /* remove PageBuddy */ 339 page_mapcount_reset(page); /* remove PageBuddy */
340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
341 } 341 }
342 342
343 /* 343 /*
344 * Higher-order pages are called "compound pages". They are structured thusly: 344 * Higher-order pages are called "compound pages". They are structured thusly:
345 * 345 *
346 * The first PAGE_SIZE page is called the "head page". 346 * The first PAGE_SIZE page is called the "head page".
347 * 347 *
348 * The remaining PAGE_SIZE pages are called "tail pages". 348 * The remaining PAGE_SIZE pages are called "tail pages".
349 * 349 *
350 * All pages have PG_compound set. All tail pages have their ->first_page 350 * All pages have PG_compound set. All tail pages have their ->first_page
351 * pointing at the head page. 351 * pointing at the head page.
352 * 352 *
353 * The first tail page's ->lru.next holds the address of the compound page's 353 * The first tail page's ->lru.next holds the address of the compound page's
354 * put_page() function. Its ->lru.prev holds the order of allocation. 354 * put_page() function. Its ->lru.prev holds the order of allocation.
355 * This usage means that zero-order pages may not be compound. 355 * This usage means that zero-order pages may not be compound.
356 */ 356 */
357 357
358 static void free_compound_page(struct page *page) 358 static void free_compound_page(struct page *page)
359 { 359 {
360 __free_pages_ok(page, compound_order(page)); 360 __free_pages_ok(page, compound_order(page));
361 } 361 }
362 362
363 void prep_compound_page(struct page *page, unsigned long order) 363 void prep_compound_page(struct page *page, unsigned long order)
364 { 364 {
365 int i; 365 int i;
366 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
367 367
368 set_compound_page_dtor(page, free_compound_page); 368 set_compound_page_dtor(page, free_compound_page);
369 set_compound_order(page, order); 369 set_compound_order(page, order);
370 __SetPageHead(page); 370 __SetPageHead(page);
371 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
372 struct page *p = page + i; 372 struct page *p = page + i;
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 /* Make sure p->first_page is always valid for PageTail() */ 375 /* Make sure p->first_page is always valid for PageTail() */
376 smp_wmb(); 376 smp_wmb();
377 __SetPageTail(p); 377 __SetPageTail(p);
378 } 378 }
379 } 379 }
380 380
381 /* update __split_huge_page_refcount if you change this function */ 381 /* update __split_huge_page_refcount if you change this function */
382 static int destroy_compound_page(struct page *page, unsigned long order) 382 static int destroy_compound_page(struct page *page, unsigned long order)
383 { 383 {
384 int i; 384 int i;
385 int nr_pages = 1 << order; 385 int nr_pages = 1 << order;
386 int bad = 0; 386 int bad = 0;
387 387
388 if (unlikely(compound_order(page) != order)) { 388 if (unlikely(compound_order(page) != order)) {
389 bad_page(page); 389 bad_page(page);
390 bad++; 390 bad++;
391 } 391 }
392 392
393 __ClearPageHead(page); 393 __ClearPageHead(page);
394 394
395 for (i = 1; i < nr_pages; i++) { 395 for (i = 1; i < nr_pages; i++) {
396 struct page *p = page + i; 396 struct page *p = page + i;
397 397
398 if (unlikely(!PageTail(p) || (p->first_page != page))) { 398 if (unlikely(!PageTail(p) || (p->first_page != page))) {
399 bad_page(page); 399 bad_page(page);
400 bad++; 400 bad++;
401 } 401 }
402 __ClearPageTail(p); 402 __ClearPageTail(p);
403 } 403 }
404 404
405 return bad; 405 return bad;
406 } 406 }
407 407
408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
409 { 409 {
410 int i; 410 int i;
411 411
412 /* 412 /*
413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
414 * and __GFP_HIGHMEM from hard or soft interrupt context. 414 * and __GFP_HIGHMEM from hard or soft interrupt context.
415 */ 415 */
416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
417 for (i = 0; i < (1 << order); i++) 417 for (i = 0; i < (1 << order); i++)
418 clear_highpage(page + i); 418 clear_highpage(page + i);
419 } 419 }
420 420
421 #ifdef CONFIG_DEBUG_PAGEALLOC 421 #ifdef CONFIG_DEBUG_PAGEALLOC
422 unsigned int _debug_guardpage_minorder; 422 unsigned int _debug_guardpage_minorder;
423 423
424 static int __init debug_guardpage_minorder_setup(char *buf) 424 static int __init debug_guardpage_minorder_setup(char *buf)
425 { 425 {
426 unsigned long res; 426 unsigned long res;
427 427
428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
430 return 0; 430 return 0;
431 } 431 }
432 _debug_guardpage_minorder = res; 432 _debug_guardpage_minorder = res;
433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
434 return 0; 434 return 0;
435 } 435 }
436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
437 437
438 static inline void set_page_guard_flag(struct page *page) 438 static inline void set_page_guard_flag(struct page *page)
439 { 439 {
440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
441 } 441 }
442 442
443 static inline void clear_page_guard_flag(struct page *page) 443 static inline void clear_page_guard_flag(struct page *page)
444 { 444 {
445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
446 } 446 }
447 #else 447 #else
448 static inline void set_page_guard_flag(struct page *page) { } 448 static inline void set_page_guard_flag(struct page *page) { }
449 static inline void clear_page_guard_flag(struct page *page) { } 449 static inline void clear_page_guard_flag(struct page *page) { }
450 #endif 450 #endif
451 451
452 static inline void set_page_order(struct page *page, int order) 452 static inline void set_page_order(struct page *page, int order)
453 { 453 {
454 set_page_private(page, order); 454 set_page_private(page, order);
455 __SetPageBuddy(page); 455 __SetPageBuddy(page);
456 } 456 }
457 457
458 static inline void rmv_page_order(struct page *page) 458 static inline void rmv_page_order(struct page *page)
459 { 459 {
460 __ClearPageBuddy(page); 460 __ClearPageBuddy(page);
461 set_page_private(page, 0); 461 set_page_private(page, 0);
462 } 462 }
463 463
464 /* 464 /*
465 * Locate the struct page for both the matching buddy in our 465 * Locate the struct page for both the matching buddy in our
466 * pair (buddy1) and the combined O(n+1) page they form (page). 466 * pair (buddy1) and the combined O(n+1) page they form (page).
467 * 467 *
468 * 1) Any buddy B1 will have an order O twin B2 which satisfies 468 * 1) Any buddy B1 will have an order O twin B2 which satisfies
469 * the following equation: 469 * the following equation:
470 * B2 = B1 ^ (1 << O) 470 * B2 = B1 ^ (1 << O)
471 * For example, if the starting buddy (buddy2) is #8 its order 471 * For example, if the starting buddy (buddy2) is #8 its order
472 * 1 buddy is #10: 472 * 1 buddy is #10:
473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
474 * 474 *
475 * 2) Any buddy B will have an order O+1 parent P which 475 * 2) Any buddy B will have an order O+1 parent P which
476 * satisfies the following equation: 476 * satisfies the following equation:
477 * P = B & ~(1 << O) 477 * P = B & ~(1 << O)
478 * 478 *
479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
480 */ 480 */
481 static inline unsigned long 481 static inline unsigned long
482 __find_buddy_index(unsigned long page_idx, unsigned int order) 482 __find_buddy_index(unsigned long page_idx, unsigned int order)
483 { 483 {
484 return page_idx ^ (1 << order); 484 return page_idx ^ (1 << order);
485 } 485 }
486 486
487 /* 487 /*
488 * This function checks whether a page is free && is the buddy 488 * This function checks whether a page is free && is the buddy
489 * we can do coalesce a page and its buddy if 489 * we can do coalesce a page and its buddy if
490 * (a) the buddy is not in a hole && 490 * (a) the buddy is not in a hole &&
491 * (b) the buddy is in the buddy system && 491 * (b) the buddy is in the buddy system &&
492 * (c) a page and its buddy have the same order && 492 * (c) a page and its buddy have the same order &&
493 * (d) a page and its buddy are in the same zone. 493 * (d) a page and its buddy are in the same zone.
494 * 494 *
495 * For recording whether a page is in the buddy system, we set ->_mapcount 495 * For recording whether a page is in the buddy system, we set ->_mapcount
496 * PAGE_BUDDY_MAPCOUNT_VALUE. 496 * PAGE_BUDDY_MAPCOUNT_VALUE.
497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
498 * serialized by zone->lock. 498 * serialized by zone->lock.
499 * 499 *
500 * For recording page's order, we use page_private(page). 500 * For recording page's order, we use page_private(page).
501 */ 501 */
502 static inline int page_is_buddy(struct page *page, struct page *buddy, 502 static inline int page_is_buddy(struct page *page, struct page *buddy,
503 int order) 503 int order)
504 { 504 {
505 if (!pfn_valid_within(page_to_pfn(buddy))) 505 if (!pfn_valid_within(page_to_pfn(buddy)))
506 return 0; 506 return 0;
507 507
508 if (page_zone_id(page) != page_zone_id(buddy)) 508 if (page_zone_id(page) != page_zone_id(buddy))
509 return 0; 509 return 0;
510 510
511 if (page_is_guard(buddy) && page_order(buddy) == order) { 511 if (page_is_guard(buddy) && page_order(buddy) == order) {
512 VM_BUG_ON(page_count(buddy) != 0); 512 VM_BUG_ON(page_count(buddy) != 0);
513 return 1; 513 return 1;
514 } 514 }
515 515
516 if (PageBuddy(buddy) && page_order(buddy) == order) { 516 if (PageBuddy(buddy) && page_order(buddy) == order) {
517 VM_BUG_ON(page_count(buddy) != 0); 517 VM_BUG_ON(page_count(buddy) != 0);
518 return 1; 518 return 1;
519 } 519 }
520 return 0; 520 return 0;
521 } 521 }
522 522
523 /* 523 /*
524 * Freeing function for a buddy system allocator. 524 * Freeing function for a buddy system allocator.
525 * 525 *
526 * The concept of a buddy system is to maintain direct-mapped table 526 * The concept of a buddy system is to maintain direct-mapped table
527 * (containing bit values) for memory blocks of various "orders". 527 * (containing bit values) for memory blocks of various "orders".
528 * The bottom level table contains the map for the smallest allocatable 528 * The bottom level table contains the map for the smallest allocatable
529 * units of memory (here, pages), and each level above it describes 529 * units of memory (here, pages), and each level above it describes
530 * pairs of units from the levels below, hence, "buddies". 530 * pairs of units from the levels below, hence, "buddies".
531 * At a high level, all that happens here is marking the table entry 531 * At a high level, all that happens here is marking the table entry
532 * at the bottom level available, and propagating the changes upward 532 * at the bottom level available, and propagating the changes upward
533 * as necessary, plus some accounting needed to play nicely with other 533 * as necessary, plus some accounting needed to play nicely with other
534 * parts of the VM system. 534 * parts of the VM system.
535 * At each level, we keep a list of pages, which are heads of continuous 535 * At each level, we keep a list of pages, which are heads of continuous
536 * free pages of length of (1 << order) and marked with _mapcount 536 * free pages of length of (1 << order) and marked with _mapcount
537 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 537 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
538 * field. 538 * field.
539 * So when we are allocating or freeing one, we can derive the state of the 539 * So when we are allocating or freeing one, we can derive the state of the
540 * other. That is, if we allocate a small block, and both were 540 * other. That is, if we allocate a small block, and both were
541 * free, the remainder of the region must be split into blocks. 541 * free, the remainder of the region must be split into blocks.
542 * If a block is freed, and its buddy is also free, then this 542 * If a block is freed, and its buddy is also free, then this
543 * triggers coalescing into a block of larger size. 543 * triggers coalescing into a block of larger size.
544 * 544 *
545 * -- nyc 545 * -- nyc
546 */ 546 */
547 547
548 static inline void __free_one_page(struct page *page, 548 static inline void __free_one_page(struct page *page,
549 struct zone *zone, unsigned int order, 549 struct zone *zone, unsigned int order,
550 int migratetype) 550 int migratetype)
551 { 551 {
552 unsigned long page_idx; 552 unsigned long page_idx;
553 unsigned long combined_idx; 553 unsigned long combined_idx;
554 unsigned long uninitialized_var(buddy_idx); 554 unsigned long uninitialized_var(buddy_idx);
555 struct page *buddy; 555 struct page *buddy;
556 556
557 VM_BUG_ON(!zone_is_initialized(zone)); 557 VM_BUG_ON(!zone_is_initialized(zone));
558 558
559 if (unlikely(PageCompound(page))) 559 if (unlikely(PageCompound(page)))
560 if (unlikely(destroy_compound_page(page, order))) 560 if (unlikely(destroy_compound_page(page, order)))
561 return; 561 return;
562 562
563 VM_BUG_ON(migratetype == -1); 563 VM_BUG_ON(migratetype == -1);
564 564
565 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 565 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
566 566
567 VM_BUG_ON(page_idx & ((1 << order) - 1)); 567 VM_BUG_ON(page_idx & ((1 << order) - 1));
568 VM_BUG_ON(bad_range(zone, page)); 568 VM_BUG_ON(bad_range(zone, page));
569 569
570 while (order < MAX_ORDER-1) { 570 while (order < MAX_ORDER-1) {
571 buddy_idx = __find_buddy_index(page_idx, order); 571 buddy_idx = __find_buddy_index(page_idx, order);
572 buddy = page + (buddy_idx - page_idx); 572 buddy = page + (buddy_idx - page_idx);
573 if (!page_is_buddy(page, buddy, order)) 573 if (!page_is_buddy(page, buddy, order))
574 break; 574 break;
575 /* 575 /*
576 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 576 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
577 * merge with it and move up one order. 577 * merge with it and move up one order.
578 */ 578 */
579 if (page_is_guard(buddy)) { 579 if (page_is_guard(buddy)) {
580 clear_page_guard_flag(buddy); 580 clear_page_guard_flag(buddy);
581 set_page_private(page, 0); 581 set_page_private(page, 0);
582 __mod_zone_freepage_state(zone, 1 << order, 582 __mod_zone_freepage_state(zone, 1 << order,
583 migratetype); 583 migratetype);
584 } else { 584 } else {
585 list_del(&buddy->lru); 585 list_del(&buddy->lru);
586 zone->free_area[order].nr_free--; 586 zone->free_area[order].nr_free--;
587 rmv_page_order(buddy); 587 rmv_page_order(buddy);
588 } 588 }
589 combined_idx = buddy_idx & page_idx; 589 combined_idx = buddy_idx & page_idx;
590 page = page + (combined_idx - page_idx); 590 page = page + (combined_idx - page_idx);
591 page_idx = combined_idx; 591 page_idx = combined_idx;
592 order++; 592 order++;
593 } 593 }
594 set_page_order(page, order); 594 set_page_order(page, order);
595 595
596 /* 596 /*
597 * If this is not the largest possible page, check if the buddy 597 * If this is not the largest possible page, check if the buddy
598 * of the next-highest order is free. If it is, it's possible 598 * of the next-highest order is free. If it is, it's possible
599 * that pages are being freed that will coalesce soon. In case, 599 * that pages are being freed that will coalesce soon. In case,
600 * that is happening, add the free page to the tail of the list 600 * that is happening, add the free page to the tail of the list
601 * so it's less likely to be used soon and more likely to be merged 601 * so it's less likely to be used soon and more likely to be merged
602 * as a higher order page 602 * as a higher order page
603 */ 603 */
604 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 604 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
605 struct page *higher_page, *higher_buddy; 605 struct page *higher_page, *higher_buddy;
606 combined_idx = buddy_idx & page_idx; 606 combined_idx = buddy_idx & page_idx;
607 higher_page = page + (combined_idx - page_idx); 607 higher_page = page + (combined_idx - page_idx);
608 buddy_idx = __find_buddy_index(combined_idx, order + 1); 608 buddy_idx = __find_buddy_index(combined_idx, order + 1);
609 higher_buddy = higher_page + (buddy_idx - combined_idx); 609 higher_buddy = higher_page + (buddy_idx - combined_idx);
610 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 610 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
611 list_add_tail(&page->lru, 611 list_add_tail(&page->lru,
612 &zone->free_area[order].free_list[migratetype]); 612 &zone->free_area[order].free_list[migratetype]);
613 goto out; 613 goto out;
614 } 614 }
615 } 615 }
616 616
617 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 617 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
618 out: 618 out:
619 zone->free_area[order].nr_free++; 619 zone->free_area[order].nr_free++;
620 } 620 }
621 621
622 static inline int free_pages_check(struct page *page) 622 static inline int free_pages_check(struct page *page)
623 { 623 {
624 if (unlikely(page_mapcount(page) | 624 if (unlikely(page_mapcount(page) |
625 (page->mapping != NULL) | 625 (page->mapping != NULL) |
626 (atomic_read(&page->_count) != 0) | 626 (atomic_read(&page->_count) != 0) |
627 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 627 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
628 (mem_cgroup_bad_page_check(page)))) { 628 (mem_cgroup_bad_page_check(page)))) {
629 bad_page(page); 629 bad_page(page);
630 return 1; 630 return 1;
631 } 631 }
632 page_nid_reset_last(page); 632 page_nid_reset_last(page);
633 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 633 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
634 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 634 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
635 return 0; 635 return 0;
636 } 636 }
637 637
638 /* 638 /*
639 * Frees a number of pages from the PCP lists 639 * Frees a number of pages from the PCP lists
640 * Assumes all pages on list are in same zone, and of same order. 640 * Assumes all pages on list are in same zone, and of same order.
641 * count is the number of pages to free. 641 * count is the number of pages to free.
642 * 642 *
643 * If the zone was previously in an "all pages pinned" state then look to 643 * If the zone was previously in an "all pages pinned" state then look to
644 * see if this freeing clears that state. 644 * see if this freeing clears that state.
645 * 645 *
646 * And clear the zone's pages_scanned counter, to hold off the "all pages are 646 * And clear the zone's pages_scanned counter, to hold off the "all pages are
647 * pinned" detection logic. 647 * pinned" detection logic.
648 */ 648 */
649 static void free_pcppages_bulk(struct zone *zone, int count, 649 static void free_pcppages_bulk(struct zone *zone, int count,
650 struct per_cpu_pages *pcp) 650 struct per_cpu_pages *pcp)
651 { 651 {
652 int migratetype = 0; 652 int migratetype = 0;
653 int batch_free = 0; 653 int batch_free = 0;
654 int to_free = count; 654 int to_free = count;
655 655
656 spin_lock(&zone->lock); 656 spin_lock(&zone->lock);
657 zone->pages_scanned = 0; 657 zone->pages_scanned = 0;
658 658
659 while (to_free) { 659 while (to_free) {
660 struct page *page; 660 struct page *page;
661 struct list_head *list; 661 struct list_head *list;
662 662
663 /* 663 /*
664 * Remove pages from lists in a round-robin fashion. A 664 * Remove pages from lists in a round-robin fashion. A
665 * batch_free count is maintained that is incremented when an 665 * batch_free count is maintained that is incremented when an
666 * empty list is encountered. This is so more pages are freed 666 * empty list is encountered. This is so more pages are freed
667 * off fuller lists instead of spinning excessively around empty 667 * off fuller lists instead of spinning excessively around empty
668 * lists 668 * lists
669 */ 669 */
670 do { 670 do {
671 batch_free++; 671 batch_free++;
672 if (++migratetype == MIGRATE_PCPTYPES) 672 if (++migratetype == MIGRATE_PCPTYPES)
673 migratetype = 0; 673 migratetype = 0;
674 list = &pcp->lists[migratetype]; 674 list = &pcp->lists[migratetype];
675 } while (list_empty(list)); 675 } while (list_empty(list));
676 676
677 /* This is the only non-empty list. Free them all. */ 677 /* This is the only non-empty list. Free them all. */
678 if (batch_free == MIGRATE_PCPTYPES) 678 if (batch_free == MIGRATE_PCPTYPES)
679 batch_free = to_free; 679 batch_free = to_free;
680 680
681 do { 681 do {
682 int mt; /* migratetype of the to-be-freed page */ 682 int mt; /* migratetype of the to-be-freed page */
683 683
684 page = list_entry(list->prev, struct page, lru); 684 page = list_entry(list->prev, struct page, lru);
685 /* must delete as __free_one_page list manipulates */ 685 /* must delete as __free_one_page list manipulates */
686 list_del(&page->lru); 686 list_del(&page->lru);
687 mt = get_freepage_migratetype(page); 687 mt = get_freepage_migratetype(page);
688 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 688 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
689 __free_one_page(page, zone, 0, mt); 689 __free_one_page(page, zone, 0, mt);
690 trace_mm_page_pcpu_drain(page, 0, mt); 690 trace_mm_page_pcpu_drain(page, 0, mt);
691 if (likely(!is_migrate_isolate_page(page))) { 691 if (likely(!is_migrate_isolate_page(page))) {
692 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 692 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
693 if (is_migrate_cma(mt)) 693 if (is_migrate_cma(mt))
694 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 694 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
695 } 695 }
696 } while (--to_free && --batch_free && !list_empty(list)); 696 } while (--to_free && --batch_free && !list_empty(list));
697 } 697 }
698 spin_unlock(&zone->lock); 698 spin_unlock(&zone->lock);
699 } 699 }
700 700
701 static void free_one_page(struct zone *zone, struct page *page, int order, 701 static void free_one_page(struct zone *zone, struct page *page, int order,
702 int migratetype) 702 int migratetype)
703 { 703 {
704 spin_lock(&zone->lock); 704 spin_lock(&zone->lock);
705 zone->pages_scanned = 0; 705 zone->pages_scanned = 0;
706 706
707 __free_one_page(page, zone, order, migratetype); 707 __free_one_page(page, zone, order, migratetype);
708 if (unlikely(!is_migrate_isolate(migratetype))) 708 if (unlikely(!is_migrate_isolate(migratetype)))
709 __mod_zone_freepage_state(zone, 1 << order, migratetype); 709 __mod_zone_freepage_state(zone, 1 << order, migratetype);
710 spin_unlock(&zone->lock); 710 spin_unlock(&zone->lock);
711 } 711 }
712 712
713 static bool free_pages_prepare(struct page *page, unsigned int order) 713 static bool free_pages_prepare(struct page *page, unsigned int order)
714 { 714 {
715 int i; 715 int i;
716 int bad = 0; 716 int bad = 0;
717 717
718 trace_mm_page_free(page, order); 718 trace_mm_page_free(page, order);
719 kmemcheck_free_shadow(page, order); 719 kmemcheck_free_shadow(page, order);
720 720
721 if (PageAnon(page)) 721 if (PageAnon(page))
722 page->mapping = NULL; 722 page->mapping = NULL;
723 for (i = 0; i < (1 << order); i++) 723 for (i = 0; i < (1 << order); i++)
724 bad += free_pages_check(page + i); 724 bad += free_pages_check(page + i);
725 if (bad) 725 if (bad)
726 return false; 726 return false;
727 727
728 if (!PageHighMem(page)) { 728 if (!PageHighMem(page)) {
729 debug_check_no_locks_freed(page_address(page), 729 debug_check_no_locks_freed(page_address(page),
730 PAGE_SIZE << order); 730 PAGE_SIZE << order);
731 debug_check_no_obj_freed(page_address(page), 731 debug_check_no_obj_freed(page_address(page),
732 PAGE_SIZE << order); 732 PAGE_SIZE << order);
733 } 733 }
734 arch_free_page(page, order); 734 arch_free_page(page, order);
735 kernel_map_pages(page, 1 << order, 0); 735 kernel_map_pages(page, 1 << order, 0);
736 736
737 return true; 737 return true;
738 } 738 }
739 739
740 static void __free_pages_ok(struct page *page, unsigned int order) 740 static void __free_pages_ok(struct page *page, unsigned int order)
741 { 741 {
742 unsigned long flags; 742 unsigned long flags;
743 int migratetype; 743 int migratetype;
744 744
745 if (!free_pages_prepare(page, order)) 745 if (!free_pages_prepare(page, order))
746 return; 746 return;
747 747
748 local_irq_save(flags); 748 local_irq_save(flags);
749 __count_vm_events(PGFREE, 1 << order); 749 __count_vm_events(PGFREE, 1 << order);
750 migratetype = get_pageblock_migratetype(page); 750 migratetype = get_pageblock_migratetype(page);
751 set_freepage_migratetype(page, migratetype); 751 set_freepage_migratetype(page, migratetype);
752 free_one_page(page_zone(page), page, order, migratetype); 752 free_one_page(page_zone(page), page, order, migratetype);
753 local_irq_restore(flags); 753 local_irq_restore(flags);
754 } 754 }
755 755
756 void __init __free_pages_bootmem(struct page *page, unsigned int order) 756 void __init __free_pages_bootmem(struct page *page, unsigned int order)
757 { 757 {
758 unsigned int nr_pages = 1 << order; 758 unsigned int nr_pages = 1 << order;
759 struct page *p = page; 759 struct page *p = page;
760 unsigned int loop; 760 unsigned int loop;
761 761
762 prefetchw(p); 762 prefetchw(p);
763 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 763 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
764 prefetchw(p + 1); 764 prefetchw(p + 1);
765 __ClearPageReserved(p); 765 __ClearPageReserved(p);
766 set_page_count(p, 0); 766 set_page_count(p, 0);
767 } 767 }
768 __ClearPageReserved(p); 768 __ClearPageReserved(p);
769 set_page_count(p, 0); 769 set_page_count(p, 0);
770 770
771 page_zone(page)->managed_pages += nr_pages; 771 page_zone(page)->managed_pages += nr_pages;
772 set_page_refcounted(page); 772 set_page_refcounted(page);
773 __free_pages(page, order); 773 __free_pages(page, order);
774 } 774 }
775 775
776 #ifdef CONFIG_CMA 776 #ifdef CONFIG_CMA
777 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 777 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
778 void __init init_cma_reserved_pageblock(struct page *page) 778 void __init init_cma_reserved_pageblock(struct page *page)
779 { 779 {
780 unsigned i = pageblock_nr_pages; 780 unsigned i = pageblock_nr_pages;
781 struct page *p = page; 781 struct page *p = page;
782 782
783 do { 783 do {
784 __ClearPageReserved(p); 784 __ClearPageReserved(p);
785 set_page_count(p, 0); 785 set_page_count(p, 0);
786 } while (++p, --i); 786 } while (++p, --i);
787 787
788 set_pageblock_migratetype(page, MIGRATE_CMA); 788 set_pageblock_migratetype(page, MIGRATE_CMA);
789 789
790 if (pageblock_order >= MAX_ORDER) { 790 if (pageblock_order >= MAX_ORDER) {
791 i = pageblock_nr_pages; 791 i = pageblock_nr_pages;
792 p = page; 792 p = page;
793 do { 793 do {
794 set_page_refcounted(p); 794 set_page_refcounted(p);
795 __free_pages(p, MAX_ORDER - 1); 795 __free_pages(p, MAX_ORDER - 1);
796 p += MAX_ORDER_NR_PAGES; 796 p += MAX_ORDER_NR_PAGES;
797 } while (i -= MAX_ORDER_NR_PAGES); 797 } while (i -= MAX_ORDER_NR_PAGES);
798 } else { 798 } else {
799 set_page_refcounted(page); 799 set_page_refcounted(page);
800 __free_pages(page, pageblock_order); 800 __free_pages(page, pageblock_order);
801 } 801 }
802 802
803 adjust_managed_page_count(page, pageblock_nr_pages); 803 adjust_managed_page_count(page, pageblock_nr_pages);
804 } 804 }
805 #endif 805 #endif
806 806
807 /* 807 /*
808 * The order of subdivision here is critical for the IO subsystem. 808 * The order of subdivision here is critical for the IO subsystem.
809 * Please do not alter this order without good reasons and regression 809 * Please do not alter this order without good reasons and regression
810 * testing. Specifically, as large blocks of memory are subdivided, 810 * testing. Specifically, as large blocks of memory are subdivided,
811 * the order in which smaller blocks are delivered depends on the order 811 * the order in which smaller blocks are delivered depends on the order
812 * they're subdivided in this function. This is the primary factor 812 * they're subdivided in this function. This is the primary factor
813 * influencing the order in which pages are delivered to the IO 813 * influencing the order in which pages are delivered to the IO
814 * subsystem according to empirical testing, and this is also justified 814 * subsystem according to empirical testing, and this is also justified
815 * by considering the behavior of a buddy system containing a single 815 * by considering the behavior of a buddy system containing a single
816 * large block of memory acted on by a series of small allocations. 816 * large block of memory acted on by a series of small allocations.
817 * This behavior is a critical factor in sglist merging's success. 817 * This behavior is a critical factor in sglist merging's success.
818 * 818 *
819 * -- nyc 819 * -- nyc
820 */ 820 */
821 static inline void expand(struct zone *zone, struct page *page, 821 static inline void expand(struct zone *zone, struct page *page,
822 int low, int high, struct free_area *area, 822 int low, int high, struct free_area *area,
823 int migratetype) 823 int migratetype)
824 { 824 {
825 unsigned long size = 1 << high; 825 unsigned long size = 1 << high;
826 826
827 while (high > low) { 827 while (high > low) {
828 area--; 828 area--;
829 high--; 829 high--;
830 size >>= 1; 830 size >>= 1;
831 VM_BUG_ON(bad_range(zone, &page[size])); 831 VM_BUG_ON(bad_range(zone, &page[size]));
832 832
833 #ifdef CONFIG_DEBUG_PAGEALLOC 833 #ifdef CONFIG_DEBUG_PAGEALLOC
834 if (high < debug_guardpage_minorder()) { 834 if (high < debug_guardpage_minorder()) {
835 /* 835 /*
836 * Mark as guard pages (or page), that will allow to 836 * Mark as guard pages (or page), that will allow to
837 * merge back to allocator when buddy will be freed. 837 * merge back to allocator when buddy will be freed.
838 * Corresponding page table entries will not be touched, 838 * Corresponding page table entries will not be touched,
839 * pages will stay not present in virtual address space 839 * pages will stay not present in virtual address space
840 */ 840 */
841 INIT_LIST_HEAD(&page[size].lru); 841 INIT_LIST_HEAD(&page[size].lru);
842 set_page_guard_flag(&page[size]); 842 set_page_guard_flag(&page[size]);
843 set_page_private(&page[size], high); 843 set_page_private(&page[size], high);
844 /* Guard pages are not available for any usage */ 844 /* Guard pages are not available for any usage */
845 __mod_zone_freepage_state(zone, -(1 << high), 845 __mod_zone_freepage_state(zone, -(1 << high),
846 migratetype); 846 migratetype);
847 continue; 847 continue;
848 } 848 }
849 #endif 849 #endif
850 list_add(&page[size].lru, &area->free_list[migratetype]); 850 list_add(&page[size].lru, &area->free_list[migratetype]);
851 area->nr_free++; 851 area->nr_free++;
852 set_page_order(&page[size], high); 852 set_page_order(&page[size], high);
853 } 853 }
854 } 854 }
855 855
856 /* 856 /*
857 * This page is about to be returned from the page allocator 857 * This page is about to be returned from the page allocator
858 */ 858 */
859 static inline int check_new_page(struct page *page) 859 static inline int check_new_page(struct page *page)
860 { 860 {
861 if (unlikely(page_mapcount(page) | 861 if (unlikely(page_mapcount(page) |
862 (page->mapping != NULL) | 862 (page->mapping != NULL) |
863 (atomic_read(&page->_count) != 0) | 863 (atomic_read(&page->_count) != 0) |
864 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 864 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
865 (mem_cgroup_bad_page_check(page)))) { 865 (mem_cgroup_bad_page_check(page)))) {
866 bad_page(page); 866 bad_page(page);
867 return 1; 867 return 1;
868 } 868 }
869 return 0; 869 return 0;
870 } 870 }
871 871
872 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 872 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
873 { 873 {
874 int i; 874 int i;
875 875
876 for (i = 0; i < (1 << order); i++) { 876 for (i = 0; i < (1 << order); i++) {
877 struct page *p = page + i; 877 struct page *p = page + i;
878 if (unlikely(check_new_page(p))) 878 if (unlikely(check_new_page(p)))
879 return 1; 879 return 1;
880 } 880 }
881 881
882 set_page_private(page, 0); 882 set_page_private(page, 0);
883 set_page_refcounted(page); 883 set_page_refcounted(page);
884 884
885 arch_alloc_page(page, order); 885 arch_alloc_page(page, order);
886 kernel_map_pages(page, 1 << order, 1); 886 kernel_map_pages(page, 1 << order, 1);
887 887
888 if (gfp_flags & __GFP_ZERO) 888 if (gfp_flags & __GFP_ZERO)
889 prep_zero_page(page, order, gfp_flags); 889 prep_zero_page(page, order, gfp_flags);
890 890
891 if (order && (gfp_flags & __GFP_COMP)) 891 if (order && (gfp_flags & __GFP_COMP))
892 prep_compound_page(page, order); 892 prep_compound_page(page, order);
893 893
894 return 0; 894 return 0;
895 } 895 }
896 896
897 /* 897 /*
898 * Go through the free lists for the given migratetype and remove 898 * Go through the free lists for the given migratetype and remove
899 * the smallest available page from the freelists 899 * the smallest available page from the freelists
900 */ 900 */
901 static inline 901 static inline
902 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 902 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
903 int migratetype) 903 int migratetype)
904 { 904 {
905 unsigned int current_order; 905 unsigned int current_order;
906 struct free_area *area; 906 struct free_area *area;
907 struct page *page; 907 struct page *page;
908 908
909 /* Find a page of the appropriate size in the preferred list */ 909 /* Find a page of the appropriate size in the preferred list */
910 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 910 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
911 area = &(zone->free_area[current_order]); 911 area = &(zone->free_area[current_order]);
912 if (list_empty(&area->free_list[migratetype])) 912 if (list_empty(&area->free_list[migratetype]))
913 continue; 913 continue;
914 914
915 page = list_entry(area->free_list[migratetype].next, 915 page = list_entry(area->free_list[migratetype].next,
916 struct page, lru); 916 struct page, lru);
917 list_del(&page->lru); 917 list_del(&page->lru);
918 rmv_page_order(page); 918 rmv_page_order(page);
919 area->nr_free--; 919 area->nr_free--;
920 expand(zone, page, order, current_order, area, migratetype); 920 expand(zone, page, order, current_order, area, migratetype);
921 set_freepage_migratetype(page, migratetype); 921 set_freepage_migratetype(page, migratetype);
922 return page; 922 return page;
923 } 923 }
924 924
925 return NULL; 925 return NULL;
926 } 926 }
927 927
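__rmqueue_smallest() scans orders upward from the request and lets expand() return the unused halves of a larger block to the free lists. A minimal userspace sketch of that search and of the split accounting, with nr_free[] as a hypothetical stand-in for zone->free_area[].nr_free and TOY_MAX_ORDER assuming the default MAX_ORDER of 11:

#include <stdio.h>

#define TOY_MAX_ORDER 11

static unsigned long nr_free[TOY_MAX_ORDER];

static int toy_rmqueue_smallest(unsigned int order)
{
    unsigned int current_order;

    for (current_order = order; current_order < TOY_MAX_ORDER; current_order++) {
        if (!nr_free[current_order])
            continue;

        /* Take one block of current_order... */
        nr_free[current_order]--;

        /* ...and, as expand() does, hand the unused halves back to the
         * free lists one order at a time. */
        while (current_order > order) {
            current_order--;
            nr_free[current_order]++;
        }
        return (int)order;      /* success: block of 2^order pages */
    }
    return -1;                  /* nothing large enough was free */
}

int main(void)
{
    nr_free[5] = 1;             /* only a single order-5 block is free */

    if (toy_rmqueue_smallest(2) == 2)
        printf("order-2 satisfied; leftovers: o2=%lu o3=%lu o4=%lu\n",
               nr_free[2], nr_free[3], nr_free[4]);   /* 1 1 1 */
    return 0;
}
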
928 928
929 /* 929 /*
930 * This array describes the order lists are fallen back to when 930 * This array describes the order lists are fallen back to when
931 * the free lists for the desirable migrate type are depleted 931 * the free lists for the desirable migrate type are depleted
932 */ 932 */
933 static int fallbacks[MIGRATE_TYPES][4] = { 933 static int fallbacks[MIGRATE_TYPES][4] = {
934 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 934 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
935 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 935 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
936 #ifdef CONFIG_CMA 936 #ifdef CONFIG_CMA
937 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 937 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
938 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 938 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
939 #else 939 #else
940 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 940 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
941 #endif 941 #endif
942 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 942 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
943 #ifdef CONFIG_MEMORY_ISOLATION 943 #ifdef CONFIG_MEMORY_ISOLATION
944 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 944 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
945 #endif 945 #endif
946 }; 946 };
947 947
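The table reads row by row: for a given starting migratetype, __rmqueue_fallback() tries the listed types in order and treats MIGRATE_RESERVE as a stop marker. A small sketch of that walk for an unmovable request, using invented TOY_* names and the non-CMA ordering above:

#include <stdio.h>

enum { TOY_UNMOVABLE, TOY_RECLAIMABLE, TOY_MOVABLE, TOY_RESERVE, TOY_TYPES };

static const int toy_fallbacks[TOY_TYPES][4] = {
    [TOY_UNMOVABLE]   = { TOY_RECLAIMABLE, TOY_MOVABLE,   TOY_RESERVE },
    [TOY_RECLAIMABLE] = { TOY_UNMOVABLE,   TOY_MOVABLE,   TOY_RESERVE },
    [TOY_MOVABLE]     = { TOY_RECLAIMABLE, TOY_UNMOVABLE, TOY_RESERVE },
    [TOY_RESERVE]     = { TOY_RESERVE },        /* never used */
};

static const char *name[] = { "UNMOVABLE", "RECLAIMABLE", "MOVABLE", "RESERVE" };

int main(void)
{
    int start = TOY_UNMOVABLE, i;

    /* RESERVE terminates the walk; it is handled separately later. */
    for (i = 0; toy_fallbacks[start][i] != TOY_RESERVE; i++)
        printf("fallback %d for %s: %s\n", i, name[start],
               name[toy_fallbacks[start][i]]);
    return 0;
}
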
948 /* 948 /*
949 * Move the free pages in a range to the free lists of the requested type. 949 * Move the free pages in a range to the free lists of the requested type.
950 * Note that start_page and end_page are not aligned on a pageblock 950 * Note that start_page and end_page are not aligned on a pageblock
951 * boundary. If alignment is required, use move_freepages_block() 951 * boundary. If alignment is required, use move_freepages_block()
952 */ 952 */
953 int move_freepages(struct zone *zone, 953 int move_freepages(struct zone *zone,
954 struct page *start_page, struct page *end_page, 954 struct page *start_page, struct page *end_page,
955 int migratetype) 955 int migratetype)
956 { 956 {
957 struct page *page; 957 struct page *page;
958 unsigned long order; 958 unsigned long order;
959 int pages_moved = 0; 959 int pages_moved = 0;
960 960
961 #ifndef CONFIG_HOLES_IN_ZONE 961 #ifndef CONFIG_HOLES_IN_ZONE
962 /* 962 /*
963 * page_zone is not safe to call in this context when 963 * page_zone is not safe to call in this context when
964 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 964 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
965 * anyway as we check zone boundaries in move_freepages_block(). 965 * anyway as we check zone boundaries in move_freepages_block().
966 * Remove at a later date when no bug reports exist related to 966 * Remove at a later date when no bug reports exist related to
967 * grouping pages by mobility 967 * grouping pages by mobility
968 */ 968 */
969 BUG_ON(page_zone(start_page) != page_zone(end_page)); 969 BUG_ON(page_zone(start_page) != page_zone(end_page));
970 #endif 970 #endif
971 971
972 for (page = start_page; page <= end_page;) { 972 for (page = start_page; page <= end_page;) {
973 /* Make sure we are not inadvertently changing nodes */ 973 /* Make sure we are not inadvertently changing nodes */
974 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 974 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
975 975
976 if (!pfn_valid_within(page_to_pfn(page))) { 976 if (!pfn_valid_within(page_to_pfn(page))) {
977 page++; 977 page++;
978 continue; 978 continue;
979 } 979 }
980 980
981 if (!PageBuddy(page)) { 981 if (!PageBuddy(page)) {
982 page++; 982 page++;
983 continue; 983 continue;
984 } 984 }
985 985
986 order = page_order(page); 986 order = page_order(page);
987 list_move(&page->lru, 987 list_move(&page->lru,
988 &zone->free_area[order].free_list[migratetype]); 988 &zone->free_area[order].free_list[migratetype]);
989 set_freepage_migratetype(page, migratetype); 989 set_freepage_migratetype(page, migratetype);
990 page += 1 << order; 990 page += 1 << order;
991 pages_moved += 1 << order; 991 pages_moved += 1 << order;
992 } 992 }
993 993
994 return pages_moved; 994 return pages_moved;
995 } 995 }
996 996
997 int move_freepages_block(struct zone *zone, struct page *page, 997 int move_freepages_block(struct zone *zone, struct page *page,
998 int migratetype) 998 int migratetype)
999 { 999 {
1000 unsigned long start_pfn, end_pfn; 1000 unsigned long start_pfn, end_pfn;
1001 struct page *start_page, *end_page; 1001 struct page *start_page, *end_page;
1002 1002
1003 start_pfn = page_to_pfn(page); 1003 start_pfn = page_to_pfn(page);
1004 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1004 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1005 start_page = pfn_to_page(start_pfn); 1005 start_page = pfn_to_page(start_pfn);
1006 end_page = start_page + pageblock_nr_pages - 1; 1006 end_page = start_page + pageblock_nr_pages - 1;
1007 end_pfn = start_pfn + pageblock_nr_pages - 1; 1007 end_pfn = start_pfn + pageblock_nr_pages - 1;
1008 1008
1009 /* Do not cross zone boundaries */ 1009 /* Do not cross zone boundaries */
1010 if (!zone_spans_pfn(zone, start_pfn)) 1010 if (!zone_spans_pfn(zone, start_pfn))
1011 start_page = page; 1011 start_page = page;
1012 if (!zone_spans_pfn(zone, end_pfn)) 1012 if (!zone_spans_pfn(zone, end_pfn))
1013 return 0; 1013 return 0;
1014 1014
1015 return move_freepages(zone, start_page, end_page, migratetype); 1015 return move_freepages(zone, start_page, end_page, migratetype);
1016 } 1016 }
1017 1017
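The start_pfn computation above is ordinary power-of-two masking: clearing the low bits of a pfn aligns it down to the start of its pageblock, which then spans pageblock_nr_pages frames. A worked example, assuming a pageblock of 512 pages (the 2MB-with-4KB-pages case):

#include <stdio.h>

#define TOY_PAGEBLOCK_NR_PAGES 512UL    /* assumed pageblock_nr_pages */

int main(void)
{
    unsigned long pfn = 1000000UL + 137;    /* arbitrary pfn inside a block */
    unsigned long start_pfn = pfn & ~(TOY_PAGEBLOCK_NR_PAGES - 1);
    unsigned long end_pfn = start_pfn + TOY_PAGEBLOCK_NR_PAGES - 1;

    printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
    return 0;
}
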
1018 static void change_pageblock_range(struct page *pageblock_page, 1018 static void change_pageblock_range(struct page *pageblock_page,
1019 int start_order, int migratetype) 1019 int start_order, int migratetype)
1020 { 1020 {
1021 int nr_pageblocks = 1 << (start_order - pageblock_order); 1021 int nr_pageblocks = 1 << (start_order - pageblock_order);
1022 1022
1023 while (nr_pageblocks--) { 1023 while (nr_pageblocks--) {
1024 set_pageblock_migratetype(pageblock_page, migratetype); 1024 set_pageblock_migratetype(pageblock_page, migratetype);
1025 pageblock_page += pageblock_nr_pages; 1025 pageblock_page += pageblock_nr_pages;
1026 } 1026 }
1027 } 1027 }
1028 1028
1029 /* 1029 /*
1030 * If breaking a large block of pages, move all free pages to the preferred 1030 * If breaking a large block of pages, move all free pages to the preferred
1031 * allocation list. If falling back for a reclaimable kernel allocation, be 1031 * allocation list. If falling back for a reclaimable kernel allocation, be
1032 * more aggressive about taking ownership of free pages. 1032 * more aggressive about taking ownership of free pages.
1033 * 1033 *
1034 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1034 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1035 * nor move CMA pages to different free lists. We don't want unmovable pages 1035 * nor move CMA pages to different free lists. We don't want unmovable pages
1036 * to be allocated from MIGRATE_CMA areas. 1036 * to be allocated from MIGRATE_CMA areas.
1037 * 1037 *
1038 * Returns the new migratetype of the pageblock (or the same old migratetype 1038 * Returns the new migratetype of the pageblock (or the same old migratetype
1039 * if it was unchanged). 1039 * if it was unchanged).
1040 */ 1040 */
1041 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1041 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1042 int start_type, int fallback_type) 1042 int start_type, int fallback_type)
1043 { 1043 {
1044 int current_order = page_order(page); 1044 int current_order = page_order(page);
1045 1045
1046 /* 1046 /*
1047 * When borrowing from MIGRATE_CMA, we need to release the excess 1047 * When borrowing from MIGRATE_CMA, we need to release the excess
1048 * buddy pages to CMA itself. We also ensure the freepage_migratetype 1048 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1049 * is set to CMA so it is returned to the correct freelist in case 1049 * is set to CMA so it is returned to the correct freelist in case
1050 * the page ends up being not actually allocated from the pcp lists. 1050 * the page ends up being not actually allocated from the pcp lists.
1051 */ 1051 */
1052 if (is_migrate_cma(fallback_type)) 1052 if (is_migrate_cma(fallback_type))
1053 return fallback_type; 1053 return fallback_type;
1054 1054
1055 /* Take ownership for orders >= pageblock_order */ 1055 /* Take ownership for orders >= pageblock_order */
1056 if (current_order >= pageblock_order) { 1056 if (current_order >= pageblock_order) {
1057 change_pageblock_range(page, current_order, start_type); 1057 change_pageblock_range(page, current_order, start_type);
1058 return start_type; 1058 return start_type;
1059 } 1059 }
1060 1060
1061 if (current_order >= pageblock_order / 2 || 1061 if (current_order >= pageblock_order / 2 ||
1062 start_type == MIGRATE_RECLAIMABLE || 1062 start_type == MIGRATE_RECLAIMABLE ||
1063 page_group_by_mobility_disabled) { 1063 page_group_by_mobility_disabled) {
1064 int pages; 1064 int pages;
1065 1065
1066 pages = move_freepages_block(zone, page, start_type); 1066 pages = move_freepages_block(zone, page, start_type);
1067 1067
1068 /* Claim the whole block if over half of it is free */ 1068 /* Claim the whole block if over half of it is free */
1069 if (pages >= (1 << (pageblock_order-1)) || 1069 if (pages >= (1 << (pageblock_order-1)) ||
1070 page_group_by_mobility_disabled) { 1070 page_group_by_mobility_disabled) {
1071 1071
1072 set_pageblock_migratetype(page, start_type); 1072 set_pageblock_migratetype(page, start_type);
1073 return start_type; 1073 return start_type;
1074 } 1074 }
1075 1075
1076 } 1076 }
1077 1077
1078 return fallback_type; 1078 return fallback_type;
1079 } 1079 }
1080 1080
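Stripped of the CMA and page_group_by_mobility_disabled handling, the decision above boils down to: whole-pageblock buddies always change type, and for buddies of at least half a pageblock (or any reclaimable request) the block is claimed once more than half of its pages have been moved. A compressed sketch of that rule, with the number of moved pages passed in rather than computed, a pageblock order of 9 assumed, and TOY_* names invented for the example:

#include <stdio.h>

#define TOY_PAGEBLOCK_ORDER 9

enum { TOY_UNMOVABLE, TOY_RECLAIMABLE, TOY_MOVABLE };

static int toy_steal_decision(int current_order, int start_type,
                              int fallback_type, int pages_moved)
{
    /* Buddies covering a whole pageblock (or more) always change type. */
    if (current_order >= TOY_PAGEBLOCK_ORDER)
        return start_type;

    if (current_order >= TOY_PAGEBLOCK_ORDER / 2 ||
        start_type == TOY_RECLAIMABLE) {
        /* Claim the block if over half of it was free and got moved. */
        if (pages_moved >= (1 << (TOY_PAGEBLOCK_ORDER - 1)))
            return start_type;
    }
    return fallback_type;
}

int main(void)
{
    /* order-5 buddy, 300 of 512 pages moved: block is claimed (prints 0). */
    printf("%d\n", toy_steal_decision(5, TOY_UNMOVABLE, TOY_MOVABLE, 300));
    /* order-3 buddy: too small to even attempt the move (prints 2). */
    printf("%d\n", toy_steal_decision(3, TOY_UNMOVABLE, TOY_MOVABLE, 0));
    return 0;
}
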
1081 /* Remove an element from the buddy allocator from the fallback list */ 1081 /* Remove an element from the buddy allocator from the fallback list */
1082 static inline struct page * 1082 static inline struct page *
1083 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1083 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1084 { 1084 {
1085 struct free_area *area; 1085 struct free_area *area;
1086 int current_order; 1086 int current_order;
1087 struct page *page; 1087 struct page *page;
1088 int migratetype, new_type, i; 1088 int migratetype, new_type, i;
1089 1089
1090 /* Find the largest possible block of pages in the other list */ 1090 /* Find the largest possible block of pages in the other list */
1091 for (current_order = MAX_ORDER-1; current_order >= order; 1091 for (current_order = MAX_ORDER-1; current_order >= order;
1092 --current_order) { 1092 --current_order) {
1093 for (i = 0;; i++) { 1093 for (i = 0;; i++) {
1094 migratetype = fallbacks[start_migratetype][i]; 1094 migratetype = fallbacks[start_migratetype][i];
1095 1095
1096 /* MIGRATE_RESERVE handled later if necessary */ 1096 /* MIGRATE_RESERVE handled later if necessary */
1097 if (migratetype == MIGRATE_RESERVE) 1097 if (migratetype == MIGRATE_RESERVE)
1098 break; 1098 break;
1099 1099
1100 area = &(zone->free_area[current_order]); 1100 area = &(zone->free_area[current_order]);
1101 if (list_empty(&area->free_list[migratetype])) 1101 if (list_empty(&area->free_list[migratetype]))
1102 continue; 1102 continue;
1103 1103
1104 page = list_entry(area->free_list[migratetype].next, 1104 page = list_entry(area->free_list[migratetype].next,
1105 struct page, lru); 1105 struct page, lru);
1106 area->nr_free--; 1106 area->nr_free--;
1107 1107
1108 new_type = try_to_steal_freepages(zone, page, 1108 new_type = try_to_steal_freepages(zone, page,
1109 start_migratetype, 1109 start_migratetype,
1110 migratetype); 1110 migratetype);
1111 1111
1112 /* Remove the page from the freelists */ 1112 /* Remove the page from the freelists */
1113 list_del(&page->lru); 1113 list_del(&page->lru);
1114 rmv_page_order(page); 1114 rmv_page_order(page);
1115 1115
1116 expand(zone, page, order, current_order, area, 1116 expand(zone, page, order, current_order, area,
1117 new_type); 1117 new_type);
1118 /* The freepage_migratetype may differ from pageblock's 1118 /* The freepage_migratetype may differ from pageblock's
1119 * migratetype depending on the decisions in 1119 * migratetype depending on the decisions in
1120 * try_to_steal_freepages. This is OK as long as it does 1120 * try_to_steal_freepages. This is OK as long as it does
1121 * not differ for MIGRATE_CMA type. 1121 * not differ for MIGRATE_CMA type.
1122 */ 1122 */
1123 set_freepage_migratetype(page, new_type); 1123 set_freepage_migratetype(page, new_type);
1124 1124
1125 trace_mm_page_alloc_extfrag(page, order, current_order, 1125 trace_mm_page_alloc_extfrag(page, order, current_order,
1126 start_migratetype, migratetype, new_type); 1126 start_migratetype, migratetype, new_type);
1127 1127
1128 return page; 1128 return page;
1129 } 1129 }
1130 } 1130 }
1131 1131
1132 return NULL; 1132 return NULL;
1133 } 1133 }
1134 1134
1135 /* 1135 /*
1136 * Do the hard work of removing an element from the buddy allocator. 1136 * Do the hard work of removing an element from the buddy allocator.
1137 * Call me with the zone->lock already held. 1137 * Call me with the zone->lock already held.
1138 */ 1138 */
1139 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1139 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1140 int migratetype) 1140 int migratetype)
1141 { 1141 {
1142 struct page *page; 1142 struct page *page;
1143 1143
1144 retry_reserve: 1144 retry_reserve:
1145 page = __rmqueue_smallest(zone, order, migratetype); 1145 page = __rmqueue_smallest(zone, order, migratetype);
1146 1146
1147 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1147 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1148 page = __rmqueue_fallback(zone, order, migratetype); 1148 page = __rmqueue_fallback(zone, order, migratetype);
1149 1149
1150 /* 1150 /*
1151 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1151 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1152 * is used because __rmqueue_smallest is an inline function 1152 * is used because __rmqueue_smallest is an inline function
1153 * and we want just one call site 1153 * and we want just one call site
1154 */ 1154 */
1155 if (!page) { 1155 if (!page) {
1156 migratetype = MIGRATE_RESERVE; 1156 migratetype = MIGRATE_RESERVE;
1157 goto retry_reserve; 1157 goto retry_reserve;
1158 } 1158 }
1159 } 1159 }
1160 1160
1161 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1161 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1162 return page; 1162 return page;
1163 } 1163 }
1164 1164
1165 /* 1165 /*
1166 * Obtain a specified number of elements from the buddy allocator, all under 1166 * Obtain a specified number of elements from the buddy allocator, all under
1167 * a single hold of the lock, for efficiency. Add them to the supplied list. 1167 * a single hold of the lock, for efficiency. Add them to the supplied list.
1168 * Returns the number of new pages which were placed at *list. 1168 * Returns the number of new pages which were placed at *list.
1169 */ 1169 */
1170 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1170 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1171 unsigned long count, struct list_head *list, 1171 unsigned long count, struct list_head *list,
1172 int migratetype, int cold) 1172 int migratetype, int cold)
1173 { 1173 {
1174 int i; 1174 int i;
1175 1175
1176 spin_lock(&zone->lock); 1176 spin_lock(&zone->lock);
1177 for (i = 0; i < count; ++i) { 1177 for (i = 0; i < count; ++i) {
1178 struct page *page = __rmqueue(zone, order, migratetype); 1178 struct page *page = __rmqueue(zone, order, migratetype);
1179 if (unlikely(page == NULL)) 1179 if (unlikely(page == NULL))
1180 break; 1180 break;
1181 1181
1182 /* 1182 /*
1183 * Split buddy pages returned by expand() are received here 1183 * Split buddy pages returned by expand() are received here
1184 * in physical page order. The page is added to the caller's 1184 * in physical page order. The page is added to the caller's
1185 * list and the list head then moves forward. From the caller's 1185 * list and the list head then moves forward. From the caller's
1186 * perspective, the linked list is ordered by page number in 1186 * perspective, the linked list is ordered by page number in
1187 * some conditions. This is useful for IO devices that can 1187 * some conditions. This is useful for IO devices that can
1188 * merge IO requests if the physical pages are ordered 1188 * merge IO requests if the physical pages are ordered
1189 * properly. 1189 * properly.
1190 */ 1190 */
1191 if (likely(cold == 0)) 1191 if (likely(cold == 0))
1192 list_add(&page->lru, list); 1192 list_add(&page->lru, list);
1193 else 1193 else
1194 list_add_tail(&page->lru, list); 1194 list_add_tail(&page->lru, list);
1195 list = &page->lru; 1195 list = &page->lru;
1196 if (is_migrate_cma(get_freepage_migratetype(page))) 1196 if (is_migrate_cma(get_freepage_migratetype(page)))
1197 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1197 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1198 -(1 << order)); 1198 -(1 << order));
1199 } 1199 }
1200 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1200 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1201 spin_unlock(&zone->lock); 1201 spin_unlock(&zone->lock);
1202 return i; 1202 return i;
1203 } 1203 }
1204 1204
1205 #ifdef CONFIG_NUMA 1205 #ifdef CONFIG_NUMA
1206 /* 1206 /*
1207 * Called from the vmstat counter updater to drain pagesets of this 1207 * Called from the vmstat counter updater to drain pagesets of this
1208 * currently executing processor on remote nodes after they have 1208 * currently executing processor on remote nodes after they have
1209 * expired. 1209 * expired.
1210 * 1210 *
1211 * Note that this function must be called with the thread pinned to 1211 * Note that this function must be called with the thread pinned to
1212 * a single processor. 1212 * a single processor.
1213 */ 1213 */
1214 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1214 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1215 { 1215 {
1216 unsigned long flags; 1216 unsigned long flags;
1217 int to_drain; 1217 int to_drain;
1218 unsigned long batch; 1218 unsigned long batch;
1219 1219
1220 local_irq_save(flags); 1220 local_irq_save(flags);
1221 batch = ACCESS_ONCE(pcp->batch); 1221 batch = ACCESS_ONCE(pcp->batch);
1222 if (pcp->count >= batch) 1222 if (pcp->count >= batch)
1223 to_drain = batch; 1223 to_drain = batch;
1224 else 1224 else
1225 to_drain = pcp->count; 1225 to_drain = pcp->count;
1226 if (to_drain > 0) { 1226 if (to_drain > 0) {
1227 free_pcppages_bulk(zone, to_drain, pcp); 1227 free_pcppages_bulk(zone, to_drain, pcp);
1228 pcp->count -= to_drain; 1228 pcp->count -= to_drain;
1229 } 1229 }
1230 local_irq_restore(flags); 1230 local_irq_restore(flags);
1231 } 1231 }
1232 #endif 1232 #endif
1233 1233
1234 /* 1234 /*
1235 * Drain pages of the indicated processor. 1235 * Drain pages of the indicated processor.
1236 * 1236 *
1237 * The processor must either be the current processor and the 1237 * The processor must either be the current processor and the
1238 * thread pinned to the current processor or a processor that 1238 * thread pinned to the current processor or a processor that
1239 * is not online. 1239 * is not online.
1240 */ 1240 */
1241 static void drain_pages(unsigned int cpu) 1241 static void drain_pages(unsigned int cpu)
1242 { 1242 {
1243 unsigned long flags; 1243 unsigned long flags;
1244 struct zone *zone; 1244 struct zone *zone;
1245 1245
1246 for_each_populated_zone(zone) { 1246 for_each_populated_zone(zone) {
1247 struct per_cpu_pageset *pset; 1247 struct per_cpu_pageset *pset;
1248 struct per_cpu_pages *pcp; 1248 struct per_cpu_pages *pcp;
1249 1249
1250 local_irq_save(flags); 1250 local_irq_save(flags);
1251 pset = per_cpu_ptr(zone->pageset, cpu); 1251 pset = per_cpu_ptr(zone->pageset, cpu);
1252 1252
1253 pcp = &pset->pcp; 1253 pcp = &pset->pcp;
1254 if (pcp->count) { 1254 if (pcp->count) {
1255 free_pcppages_bulk(zone, pcp->count, pcp); 1255 free_pcppages_bulk(zone, pcp->count, pcp);
1256 pcp->count = 0; 1256 pcp->count = 0;
1257 } 1257 }
1258 local_irq_restore(flags); 1258 local_irq_restore(flags);
1259 } 1259 }
1260 } 1260 }
1261 1261
1262 /* 1262 /*
1263 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1263 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1264 */ 1264 */
1265 void drain_local_pages(void *arg) 1265 void drain_local_pages(void *arg)
1266 { 1266 {
1267 drain_pages(smp_processor_id()); 1267 drain_pages(smp_processor_id());
1268 } 1268 }
1269 1269
1270 /* 1270 /*
1271 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1271 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1272 * 1272 *
1273 * Note that this code is protected against sending an IPI to an offline 1273 * Note that this code is protected against sending an IPI to an offline
1274 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1274 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1275 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1275 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1276 * nothing keeps CPUs from showing up after we populated the cpumask and 1276 * nothing keeps CPUs from showing up after we populated the cpumask and
1277 * before the call to on_each_cpu_mask(). 1277 * before the call to on_each_cpu_mask().
1278 */ 1278 */
1279 void drain_all_pages(void) 1279 void drain_all_pages(void)
1280 { 1280 {
1281 int cpu; 1281 int cpu;
1282 struct per_cpu_pageset *pcp; 1282 struct per_cpu_pageset *pcp;
1283 struct zone *zone; 1283 struct zone *zone;
1284 1284
1285 /* 1285 /*
1286 * Allocate in the BSS so we won't require allocation in 1286 * Allocate in the BSS so we won't require allocation in
1287 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1287 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1288 */ 1288 */
1289 static cpumask_t cpus_with_pcps; 1289 static cpumask_t cpus_with_pcps;
1290 1290
1291 /* 1291 /*
1292 * We don't care about racing with CPU hotplug event 1292 * We don't care about racing with CPU hotplug event
1293 * as offline notification will cause the notified 1293 * as offline notification will cause the notified
1294 * cpu to drain that CPU pcps and on_each_cpu_mask 1294 * cpu to drain that CPU pcps and on_each_cpu_mask
1295 * disables preemption as part of its processing 1295 * disables preemption as part of its processing
1296 */ 1296 */
1297 for_each_online_cpu(cpu) { 1297 for_each_online_cpu(cpu) {
1298 bool has_pcps = false; 1298 bool has_pcps = false;
1299 for_each_populated_zone(zone) { 1299 for_each_populated_zone(zone) {
1300 pcp = per_cpu_ptr(zone->pageset, cpu); 1300 pcp = per_cpu_ptr(zone->pageset, cpu);
1301 if (pcp->pcp.count) { 1301 if (pcp->pcp.count) {
1302 has_pcps = true; 1302 has_pcps = true;
1303 break; 1303 break;
1304 } 1304 }
1305 } 1305 }
1306 if (has_pcps) 1306 if (has_pcps)
1307 cpumask_set_cpu(cpu, &cpus_with_pcps); 1307 cpumask_set_cpu(cpu, &cpus_with_pcps);
1308 else 1308 else
1309 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1309 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1310 } 1310 }
1311 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1311 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1312 } 1312 }
1313 1313
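The point of the cpumask built above is that only CPUs which actually hold per-cpu pages receive the drain IPI; CPUs with empty pcp lists are never interrupted. A toy rendering of that selection with made-up per-cpu counts:

#include <stdio.h>

#define TOY_NR_CPUS 4

int main(void)
{
    int pcp_count[TOY_NR_CPUS] = { 0, 12, 0, 3 };   /* invented counts */
    unsigned long cpus_with_pcps = 0;
    int cpu;

    for (cpu = 0; cpu < TOY_NR_CPUS; cpu++)
        if (pcp_count[cpu])
            cpus_with_pcps |= 1UL << cpu;

    printf("would IPI cpumask 0x%lx\n", cpus_with_pcps);    /* 0xa */
    return 0;
}
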
1314 #ifdef CONFIG_HIBERNATION 1314 #ifdef CONFIG_HIBERNATION
1315 1315
1316 void mark_free_pages(struct zone *zone) 1316 void mark_free_pages(struct zone *zone)
1317 { 1317 {
1318 unsigned long pfn, max_zone_pfn; 1318 unsigned long pfn, max_zone_pfn;
1319 unsigned long flags; 1319 unsigned long flags;
1320 int order, t; 1320 int order, t;
1321 struct list_head *curr; 1321 struct list_head *curr;
1322 1322
1323 if (zone_is_empty(zone)) 1323 if (zone_is_empty(zone))
1324 return; 1324 return;
1325 1325
1326 spin_lock_irqsave(&zone->lock, flags); 1326 spin_lock_irqsave(&zone->lock, flags);
1327 1327
1328 max_zone_pfn = zone_end_pfn(zone); 1328 max_zone_pfn = zone_end_pfn(zone);
1329 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1329 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1330 if (pfn_valid(pfn)) { 1330 if (pfn_valid(pfn)) {
1331 struct page *page = pfn_to_page(pfn); 1331 struct page *page = pfn_to_page(pfn);
1332 1332
1333 if (!swsusp_page_is_forbidden(page)) 1333 if (!swsusp_page_is_forbidden(page))
1334 swsusp_unset_page_free(page); 1334 swsusp_unset_page_free(page);
1335 } 1335 }
1336 1336
1337 for_each_migratetype_order(order, t) { 1337 for_each_migratetype_order(order, t) {
1338 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1338 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1339 unsigned long i; 1339 unsigned long i;
1340 1340
1341 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1341 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1342 for (i = 0; i < (1UL << order); i++) 1342 for (i = 0; i < (1UL << order); i++)
1343 swsusp_set_page_free(pfn_to_page(pfn + i)); 1343 swsusp_set_page_free(pfn_to_page(pfn + i));
1344 } 1344 }
1345 } 1345 }
1346 spin_unlock_irqrestore(&zone->lock, flags); 1346 spin_unlock_irqrestore(&zone->lock, flags);
1347 } 1347 }
1348 #endif /* CONFIG_PM */ 1348 #endif /* CONFIG_PM */
1349 1349
1350 /* 1350 /*
1351 * Free a 0-order page 1351 * Free a 0-order page
1352 * cold == 1 ? free a cold page : free a hot page 1352 * cold == 1 ? free a cold page : free a hot page
1353 */ 1353 */
1354 void free_hot_cold_page(struct page *page, int cold) 1354 void free_hot_cold_page(struct page *page, int cold)
1355 { 1355 {
1356 struct zone *zone = page_zone(page); 1356 struct zone *zone = page_zone(page);
1357 struct per_cpu_pages *pcp; 1357 struct per_cpu_pages *pcp;
1358 unsigned long flags; 1358 unsigned long flags;
1359 int migratetype; 1359 int migratetype;
1360 1360
1361 if (!free_pages_prepare(page, 0)) 1361 if (!free_pages_prepare(page, 0))
1362 return; 1362 return;
1363 1363
1364 migratetype = get_pageblock_migratetype(page); 1364 migratetype = get_pageblock_migratetype(page);
1365 set_freepage_migratetype(page, migratetype); 1365 set_freepage_migratetype(page, migratetype);
1366 local_irq_save(flags); 1366 local_irq_save(flags);
1367 __count_vm_event(PGFREE); 1367 __count_vm_event(PGFREE);
1368 1368
1369 /* 1369 /*
1370 * We only track unmovable, reclaimable and movable on pcp lists. 1370 * We only track unmovable, reclaimable and movable on pcp lists.
1371 * Free ISOLATE pages back to the allocator because they are being 1371 * Free ISOLATE pages back to the allocator because they are being
1372 * offlined but treat RESERVE as movable pages so we can get those 1372 * offlined but treat RESERVE as movable pages so we can get those
1373 * areas back if necessary. Otherwise, we may have to free 1373 * areas back if necessary. Otherwise, we may have to free
1374 * excessively into the page allocator 1374 * excessively into the page allocator
1375 */ 1375 */
1376 if (migratetype >= MIGRATE_PCPTYPES) { 1376 if (migratetype >= MIGRATE_PCPTYPES) {
1377 if (unlikely(is_migrate_isolate(migratetype))) { 1377 if (unlikely(is_migrate_isolate(migratetype))) {
1378 free_one_page(zone, page, 0, migratetype); 1378 free_one_page(zone, page, 0, migratetype);
1379 goto out; 1379 goto out;
1380 } 1380 }
1381 migratetype = MIGRATE_MOVABLE; 1381 migratetype = MIGRATE_MOVABLE;
1382 } 1382 }
1383 1383
1384 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1384 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1385 if (cold) 1385 if (cold)
1386 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1386 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1387 else 1387 else
1388 list_add(&page->lru, &pcp->lists[migratetype]); 1388 list_add(&page->lru, &pcp->lists[migratetype]);
1389 pcp->count++; 1389 pcp->count++;
1390 if (pcp->count >= pcp->high) { 1390 if (pcp->count >= pcp->high) {
1391 unsigned long batch = ACCESS_ONCE(pcp->batch); 1391 unsigned long batch = ACCESS_ONCE(pcp->batch);
1392 free_pcppages_bulk(zone, batch, pcp); 1392 free_pcppages_bulk(zone, batch, pcp);
1393 pcp->count -= batch; 1393 pcp->count -= batch;
1394 } 1394 }
1395 1395
1396 out: 1396 out:
1397 local_irq_restore(flags); 1397 local_irq_restore(flags);
1398 } 1398 }
1399 1399
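The tail of free_hot_cold_page() is simple hysteresis: freed order-0 pages pile up on the per-cpu list until count reaches pcp->high, at which point one batch is handed back to the buddy lists. A userspace sketch with invented high/batch values:

#include <stdio.h>

struct toy_pcp {
    int count;
    int high;
    int batch;
};

static void toy_free_to_pcp(struct toy_pcp *pcp)
{
    pcp->count++;
    if (pcp->count >= pcp->high) {
        /* free_pcppages_bulk() would hand 'batch' pages back to the
         * buddy allocator here. */
        pcp->count -= pcp->batch;
    }
}

int main(void)
{
    struct toy_pcp pcp = { .count = 0, .high = 6, .batch = 3 };
    int i;

    for (i = 0; i < 10; i++) {
        toy_free_to_pcp(&pcp);
        printf("after free %d: count=%d\n", i + 1, pcp.count);
    }
    return 0;
}
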
1400 /* 1400 /*
1401 * Free a list of 0-order pages 1401 * Free a list of 0-order pages
1402 */ 1402 */
1403 void free_hot_cold_page_list(struct list_head *list, int cold) 1403 void free_hot_cold_page_list(struct list_head *list, int cold)
1404 { 1404 {
1405 struct page *page, *next; 1405 struct page *page, *next;
1406 1406
1407 list_for_each_entry_safe(page, next, list, lru) { 1407 list_for_each_entry_safe(page, next, list, lru) {
1408 trace_mm_page_free_batched(page, cold); 1408 trace_mm_page_free_batched(page, cold);
1409 free_hot_cold_page(page, cold); 1409 free_hot_cold_page(page, cold);
1410 } 1410 }
1411 } 1411 }
1412 1412
1413 /* 1413 /*
1414 * split_page takes a non-compound higher-order page, and splits it into 1414 * split_page takes a non-compound higher-order page, and splits it into
1415 * n (1<<order) sub-pages: page[0..n-1] 1415 * n (1<<order) sub-pages: page[0..n-1]
1416 * Each sub-page must be freed individually. 1416 * Each sub-page must be freed individually.
1417 * 1417 *
1418 * Note: this is probably too low level an operation for use in drivers. 1418 * Note: this is probably too low level an operation for use in drivers.
1419 * Please consult with lkml before using this in your driver. 1419 * Please consult with lkml before using this in your driver.
1420 */ 1420 */
1421 void split_page(struct page *page, unsigned int order) 1421 void split_page(struct page *page, unsigned int order)
1422 { 1422 {
1423 int i; 1423 int i;
1424 1424
1425 VM_BUG_ON(PageCompound(page)); 1425 VM_BUG_ON(PageCompound(page));
1426 VM_BUG_ON(!page_count(page)); 1426 VM_BUG_ON(!page_count(page));
1427 1427
1428 #ifdef CONFIG_KMEMCHECK 1428 #ifdef CONFIG_KMEMCHECK
1429 /* 1429 /*
1430 * Split shadow pages too, because free(page[0]) would 1430 * Split shadow pages too, because free(page[0]) would
1431 * otherwise free the whole shadow. 1431 * otherwise free the whole shadow.
1432 */ 1432 */
1433 if (kmemcheck_page_is_tracked(page)) 1433 if (kmemcheck_page_is_tracked(page))
1434 split_page(virt_to_page(page[0].shadow), order); 1434 split_page(virt_to_page(page[0].shadow), order);
1435 #endif 1435 #endif
1436 1436
1437 for (i = 1; i < (1 << order); i++) 1437 for (i = 1; i < (1 << order); i++)
1438 set_page_refcounted(page + i); 1438 set_page_refcounted(page + i);
1439 } 1439 }
1440 EXPORT_SYMBOL_GPL(split_page); 1440 EXPORT_SYMBOL_GPL(split_page);
1441 1441
1442 static int __isolate_free_page(struct page *page, unsigned int order) 1442 static int __isolate_free_page(struct page *page, unsigned int order)
1443 { 1443 {
1444 unsigned long watermark; 1444 unsigned long watermark;
1445 struct zone *zone; 1445 struct zone *zone;
1446 int mt; 1446 int mt;
1447 1447
1448 BUG_ON(!PageBuddy(page)); 1448 BUG_ON(!PageBuddy(page));
1449 1449
1450 zone = page_zone(page); 1450 zone = page_zone(page);
1451 mt = get_pageblock_migratetype(page); 1451 mt = get_pageblock_migratetype(page);
1452 1452
1453 if (!is_migrate_isolate(mt)) { 1453 if (!is_migrate_isolate(mt)) {
1454 /* Obey watermarks as if the page was being allocated */ 1454 /* Obey watermarks as if the page was being allocated */
1455 watermark = low_wmark_pages(zone) + (1 << order); 1455 watermark = low_wmark_pages(zone) + (1 << order);
1456 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1456 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1457 return 0; 1457 return 0;
1458 1458
1459 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1459 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1460 } 1460 }
1461 1461
1462 /* Remove page from free list */ 1462 /* Remove page from free list */
1463 list_del(&page->lru); 1463 list_del(&page->lru);
1464 zone->free_area[order].nr_free--; 1464 zone->free_area[order].nr_free--;
1465 rmv_page_order(page); 1465 rmv_page_order(page);
1466 1466
1467 /* Set the pageblock if the isolated page is at least a pageblock */ 1467 /* Set the pageblock if the isolated page is at least a pageblock */
1468 if (order >= pageblock_order - 1) { 1468 if (order >= pageblock_order - 1) {
1469 struct page *endpage = page + (1 << order) - 1; 1469 struct page *endpage = page + (1 << order) - 1;
1470 for (; page < endpage; page += pageblock_nr_pages) { 1470 for (; page < endpage; page += pageblock_nr_pages) {
1471 int mt = get_pageblock_migratetype(page); 1471 int mt = get_pageblock_migratetype(page);
1472 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1472 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1473 set_pageblock_migratetype(page, 1473 set_pageblock_migratetype(page,
1474 MIGRATE_MOVABLE); 1474 MIGRATE_MOVABLE);
1475 } 1475 }
1476 } 1476 }
1477 1477
1478 return 1UL << order; 1478 return 1UL << order;
1479 } 1479 }
1480 1480
1481 /* 1481 /*
1482 * Similar to split_page except the page is already free. As this is only 1482 * Similar to split_page except the page is already free. As this is only
1483 * being used for migration, the migratetype of the block also changes. 1483 * being used for migration, the migratetype of the block also changes.
1484 * As this is called with interrupts disabled, the caller is responsible 1484 * As this is called with interrupts disabled, the caller is responsible
1485 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1485 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1486 * are enabled. 1486 * are enabled.
1487 * 1487 *
1488 * Note: this is probably too low level an operation for use in drivers. 1488 * Note: this is probably too low level an operation for use in drivers.
1489 * Please consult with lkml before using this in your driver. 1489 * Please consult with lkml before using this in your driver.
1490 */ 1490 */
1491 int split_free_page(struct page *page) 1491 int split_free_page(struct page *page)
1492 { 1492 {
1493 unsigned int order; 1493 unsigned int order;
1494 int nr_pages; 1494 int nr_pages;
1495 1495
1496 order = page_order(page); 1496 order = page_order(page);
1497 1497
1498 nr_pages = __isolate_free_page(page, order); 1498 nr_pages = __isolate_free_page(page, order);
1499 if (!nr_pages) 1499 if (!nr_pages)
1500 return 0; 1500 return 0;
1501 1501
1502 /* Split into individual pages */ 1502 /* Split into individual pages */
1503 set_page_refcounted(page); 1503 set_page_refcounted(page);
1504 split_page(page, order); 1504 split_page(page, order);
1505 return nr_pages; 1505 return nr_pages;
1506 } 1506 }
1507 1507
1508 /* 1508 /*
1509 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1509 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1510 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1510 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1511 * or two. 1511 * or two.
1512 */ 1512 */
1513 static inline 1513 static inline
1514 struct page *buffered_rmqueue(struct zone *preferred_zone, 1514 struct page *buffered_rmqueue(struct zone *preferred_zone,
1515 struct zone *zone, int order, gfp_t gfp_flags, 1515 struct zone *zone, int order, gfp_t gfp_flags,
1516 int migratetype) 1516 int migratetype)
1517 { 1517 {
1518 unsigned long flags; 1518 unsigned long flags;
1519 struct page *page; 1519 struct page *page;
1520 int cold = !!(gfp_flags & __GFP_COLD); 1520 int cold = !!(gfp_flags & __GFP_COLD);
1521 1521
1522 again: 1522 again:
1523 if (likely(order == 0)) { 1523 if (likely(order == 0)) {
1524 struct per_cpu_pages *pcp; 1524 struct per_cpu_pages *pcp;
1525 struct list_head *list; 1525 struct list_head *list;
1526 1526
1527 local_irq_save(flags); 1527 local_irq_save(flags);
1528 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1528 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1529 list = &pcp->lists[migratetype]; 1529 list = &pcp->lists[migratetype];
1530 if (list_empty(list)) { 1530 if (list_empty(list)) {
1531 pcp->count += rmqueue_bulk(zone, 0, 1531 pcp->count += rmqueue_bulk(zone, 0,
1532 pcp->batch, list, 1532 pcp->batch, list,
1533 migratetype, cold); 1533 migratetype, cold);
1534 if (unlikely(list_empty(list))) 1534 if (unlikely(list_empty(list)))
1535 goto failed; 1535 goto failed;
1536 } 1536 }
1537 1537
1538 if (cold) 1538 if (cold)
1539 page = list_entry(list->prev, struct page, lru); 1539 page = list_entry(list->prev, struct page, lru);
1540 else 1540 else
1541 page = list_entry(list->next, struct page, lru); 1541 page = list_entry(list->next, struct page, lru);
1542 1542
1543 list_del(&page->lru); 1543 list_del(&page->lru);
1544 pcp->count--; 1544 pcp->count--;
1545 } else { 1545 } else {
1546 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1546 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1547 /* 1547 /*
1548 * __GFP_NOFAIL is not to be used in new code. 1548 * __GFP_NOFAIL is not to be used in new code.
1549 * 1549 *
1550 * All __GFP_NOFAIL callers should be fixed so that they 1550 * All __GFP_NOFAIL callers should be fixed so that they
1551 * properly detect and handle allocation failures. 1551 * properly detect and handle allocation failures.
1552 * 1552 *
1553 * We most definitely don't want callers attempting to 1553 * We most definitely don't want callers attempting to
1554 * allocate greater than order-1 page units with 1554 * allocate greater than order-1 page units with
1555 * __GFP_NOFAIL. 1555 * __GFP_NOFAIL.
1556 */ 1556 */
1557 WARN_ON_ONCE(order > 1); 1557 WARN_ON_ONCE(order > 1);
1558 } 1558 }
1559 spin_lock_irqsave(&zone->lock, flags); 1559 spin_lock_irqsave(&zone->lock, flags);
1560 page = __rmqueue(zone, order, migratetype); 1560 page = __rmqueue(zone, order, migratetype);
1561 spin_unlock(&zone->lock); 1561 spin_unlock(&zone->lock);
1562 if (!page) 1562 if (!page)
1563 goto failed; 1563 goto failed;
1564 __mod_zone_freepage_state(zone, -(1 << order), 1564 __mod_zone_freepage_state(zone, -(1 << order),
1565 get_freepage_migratetype(page)); 1565 get_freepage_migratetype(page));
1566 } 1566 }
1567 1567
1568 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1568 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1569 1569
1570 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1570 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1571 zone_statistics(preferred_zone, zone, gfp_flags); 1571 zone_statistics(preferred_zone, zone, gfp_flags);
1572 local_irq_restore(flags); 1572 local_irq_restore(flags);
1573 1573
1574 VM_BUG_ON(bad_range(zone, page)); 1574 VM_BUG_ON(bad_range(zone, page));
1575 if (prep_new_page(page, order, gfp_flags)) 1575 if (prep_new_page(page, order, gfp_flags))
1576 goto again; 1576 goto again;
1577 return page; 1577 return page;
1578 1578
1579 failed: 1579 failed:
1580 local_irq_restore(flags); 1580 local_irq_restore(flags);
1581 return NULL; 1581 return NULL;
1582 } 1582 }
1583 1583
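For order-0 the function above never takes zone->lock unless the per-cpu list is empty, in which case it refills a whole batch via rmqueue_bulk() and then serves hot requests from the head and cold (__GFP_COLD) requests from the tail. A toy model of that fast path, using an array of fake page ids in place of the pcp list:

#include <stdio.h>

#define TOY_BATCH 4

static int toy_list[64];
static int toy_count;
static int toy_next_page_id = 100;

static void toy_rmqueue_bulk(void)
{
    int i;

    for (i = 0; i < TOY_BATCH; i++)
        toy_list[toy_count++] = toy_next_page_id++;
}

static int toy_buffered_rmqueue(int cold)
{
    int page;

    if (toy_count == 0)
        toy_rmqueue_bulk();     /* refill from the "buddy allocator" */

    if (cold) {
        page = toy_list[toy_count - 1];     /* take the list tail */
        toy_count--;
    } else {
        int i;

        page = toy_list[0];                 /* take the list head */
        for (i = 1; i < toy_count; i++)
            toy_list[i - 1] = toy_list[i];
        toy_count--;
    }
    return page;
}

int main(void)
{
    printf("hot:  page %d\n", toy_buffered_rmqueue(0));  /* 100 */
    printf("cold: page %d\n", toy_buffered_rmqueue(1));  /* 103 */
    return 0;
}
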
1584 #ifdef CONFIG_FAIL_PAGE_ALLOC 1584 #ifdef CONFIG_FAIL_PAGE_ALLOC
1585 1585
1586 static struct { 1586 static struct {
1587 struct fault_attr attr; 1587 struct fault_attr attr;
1588 1588
1589 u32 ignore_gfp_highmem; 1589 u32 ignore_gfp_highmem;
1590 u32 ignore_gfp_wait; 1590 u32 ignore_gfp_wait;
1591 u32 min_order; 1591 u32 min_order;
1592 } fail_page_alloc = { 1592 } fail_page_alloc = {
1593 .attr = FAULT_ATTR_INITIALIZER, 1593 .attr = FAULT_ATTR_INITIALIZER,
1594 .ignore_gfp_wait = 1, 1594 .ignore_gfp_wait = 1,
1595 .ignore_gfp_highmem = 1, 1595 .ignore_gfp_highmem = 1,
1596 .min_order = 1, 1596 .min_order = 1,
1597 }; 1597 };
1598 1598
1599 static int __init setup_fail_page_alloc(char *str) 1599 static int __init setup_fail_page_alloc(char *str)
1600 { 1600 {
1601 return setup_fault_attr(&fail_page_alloc.attr, str); 1601 return setup_fault_attr(&fail_page_alloc.attr, str);
1602 } 1602 }
1603 __setup("fail_page_alloc=", setup_fail_page_alloc); 1603 __setup("fail_page_alloc=", setup_fail_page_alloc);
1604 1604
1605 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1605 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1606 { 1606 {
1607 if (order < fail_page_alloc.min_order) 1607 if (order < fail_page_alloc.min_order)
1608 return false; 1608 return false;
1609 if (gfp_mask & __GFP_NOFAIL) 1609 if (gfp_mask & __GFP_NOFAIL)
1610 return false; 1610 return false;
1611 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1611 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1612 return false; 1612 return false;
1613 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1613 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1614 return false; 1614 return false;
1615 1615
1616 return should_fail(&fail_page_alloc.attr, 1 << order); 1616 return should_fail(&fail_page_alloc.attr, 1 << order);
1617 } 1617 }
1618 1618
1619 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1619 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1620 1620
1621 static int __init fail_page_alloc_debugfs(void) 1621 static int __init fail_page_alloc_debugfs(void)
1622 { 1622 {
1623 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1623 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1624 struct dentry *dir; 1624 struct dentry *dir;
1625 1625
1626 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1626 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1627 &fail_page_alloc.attr); 1627 &fail_page_alloc.attr);
1628 if (IS_ERR(dir)) 1628 if (IS_ERR(dir))
1629 return PTR_ERR(dir); 1629 return PTR_ERR(dir);
1630 1630
1631 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1631 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1632 &fail_page_alloc.ignore_gfp_wait)) 1632 &fail_page_alloc.ignore_gfp_wait))
1633 goto fail; 1633 goto fail;
1634 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1634 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1635 &fail_page_alloc.ignore_gfp_highmem)) 1635 &fail_page_alloc.ignore_gfp_highmem))
1636 goto fail; 1636 goto fail;
1637 if (!debugfs_create_u32("min-order", mode, dir, 1637 if (!debugfs_create_u32("min-order", mode, dir,
1638 &fail_page_alloc.min_order)) 1638 &fail_page_alloc.min_order))
1639 goto fail; 1639 goto fail;
1640 1640
1641 return 0; 1641 return 0;
1642 fail: 1642 fail:
1643 debugfs_remove_recursive(dir); 1643 debugfs_remove_recursive(dir);
1644 1644
1645 return -ENOMEM; 1645 return -ENOMEM;
1646 } 1646 }
1647 1647
1648 late_initcall(fail_page_alloc_debugfs); 1648 late_initcall(fail_page_alloc_debugfs);
1649 1649
1650 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1650 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1651 1651
1652 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1652 #else /* CONFIG_FAIL_PAGE_ALLOC */
1653 1653
1654 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1654 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1655 { 1655 {
1656 return false; 1656 return false;
1657 } 1657 }
1658 1658
1659 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1659 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1660 1660
1661 /* 1661 /*
1662 * Return true if free pages are above 'mark'. This takes into account the order 1662 * Return true if free pages are above 'mark'. This takes into account the order
1663 * of the allocation. 1663 * of the allocation.
1664 */ 1664 */
1665 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1665 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1666 int classzone_idx, int alloc_flags, long free_pages) 1666 int classzone_idx, int alloc_flags, long free_pages)
1667 { 1667 {
1668 /* free_pages may go negative - that's OK */ 1668 /* free_pages may go negative - that's OK */
1669 long min = mark; 1669 long min = mark;
1670 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1670 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1671 int o; 1671 int o;
1672 long free_cma = 0; 1672 long free_cma = 0;
1673 1673
1674 free_pages -= (1 << order) - 1; 1674 free_pages -= (1 << order) - 1;
1675 if (alloc_flags & ALLOC_HIGH) 1675 if (alloc_flags & ALLOC_HIGH)
1676 min -= min / 2; 1676 min -= min / 2;
1677 if (alloc_flags & ALLOC_HARDER) 1677 if (alloc_flags & ALLOC_HARDER)
1678 min -= min / 4; 1678 min -= min / 4;
1679 #ifdef CONFIG_CMA 1679 #ifdef CONFIG_CMA
1680 /* If allocation can't use CMA areas don't use free CMA pages */ 1680 /* If allocation can't use CMA areas don't use free CMA pages */
1681 if (!(alloc_flags & ALLOC_CMA)) 1681 if (!(alloc_flags & ALLOC_CMA))
1682 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1682 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1683 #endif 1683 #endif
1684 1684
1685 if (free_pages - free_cma <= min + lowmem_reserve) 1685 if (free_pages - free_cma <= min + lowmem_reserve)
1686 return false; 1686 return false;
1687 for (o = 0; o < order; o++) { 1687 for (o = 0; o < order; o++) {
1688 /* At the next order, this order's pages become unavailable */ 1688 /* At the next order, this order's pages become unavailable */
1689 free_pages -= z->free_area[o].nr_free << o; 1689 free_pages -= z->free_area[o].nr_free << o;
1690 1690
1691 /* Require fewer higher order pages to be free */ 1691 /* Require fewer higher order pages to be free */
1692 min >>= 1; 1692 min >>= 1;
1693 1693
1694 if (free_pages <= min) 1694 if (free_pages <= min)
1695 return false; 1695 return false;
1696 } 1696 }
1697 return true; 1697 return true;
1698 } 1698 }
1699 1699
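Two things happen in the check above: ALLOC_HIGH and ALLOC_HARDER shave the required minimum down, and the per-order loop discounts free pages that are too small to serve the request while halving the requirement at each step. A userspace rendering of that arithmetic (free_at_order[] is a hypothetical snapshot of free_area[].nr_free, and the CMA adjustment is left out):

#include <stdio.h>
#include <stdbool.h>

#define TOY_ALLOC_HIGH   0x1
#define TOY_ALLOC_HARDER 0x2

static bool toy_watermark_ok(int order, long mark, long lowmem_reserve,
                             int alloc_flags, long free_pages,
                             const long *free_at_order)
{
    long min = mark;
    int o;

    free_pages -= (1 << order) - 1;
    if (alloc_flags & TOY_ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & TOY_ALLOC_HARDER)
        min -= min / 4;

    if (free_pages <= min + lowmem_reserve)
        return false;

    for (o = 0; o < order; o++) {
        /* Pages of this order cannot serve the request: discount them
         * and require fewer, but larger, free pages. */
        free_pages -= free_at_order[o] << o;
        min >>= 1;
        if (free_pages <= min)
            return false;
    }
    return true;
}

int main(void)
{
    /* Mostly order-0 pages free: an order-3 request fails even though
     * the raw free count clears the mark. */
    long free_at_order[4] = { 900, 10, 2, 0 };
    long free_pages = 900 + 10 * 2 + 2 * 4;     /* 928 */

    printf("order-0: %d\n",
           toy_watermark_ok(0, 128, 0, 0, free_pages, free_at_order)); /* 1 */
    printf("order-3: %d\n",
           toy_watermark_ok(3, 128, 0, 0, free_pages, free_at_order)); /* 0 */
    return 0;
}
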
1700 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1700 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1701 int classzone_idx, int alloc_flags) 1701 int classzone_idx, int alloc_flags)
1702 { 1702 {
1703 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1703 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1704 zone_page_state(z, NR_FREE_PAGES)); 1704 zone_page_state(z, NR_FREE_PAGES));
1705 } 1705 }
1706 1706
1707 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1707 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1708 int classzone_idx, int alloc_flags) 1708 int classzone_idx, int alloc_flags)
1709 { 1709 {
1710 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1710 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1711 1711
1712 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1712 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1713 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1713 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1714 1714
1715 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1715 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1716 free_pages); 1716 free_pages);
1717 } 1717 }
1718 1718
1719 #ifdef CONFIG_NUMA 1719 #ifdef CONFIG_NUMA
1720 /* 1720 /*
1721 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1721 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1722 * skip over zones that are not allowed by the cpuset, or that have 1722 * skip over zones that are not allowed by the cpuset, or that have
1723 * been recently (in the last second) found to be nearly full. See further 1723 * been recently (in the last second) found to be nearly full. See further
1724 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1724 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1725 * that have to skip over a lot of full or unallowed zones. 1725 * that have to skip over a lot of full or unallowed zones.
1726 * 1726 *
1727 * If the zonelist cache is present in the passed in zonelist, then 1727 * If the zonelist cache is present in the passed in zonelist, then
1728 * returns a pointer to the allowed node mask (either the current 1728 * returns a pointer to the allowed node mask (either the current
1729 * task's mems_allowed, or node_states[N_MEMORY].) 1729 * task's mems_allowed, or node_states[N_MEMORY].)
1730 * 1730 *
1731 * If the zonelist cache is not available for this zonelist, does 1731 * If the zonelist cache is not available for this zonelist, does
1732 * nothing and returns NULL. 1732 * nothing and returns NULL.
1733 * 1733 *
1734 * If the fullzones BITMAP in the zonelist cache is stale (more than 1734 * If the fullzones BITMAP in the zonelist cache is stale (more than
1735 * a second since last zap'd) then we zap it out (clear its bits.) 1735 * a second since last zap'd) then we zap it out (clear its bits.)
1736 * 1736 *
1737 * We hold off even calling zlc_setup, until after we've checked the 1737 * We hold off even calling zlc_setup, until after we've checked the
1738 * first zone in the zonelist, on the theory that most allocations will 1738 * first zone in the zonelist, on the theory that most allocations will
1739 * be satisfied from that first zone, so best to examine that zone as 1739 * be satisfied from that first zone, so best to examine that zone as
1740 * quickly as we can. 1740 * quickly as we can.
1741 */ 1741 */
1742 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1742 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1743 { 1743 {
1744 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1744 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1745 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1745 nodemask_t *allowednodes; /* zonelist_cache approximation */
1746 1746
1747 zlc = zonelist->zlcache_ptr; 1747 zlc = zonelist->zlcache_ptr;
1748 if (!zlc) 1748 if (!zlc)
1749 return NULL; 1749 return NULL;
1750 1750
1751 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1751 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1752 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1752 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1753 zlc->last_full_zap = jiffies; 1753 zlc->last_full_zap = jiffies;
1754 } 1754 }
1755 1755
1756 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1756 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1757 &cpuset_current_mems_allowed : 1757 &cpuset_current_mems_allowed :
1758 &node_states[N_MEMORY]; 1758 &node_states[N_MEMORY];
1759 return allowednodes; 1759 return allowednodes;
1760 } 1760 }
1761 1761
1762 /* 1762 /*
1763 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1763 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1764 * if it is worth looking at further for free memory: 1764 * if it is worth looking at further for free memory:
1765 * 1) Check that the zone isn't thought to be full (doesn't have its 1765 * 1) Check that the zone isn't thought to be full (doesn't have its
1766 * bit set in the zonelist_cache fullzones BITMAP). 1766 * bit set in the zonelist_cache fullzones BITMAP).
1767 * 2) Check that the zone's node (obtained from the zonelist_cache 1767 * 2) Check that the zone's node (obtained from the zonelist_cache
1768 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1768 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1769 * Return true (non-zero) if zone is worth looking at further, or 1769 * Return true (non-zero) if zone is worth looking at further, or
1770 * else return false (zero) if it is not. 1770 * else return false (zero) if it is not.
1771 * 1771 *
1772 * This check -ignores- the distinction between various watermarks, 1772 * This check -ignores- the distinction between various watermarks,
1773 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1773 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1774 * found to be full for any variation of these watermarks, it will 1774 * found to be full for any variation of these watermarks, it will
1775 * be considered full for up to one second by all requests, unless 1775 * be considered full for up to one second by all requests, unless
1776 * we are so low on memory on all allowed nodes that we are forced 1776 * we are so low on memory on all allowed nodes that we are forced
1777 * into the second scan of the zonelist. 1777 * into the second scan of the zonelist.
1778 * 1778 *
1779 * In the second scan we ignore this zonelist cache and exactly 1779 * In the second scan we ignore this zonelist cache and exactly
1780 * apply the watermarks to all zones, even if it is slower to do so. 1780 * apply the watermarks to all zones, even if it is slower to do so.
1781 * We are low on memory in the second scan, and should leave no stone 1781 * We are low on memory in the second scan, and should leave no stone
1782 * unturned looking for a free page. 1782 * unturned looking for a free page.
1783 */ 1783 */
1784 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1784 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1785 nodemask_t *allowednodes) 1785 nodemask_t *allowednodes)
1786 { 1786 {
1787 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1787 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1788 int i; /* index of *z in zonelist zones */ 1788 int i; /* index of *z in zonelist zones */
1789 int n; /* node that zone *z is on */ 1789 int n; /* node that zone *z is on */
1790 1790
1791 zlc = zonelist->zlcache_ptr; 1791 zlc = zonelist->zlcache_ptr;
1792 if (!zlc) 1792 if (!zlc)
1793 return 1; 1793 return 1;
1794 1794
1795 i = z - zonelist->_zonerefs; 1795 i = z - zonelist->_zonerefs;
1796 n = zlc->z_to_n[i]; 1796 n = zlc->z_to_n[i];
1797 1797
1798 /* This zone is worth trying if it is allowed but not full */ 1798 /* This zone is worth trying if it is allowed but not full */
1799 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1799 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1800 } 1800 }
1801 1801
1802 /* 1802 /*
1803 * Given 'z' scanning a zonelist, set the corresponding bit in 1803 * Given 'z' scanning a zonelist, set the corresponding bit in
1804 * zlc->fullzones, so that subsequent attempts to allocate a page 1804 * zlc->fullzones, so that subsequent attempts to allocate a page
1805 * from that zone don't waste time re-examining it. 1805 * from that zone don't waste time re-examining it.
1806 */ 1806 */
1807 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1807 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1808 { 1808 {
1809 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1809 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1810 int i; /* index of *z in zonelist zones */ 1810 int i; /* index of *z in zonelist zones */
1811 1811
1812 zlc = zonelist->zlcache_ptr; 1812 zlc = zonelist->zlcache_ptr;
1813 if (!zlc) 1813 if (!zlc)
1814 return; 1814 return;
1815 1815
1816 i = z - zonelist->_zonerefs; 1816 i = z - zonelist->_zonerefs;
1817 1817
1818 set_bit(i, zlc->fullzones); 1818 set_bit(i, zlc->fullzones);
1819 } 1819 }
1820 1820
1821 /* 1821 /*
1822 * clear all zones full, called after direct reclaim makes progress so that 1822 * clear all zones full, called after direct reclaim makes progress so that
1823 * a zone that was recently full is not skipped over for up to a second 1823 * a zone that was recently full is not skipped over for up to a second
1824 */ 1824 */
1825 static void zlc_clear_zones_full(struct zonelist *zonelist) 1825 static void zlc_clear_zones_full(struct zonelist *zonelist)
1826 { 1826 {
1827 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1827 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1828 1828
1829 zlc = zonelist->zlcache_ptr; 1829 zlc = zonelist->zlcache_ptr;
1830 if (!zlc) 1830 if (!zlc)
1831 return; 1831 return;
1832 1832
1833 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1833 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1834 } 1834 }
1835 1835
1836 static bool zone_local(struct zone *local_zone, struct zone *zone) 1836 static bool zone_local(struct zone *local_zone, struct zone *zone)
1837 { 1837 {
1838 return local_zone->node == zone->node; 1838 return local_zone->node == zone->node;
1839 } 1839 }
1840 1840
1841 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1841 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1842 { 1842 {
1843 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1843 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1844 } 1844 }
1845 1845
1846 static void __paginginit init_zone_allows_reclaim(int nid) 1846 static void __paginginit init_zone_allows_reclaim(int nid)
1847 { 1847 {
1848 int i; 1848 int i;
1849 1849
1850 for_each_node_state(i, N_MEMORY) 1850 for_each_node_state(i, N_MEMORY)
1851 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1851 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1852 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1852 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1853 else 1853 else
1854 zone_reclaim_mode = 1; 1854 zone_reclaim_mode = 1;
1855 } 1855 }
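init_zone_allows_reclaim() above builds, per node, the mask of nodes whose distance is within RECLAIM_DISTANCE, and flips zone_reclaim_mode on as soon as any node is farther than that. A rough userspace sketch of the same idea, with a hypothetical SLIT-style distance table and an illustrative RECLAIM_DISTANCE value:

#include <stdio.h>

#define NR_NODES 3
#define RECLAIM_DISTANCE 30     /* illustrative; the kernel default depends on config */

/* Hypothetical node distance table: distance[a][b]. */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

int main(void)
{
	int zone_reclaim_mode = 0;

	for (int nid = 0; nid < NR_NODES; nid++) {
		unsigned int reclaim_nodes = 0;  /* stand-in for NODE_DATA(nid)->reclaim_nodes */

		for (int i = 0; i < NR_NODES; i++) {
			if (distance[nid][i] <= RECLAIM_DISTANCE)
				reclaim_nodes |= 1U << i;
			else
				zone_reclaim_mode = 1;   /* some node is "far": enable zone reclaim */
		}
		printf("node %d reclaim_nodes mask: 0x%x\n", nid, reclaim_nodes);
	}
	printf("zone_reclaim_mode: %d\n", zone_reclaim_mode);
	return 0;
}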
1856 1856
1857 #else /* CONFIG_NUMA */ 1857 #else /* CONFIG_NUMA */
1858 1858
1859 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1859 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1860 { 1860 {
1861 return NULL; 1861 return NULL;
1862 } 1862 }
1863 1863
1864 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1864 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1865 nodemask_t *allowednodes) 1865 nodemask_t *allowednodes)
1866 { 1866 {
1867 return 1; 1867 return 1;
1868 } 1868 }
1869 1869
1870 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1870 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1871 { 1871 {
1872 } 1872 }
1873 1873
1874 static void zlc_clear_zones_full(struct zonelist *zonelist) 1874 static void zlc_clear_zones_full(struct zonelist *zonelist)
1875 { 1875 {
1876 } 1876 }
1877 1877
1878 static bool zone_local(struct zone *local_zone, struct zone *zone) 1878 static bool zone_local(struct zone *local_zone, struct zone *zone)
1879 { 1879 {
1880 return true; 1880 return true;
1881 } 1881 }
1882 1882
1883 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1883 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1884 { 1884 {
1885 return true; 1885 return true;
1886 } 1886 }
1887 1887
1888 static inline void init_zone_allows_reclaim(int nid) 1888 static inline void init_zone_allows_reclaim(int nid)
1889 { 1889 {
1890 } 1890 }
1891 #endif /* CONFIG_NUMA */ 1891 #endif /* CONFIG_NUMA */
1892 1892
1893 /* 1893 /*
1894 * get_page_from_freelist goes through the zonelist trying to allocate 1894 * get_page_from_freelist goes through the zonelist trying to allocate
1895 * a page. 1895 * a page.
1896 */ 1896 */
1897 static struct page * 1897 static struct page *
1898 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1898 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1899 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1899 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1900 struct zone *preferred_zone, int migratetype) 1900 struct zone *preferred_zone, int classzone_idx, int migratetype)
1901 { 1901 {
1902 struct zoneref *z; 1902 struct zoneref *z;
1903 struct page *page = NULL; 1903 struct page *page = NULL;
1904 int classzone_idx;
1905 struct zone *zone; 1904 struct zone *zone;
1906 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1905 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1907 int zlc_active = 0; /* set if using zonelist_cache */ 1906 int zlc_active = 0; /* set if using zonelist_cache */
1908 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1907 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1909 1908
1910 classzone_idx = zone_idx(preferred_zone);
1911 zonelist_scan: 1909 zonelist_scan:
1912 /* 1910 /*
1913 * Scan zonelist, looking for a zone with enough free. 1911 * Scan zonelist, looking for a zone with enough free.
1914 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1912 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1915 */ 1913 */
1916 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1914 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1917 high_zoneidx, nodemask) { 1915 high_zoneidx, nodemask) {
1918 unsigned long mark; 1916 unsigned long mark;
1919 1917
1920 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1918 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1921 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1919 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1922 continue; 1920 continue;
1923 if (cpusets_enabled() && 1921 if (cpusets_enabled() &&
1924 (alloc_flags & ALLOC_CPUSET) && 1922 (alloc_flags & ALLOC_CPUSET) &&
1925 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1923 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1926 continue; 1924 continue;
1927 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1925 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1928 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) 1926 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1929 goto try_this_zone; 1927 goto try_this_zone;
1930 /* 1928 /*
1931 * Distribute pages in proportion to the individual 1929 * Distribute pages in proportion to the individual
1932 * zone size to ensure fair page aging. The zone a 1930 * zone size to ensure fair page aging. The zone a
1933 * page was allocated in should have no effect on the 1931 * page was allocated in should have no effect on the
1934 * time the page has in memory before being reclaimed. 1932 * time the page has in memory before being reclaimed.
1935 */ 1933 */
1936 if (alloc_flags & ALLOC_FAIR) { 1934 if (alloc_flags & ALLOC_FAIR) {
1937 if (!zone_local(preferred_zone, zone)) 1935 if (!zone_local(preferred_zone, zone))
1938 continue; 1936 continue;
1939 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1937 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1940 continue; 1938 continue;
1941 } 1939 }
1942 /* 1940 /*
1943 * When allocating a page cache page for writing, we 1941 * When allocating a page cache page for writing, we
1944 * want to get it from a zone that is within its dirty 1942 * want to get it from a zone that is within its dirty
1945 * limit, such that no single zone holds more than its 1943 * limit, such that no single zone holds more than its
1946 * proportional share of globally allowed dirty pages. 1944 * proportional share of globally allowed dirty pages.
1947 * The dirty limits take into account the zone's 1945 * The dirty limits take into account the zone's
1948 * lowmem reserves and high watermark so that kswapd 1946 * lowmem reserves and high watermark so that kswapd
1949 * should be able to balance it without having to 1947 * should be able to balance it without having to
1950 * write pages from its LRU list. 1948 * write pages from its LRU list.
1951 * 1949 *
1952 * This may look like it could increase pressure on 1950 * This may look like it could increase pressure on
1953 * lower zones by failing allocations in higher zones 1951 * lower zones by failing allocations in higher zones
1954 * before they are full. But the pages that do spill 1952 * before they are full. But the pages that do spill
1955 * over are limited as the lower zones are protected 1953 * over are limited as the lower zones are protected
1956 * by this very same mechanism. It should not become 1954 * by this very same mechanism. It should not become
1957 * a practical burden to them. 1955 * a practical burden to them.
1958 * 1956 *
1959 * XXX: For now, allow allocations to potentially 1957 * XXX: For now, allow allocations to potentially
1960 * exceed the per-zone dirty limit in the slowpath 1958 * exceed the per-zone dirty limit in the slowpath
1961 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1959 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1962 * which is important when on a NUMA setup the allowed 1960 * which is important when on a NUMA setup the allowed
1963 * zones are together not big enough to reach the 1961 * zones are together not big enough to reach the
1964 * global limit. The proper fix for these situations 1962 * global limit. The proper fix for these situations
1965 * will require awareness of zones in the 1963 * will require awareness of zones in the
1966 * dirty-throttling and the flusher threads. 1964 * dirty-throttling and the flusher threads.
1967 */ 1965 */
1968 if ((alloc_flags & ALLOC_WMARK_LOW) && 1966 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1969 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1967 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1970 continue; 1968 continue;
1971 1969
1972 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1970 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1973 if (!zone_watermark_ok(zone, order, mark, 1971 if (!zone_watermark_ok(zone, order, mark,
1974 classzone_idx, alloc_flags)) { 1972 classzone_idx, alloc_flags)) {
1975 int ret; 1973 int ret;
1976 1974
1977 if (IS_ENABLED(CONFIG_NUMA) && 1975 if (IS_ENABLED(CONFIG_NUMA) &&
1978 !did_zlc_setup && nr_online_nodes > 1) { 1976 !did_zlc_setup && nr_online_nodes > 1) {
1979 /* 1977 /*
1980 * we do zlc_setup if there are multiple nodes 1978 * we do zlc_setup if there are multiple nodes
1981 * and before considering the first zone allowed 1979 * and before considering the first zone allowed
1982 * by the cpuset. 1980 * by the cpuset.
1983 */ 1981 */
1984 allowednodes = zlc_setup(zonelist, alloc_flags); 1982 allowednodes = zlc_setup(zonelist, alloc_flags);
1985 zlc_active = 1; 1983 zlc_active = 1;
1986 did_zlc_setup = 1; 1984 did_zlc_setup = 1;
1987 } 1985 }
1988 1986
1989 if (zone_reclaim_mode == 0 || 1987 if (zone_reclaim_mode == 0 ||
1990 !zone_allows_reclaim(preferred_zone, zone)) 1988 !zone_allows_reclaim(preferred_zone, zone))
1991 goto this_zone_full; 1989 goto this_zone_full;
1992 1990
1993 /* 1991 /*
1994 * As we may have just activated ZLC, check if the first 1992 * As we may have just activated ZLC, check if the first
1995 * eligible zone has failed zone_reclaim recently. 1993 * eligible zone has failed zone_reclaim recently.
1996 */ 1994 */
1997 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1995 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1998 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1996 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1999 continue; 1997 continue;
2000 1998
2001 ret = zone_reclaim(zone, gfp_mask, order); 1999 ret = zone_reclaim(zone, gfp_mask, order);
2002 switch (ret) { 2000 switch (ret) {
2003 case ZONE_RECLAIM_NOSCAN: 2001 case ZONE_RECLAIM_NOSCAN:
2004 /* did not scan */ 2002 /* did not scan */
2005 continue; 2003 continue;
2006 case ZONE_RECLAIM_FULL: 2004 case ZONE_RECLAIM_FULL:
2007 /* scanned but unreclaimable */ 2005 /* scanned but unreclaimable */
2008 continue; 2006 continue;
2009 default: 2007 default:
2010 /* did we reclaim enough */ 2008 /* did we reclaim enough */
2011 if (zone_watermark_ok(zone, order, mark, 2009 if (zone_watermark_ok(zone, order, mark,
2012 classzone_idx, alloc_flags)) 2010 classzone_idx, alloc_flags))
2013 goto try_this_zone; 2011 goto try_this_zone;
2014 2012
2015 /* 2013 /*
2016 * Failed to reclaim enough to meet watermark. 2014 * Failed to reclaim enough to meet watermark.
2017 * Only mark the zone full if checking the min 2015 * Only mark the zone full if checking the min
2018 * watermark or if we failed to reclaim just 2016 * watermark or if we failed to reclaim just
2019 * 1<<order pages or else the page allocator 2017 * 1<<order pages or else the page allocator
2020 * fastpath will prematurely mark zones full 2018 * fastpath will prematurely mark zones full
2021 * when the watermark is between the low and 2019 * when the watermark is between the low and
2022 * min watermarks. 2020 * min watermarks.
2023 */ 2021 */
2024 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2022 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2025 ret == ZONE_RECLAIM_SOME) 2023 ret == ZONE_RECLAIM_SOME)
2026 goto this_zone_full; 2024 goto this_zone_full;
2027 2025
2028 continue; 2026 continue;
2029 } 2027 }
2030 } 2028 }
2031 2029
2032 try_this_zone: 2030 try_this_zone:
2033 page = buffered_rmqueue(preferred_zone, zone, order, 2031 page = buffered_rmqueue(preferred_zone, zone, order,
2034 gfp_mask, migratetype); 2032 gfp_mask, migratetype);
2035 if (page) 2033 if (page)
2036 break; 2034 break;
2037 this_zone_full: 2035 this_zone_full:
2038 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2036 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2039 zlc_mark_zone_full(zonelist, z); 2037 zlc_mark_zone_full(zonelist, z);
2040 } 2038 }
2041 2039
2042 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2040 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2043 /* Disable zlc cache for second zonelist scan */ 2041 /* Disable zlc cache for second zonelist scan */
2044 zlc_active = 0; 2042 zlc_active = 0;
2045 goto zonelist_scan; 2043 goto zonelist_scan;
2046 } 2044 }
2047 2045
2048 if (page) 2046 if (page)
2049 /* 2047 /*
2050 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2048 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2051 * necessary to allocate the page. The expectation is 2049 * necessary to allocate the page. The expectation is
2052 * that the caller is taking steps that will free more 2050 * that the caller is taking steps that will free more
2053 * memory. The caller should avoid the page being used 2051 * memory. The caller should avoid the page being used
2054 * for !PFMEMALLOC purposes. 2052 * for !PFMEMALLOC purposes.
2055 */ 2053 */
2056 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2054 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2057 2055
2058 return page; 2056 return page;
2059 } 2057 }
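This hunk is the heart of the patch: get_page_from_freelist() no longer recomputes zone_idx(preferred_zone) on every call; the caller derives classzone_idx once from the zoneref returned by first_zones_zonelist() and passes it down to the zone_watermark_ok() checks. A simplified userspace-only sketch of that calling pattern follows; the structs, helper names and the toy watermark rule are stand-ins, not the kernel API.

#include <stdio.h>

struct zone { int idx; long free_pages; };
struct zoneref { struct zone *zone; int zone_idx; };

/* Stand-in for zonelist_zone_idx(): read the cached index from the zoneref. */
static int zoneref_idx(const struct zoneref *z)
{
	return z->zone_idx;
}

/* Stand-in for zone_watermark_ok(): a toy free-page threshold per classzone. */
static int watermark_ok(const struct zone *zone, long mark, int classzone_idx)
{
	return zone->free_pages > mark + 16 * classzone_idx;    /* toy lowmem reserve */
}

int main(void)
{
	struct zone zones[2] = { { .idx = 0, .free_pages = 70 },
				 { .idx = 1, .free_pages = 300 } };
	struct zoneref preferred = { .zone = &zones[1], .zone_idx = 1 };

	/* Computed once, outside the scan loop, as the patch does. */
	int classzone_idx = zoneref_idx(&preferred);

	for (int i = 1; i >= 0; i--)
		printf("zone %d passes watermark: %d\n", zones[i].idx,
		       watermark_ok(&zones[i], 64, classzone_idx));
	return 0;
}

The payoff in the real code is simply that the repeated zone_idx() computation and the pgdat indirection disappear from the per-attempt path.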
2060 2058
2061 /* 2059 /*
2062 * Large machines with many possible nodes should not always dump per-node 2060 * Large machines with many possible nodes should not always dump per-node
2063 * meminfo in irq context. 2061 * meminfo in irq context.
2064 */ 2062 */
2065 static inline bool should_suppress_show_mem(void) 2063 static inline bool should_suppress_show_mem(void)
2066 { 2064 {
2067 bool ret = false; 2065 bool ret = false;
2068 2066
2069 #if NODES_SHIFT > 8 2067 #if NODES_SHIFT > 8
2070 ret = in_interrupt(); 2068 ret = in_interrupt();
2071 #endif 2069 #endif
2072 return ret; 2070 return ret;
2073 } 2071 }
2074 2072
2075 static DEFINE_RATELIMIT_STATE(nopage_rs, 2073 static DEFINE_RATELIMIT_STATE(nopage_rs,
2076 DEFAULT_RATELIMIT_INTERVAL, 2074 DEFAULT_RATELIMIT_INTERVAL,
2077 DEFAULT_RATELIMIT_BURST); 2075 DEFAULT_RATELIMIT_BURST);
2078 2076
2079 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2077 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2080 { 2078 {
2081 unsigned int filter = SHOW_MEM_FILTER_NODES; 2079 unsigned int filter = SHOW_MEM_FILTER_NODES;
2082 2080
2083 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2081 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2084 debug_guardpage_minorder() > 0) 2082 debug_guardpage_minorder() > 0)
2085 return; 2083 return;
2086 2084
2087 /* 2085 /*
2088 * Walking all memory to count page types is very expensive and should 2086 * Walking all memory to count page types is very expensive and should
2089 * be inhibited in non-blockable contexts. 2087 * be inhibited in non-blockable contexts.
2090 */ 2088 */
2091 if (!(gfp_mask & __GFP_WAIT)) 2089 if (!(gfp_mask & __GFP_WAIT))
2092 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2090 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2093 2091
2094 /* 2092 /*
2095 * This documents exceptions given to allocations in certain 2093 * This documents exceptions given to allocations in certain
2096 * contexts that are allowed to allocate outside current's set 2094 * contexts that are allowed to allocate outside current's set
2097 * of allowed nodes. 2095 * of allowed nodes.
2098 */ 2096 */
2099 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2097 if (!(gfp_mask & __GFP_NOMEMALLOC))
2100 if (test_thread_flag(TIF_MEMDIE) || 2098 if (test_thread_flag(TIF_MEMDIE) ||
2101 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2099 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2102 filter &= ~SHOW_MEM_FILTER_NODES; 2100 filter &= ~SHOW_MEM_FILTER_NODES;
2103 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2101 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2104 filter &= ~SHOW_MEM_FILTER_NODES; 2102 filter &= ~SHOW_MEM_FILTER_NODES;
2105 2103
2106 if (fmt) { 2104 if (fmt) {
2107 struct va_format vaf; 2105 struct va_format vaf;
2108 va_list args; 2106 va_list args;
2109 2107
2110 va_start(args, fmt); 2108 va_start(args, fmt);
2111 2109
2112 vaf.fmt = fmt; 2110 vaf.fmt = fmt;
2113 vaf.va = &args; 2111 vaf.va = &args;
2114 2112
2115 pr_warn("%pV", &vaf); 2113 pr_warn("%pV", &vaf);
2116 2114
2117 va_end(args); 2115 va_end(args);
2118 } 2116 }
2119 2117
2120 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2118 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2121 current->comm, order, gfp_mask); 2119 current->comm, order, gfp_mask);
2122 2120
2123 dump_stack(); 2121 dump_stack();
2124 if (!should_suppress_show_mem()) 2122 if (!should_suppress_show_mem())
2125 show_mem(filter); 2123 show_mem(filter);
2126 } 2124 }
2127 2125
2128 static inline int 2126 static inline int
2129 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2127 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2130 unsigned long did_some_progress, 2128 unsigned long did_some_progress,
2131 unsigned long pages_reclaimed) 2129 unsigned long pages_reclaimed)
2132 { 2130 {
2133 /* Do not loop if specifically requested */ 2131 /* Do not loop if specifically requested */
2134 if (gfp_mask & __GFP_NORETRY) 2132 if (gfp_mask & __GFP_NORETRY)
2135 return 0; 2133 return 0;
2136 2134
2137 /* Always retry if specifically requested */ 2135 /* Always retry if specifically requested */
2138 if (gfp_mask & __GFP_NOFAIL) 2136 if (gfp_mask & __GFP_NOFAIL)
2139 return 1; 2137 return 1;
2140 2138
2141 /* 2139 /*
2142 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2140 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2143 * making forward progress without invoking OOM. Suspend also disables 2141 * making forward progress without invoking OOM. Suspend also disables
2144 * storage devices so kswapd will not help. Bail if we are suspending. 2142 * storage devices so kswapd will not help. Bail if we are suspending.
2145 */ 2143 */
2146 if (!did_some_progress && pm_suspended_storage()) 2144 if (!did_some_progress && pm_suspended_storage())
2147 return 0; 2145 return 0;
2148 2146
2149 /* 2147 /*
2150 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2148 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2151 * means __GFP_NOFAIL, but that may not be true in other 2149 * means __GFP_NOFAIL, but that may not be true in other
2152 * implementations. 2150 * implementations.
2153 */ 2151 */
2154 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2152 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2155 return 1; 2153 return 1;
2156 2154
2157 /* 2155 /*
2158 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2156 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2159 * specified, then we retry until we no longer reclaim any pages 2157 * specified, then we retry until we no longer reclaim any pages
2160 * (above), or we've reclaimed an order of pages at least as 2158 * (above), or we've reclaimed an order of pages at least as
2161 * large as the allocation's order. In both cases, if the 2159 * large as the allocation's order. In both cases, if the
2162 * allocation still fails, we stop retrying. 2160 * allocation still fails, we stop retrying.
2163 */ 2161 */
2164 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2162 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2165 return 1; 2163 return 1;
2166 2164
2167 return 0; 2165 return 0;
2168 } 2166 }
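As a concrete reading of should_alloc_retry(): an order-2 request (at or below PAGE_ALLOC_COSTLY_ORDER, normally 3) keeps retrying unless __GFP_NORETRY is set, while an order-5 request retries only with __GFP_REPEAT and only until 1 << 5 = 32 pages have been reclaimed. A compact userspace restatement of that decision, with stand-in flag bits rather than the real GFP values:

#include <stdio.h>

#define GFP_NORETRY  0x1u
#define GFP_NOFAIL   0x2u
#define GFP_REPEAT   0x4u
#define COSTLY_ORDER 3          /* mirrors PAGE_ALLOC_COSTLY_ORDER */

static int should_retry(unsigned int gfp, unsigned int order,
			unsigned long progress, unsigned long reclaimed,
			int suspended_storage)
{
	if (gfp & GFP_NORETRY)
		return 0;
	if (gfp & GFP_NOFAIL)
		return 1;
	if (!progress && suspended_storage)
		return 0;
	if (order <= COSTLY_ORDER)
		return 1;
	if ((gfp & GFP_REPEAT) && reclaimed < (1UL << order))
		return 1;
	return 0;
}

int main(void)
{
	printf("order 2, no flags:            %d\n", should_retry(0, 2, 1, 4, 0));
	printf("order 5, no flags:            %d\n", should_retry(0, 5, 1, 4, 0));
	printf("order 5, __GFP_REPEAT, 4/32:  %d\n", should_retry(GFP_REPEAT, 5, 1, 4, 0));
	printf("order 5, __GFP_REPEAT, 40/32: %d\n", should_retry(GFP_REPEAT, 5, 1, 40, 0));
	return 0;
}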
2169 2167
2170 static inline struct page * 2168 static inline struct page *
2171 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2169 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2172 struct zonelist *zonelist, enum zone_type high_zoneidx, 2170 struct zonelist *zonelist, enum zone_type high_zoneidx,
2173 nodemask_t *nodemask, struct zone *preferred_zone, 2171 nodemask_t *nodemask, struct zone *preferred_zone,
2174 int migratetype) 2172 int classzone_idx, int migratetype)
2175 { 2173 {
2176 struct page *page; 2174 struct page *page;
2177 2175
2178 /* Acquire the OOM killer lock for the zones in zonelist */ 2176 /* Acquire the OOM killer lock for the zones in zonelist */
2179 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2177 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2180 schedule_timeout_uninterruptible(1); 2178 schedule_timeout_uninterruptible(1);
2181 return NULL; 2179 return NULL;
2182 } 2180 }
2183 2181
2184 /* 2182 /*
2185 * Go through the zonelist yet one more time, keep very high watermark 2183 * Go through the zonelist yet one more time, keep very high watermark
2186 * here, this is only to catch a parallel oom killing, we must fail if 2184 * here, this is only to catch a parallel oom killing, we must fail if
2187 * we're still under heavy pressure. 2185 * we're still under heavy pressure.
2188 */ 2186 */
2189 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2187 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2190 order, zonelist, high_zoneidx, 2188 order, zonelist, high_zoneidx,
2191 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2189 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2192 preferred_zone, migratetype); 2190 preferred_zone, classzone_idx, migratetype);
2193 if (page) 2191 if (page)
2194 goto out; 2192 goto out;
2195 2193
2196 if (!(gfp_mask & __GFP_NOFAIL)) { 2194 if (!(gfp_mask & __GFP_NOFAIL)) {
2197 /* The OOM killer will not help higher order allocs */ 2195 /* The OOM killer will not help higher order allocs */
2198 if (order > PAGE_ALLOC_COSTLY_ORDER) 2196 if (order > PAGE_ALLOC_COSTLY_ORDER)
2199 goto out; 2197 goto out;
2200 /* The OOM killer does not needlessly kill tasks for lowmem */ 2198 /* The OOM killer does not needlessly kill tasks for lowmem */
2201 if (high_zoneidx < ZONE_NORMAL) 2199 if (high_zoneidx < ZONE_NORMAL)
2202 goto out; 2200 goto out;
2203 /* 2201 /*
2204 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2202 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2205 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2203 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2206 * The caller should handle page allocation failure by itself if 2204 * The caller should handle page allocation failure by itself if
2207 * it specifies __GFP_THISNODE. 2205 * it specifies __GFP_THISNODE.
2208 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2206 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2209 */ 2207 */
2210 if (gfp_mask & __GFP_THISNODE) 2208 if (gfp_mask & __GFP_THISNODE)
2211 goto out; 2209 goto out;
2212 } 2210 }
2213 /* Exhausted what can be done so it's blamo time */ 2211 /* Exhausted what can be done so it's blamo time */
2214 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2212 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2215 2213
2216 out: 2214 out:
2217 clear_zonelist_oom(zonelist, gfp_mask); 2215 clear_zonelist_oom(zonelist, gfp_mask);
2218 return page; 2216 return page;
2219 } 2217 }
2220 2218
2221 #ifdef CONFIG_COMPACTION 2219 #ifdef CONFIG_COMPACTION
2222 /* Try memory compaction for high-order allocations before reclaim */ 2220 /* Try memory compaction for high-order allocations before reclaim */
2223 static struct page * 2221 static struct page *
2224 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2222 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2225 struct zonelist *zonelist, enum zone_type high_zoneidx, 2223 struct zonelist *zonelist, enum zone_type high_zoneidx,
2226 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2224 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2227 int migratetype, enum migrate_mode mode, 2225 int classzone_idx, int migratetype, enum migrate_mode mode,
2228 bool *contended_compaction, bool *deferred_compaction, 2226 bool *contended_compaction, bool *deferred_compaction,
2229 unsigned long *did_some_progress) 2227 unsigned long *did_some_progress)
2230 { 2228 {
2231 if (!order) 2229 if (!order)
2232 return NULL; 2230 return NULL;
2233 2231
2234 if (compaction_deferred(preferred_zone, order)) { 2232 if (compaction_deferred(preferred_zone, order)) {
2235 *deferred_compaction = true; 2233 *deferred_compaction = true;
2236 return NULL; 2234 return NULL;
2237 } 2235 }
2238 2236
2239 current->flags |= PF_MEMALLOC; 2237 current->flags |= PF_MEMALLOC;
2240 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2238 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2241 nodemask, mode, 2239 nodemask, mode,
2242 contended_compaction); 2240 contended_compaction);
2243 current->flags &= ~PF_MEMALLOC; 2241 current->flags &= ~PF_MEMALLOC;
2244 2242
2245 if (*did_some_progress != COMPACT_SKIPPED) { 2243 if (*did_some_progress != COMPACT_SKIPPED) {
2246 struct page *page; 2244 struct page *page;
2247 2245
2248 /* Page migration frees to the PCP lists but we want merging */ 2246 /* Page migration frees to the PCP lists but we want merging */
2249 drain_pages(get_cpu()); 2247 drain_pages(get_cpu());
2250 put_cpu(); 2248 put_cpu();
2251 2249
2252 page = get_page_from_freelist(gfp_mask, nodemask, 2250 page = get_page_from_freelist(gfp_mask, nodemask,
2253 order, zonelist, high_zoneidx, 2251 order, zonelist, high_zoneidx,
2254 alloc_flags & ~ALLOC_NO_WATERMARKS, 2252 alloc_flags & ~ALLOC_NO_WATERMARKS,
2255 preferred_zone, migratetype); 2253 preferred_zone, classzone_idx, migratetype);
2256 if (page) { 2254 if (page) {
2257 preferred_zone->compact_blockskip_flush = false; 2255 preferred_zone->compact_blockskip_flush = false;
2258 compaction_defer_reset(preferred_zone, order, true); 2256 compaction_defer_reset(preferred_zone, order, true);
2259 count_vm_event(COMPACTSUCCESS); 2257 count_vm_event(COMPACTSUCCESS);
2260 return page; 2258 return page;
2261 } 2259 }
2262 2260
2263 /* 2261 /*
2264 * It's bad if a compaction run occurs and fails. 2262 * It's bad if a compaction run occurs and fails.
2265 * The most likely reason is that pages exist, 2263 * The most likely reason is that pages exist,
2266 * but not enough to satisfy watermarks. 2264 * but not enough to satisfy watermarks.
2267 */ 2265 */
2268 count_vm_event(COMPACTFAIL); 2266 count_vm_event(COMPACTFAIL);
2269 2267
2270 /* 2268 /*
2271 * As async compaction considers a subset of pageblocks, only 2269 * As async compaction considers a subset of pageblocks, only
2272 * defer if the failure was a sync compaction failure. 2270 * defer if the failure was a sync compaction failure.
2273 */ 2271 */
2274 if (mode != MIGRATE_ASYNC) 2272 if (mode != MIGRATE_ASYNC)
2275 defer_compaction(preferred_zone, order); 2273 defer_compaction(preferred_zone, order);
2276 2274
2277 cond_resched(); 2275 cond_resched();
2278 } 2276 }
2279 2277
2280 return NULL; 2278 return NULL;
2281 } 2279 }
2282 #else 2280 #else
2283 static inline struct page * 2281 static inline struct page *
2284 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2282 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2285 struct zonelist *zonelist, enum zone_type high_zoneidx, 2283 struct zonelist *zonelist, enum zone_type high_zoneidx,
2286 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2284 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2287 int migratetype, enum migrate_mode mode, bool *contended_compaction, 2285 int classzone_idx, int migratetype,
2286 enum migrate_mode mode, bool *contended_compaction,
2288 bool *deferred_compaction, unsigned long *did_some_progress) 2287 bool *deferred_compaction, unsigned long *did_some_progress)
2289 { 2288 {
2290 return NULL; 2289 return NULL;
2291 } 2290 }
2292 #endif /* CONFIG_COMPACTION */ 2291 #endif /* CONFIG_COMPACTION */
2293 2292
2294 /* Perform direct synchronous page reclaim */ 2293 /* Perform direct synchronous page reclaim */
2295 static int 2294 static int
2296 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2295 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2297 nodemask_t *nodemask) 2296 nodemask_t *nodemask)
2298 { 2297 {
2299 struct reclaim_state reclaim_state; 2298 struct reclaim_state reclaim_state;
2300 int progress; 2299 int progress;
2301 2300
2302 cond_resched(); 2301 cond_resched();
2303 2302
2304 /* We now go into synchronous reclaim */ 2303 /* We now go into synchronous reclaim */
2305 cpuset_memory_pressure_bump(); 2304 cpuset_memory_pressure_bump();
2306 current->flags |= PF_MEMALLOC; 2305 current->flags |= PF_MEMALLOC;
2307 lockdep_set_current_reclaim_state(gfp_mask); 2306 lockdep_set_current_reclaim_state(gfp_mask);
2308 reclaim_state.reclaimed_slab = 0; 2307 reclaim_state.reclaimed_slab = 0;
2309 current->reclaim_state = &reclaim_state; 2308 current->reclaim_state = &reclaim_state;
2310 2309
2311 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2310 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2312 2311
2313 current->reclaim_state = NULL; 2312 current->reclaim_state = NULL;
2314 lockdep_clear_current_reclaim_state(); 2313 lockdep_clear_current_reclaim_state();
2315 current->flags &= ~PF_MEMALLOC; 2314 current->flags &= ~PF_MEMALLOC;
2316 2315
2317 cond_resched(); 2316 cond_resched();
2318 2317
2319 return progress; 2318 return progress;
2320 } 2319 }
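The PF_MEMALLOC bracketing in __perform_reclaim() is what keeps direct reclaim from recursing into itself: the flag is set before entering reclaim and cleared on the way out, and any allocation made while it is set avoids entering direct reclaim again. A tiny sketch of that pattern; the flag word and helper names are illustrative only.

#include <stdio.h>

#define PF_MEMALLOC 0x1u

static unsigned int task_flags;   /* stand-in for current->flags */

static long nested_alloc(void)
{
	/* An allocation made while reclaiming sees PF_MEMALLOC and skips direct reclaim. */
	if (task_flags & PF_MEMALLOC) {
		printf("nested allocation: PF_MEMALLOC set, skipping direct reclaim\n");
		return 0;
	}
	return 1;
}

static long do_reclaim(void)
{
	nested_alloc();         /* e.g. reclaim itself needs a small allocation */
	return 42;              /* pretend we reclaimed 42 pages */
}

static long perform_reclaim(void)
{
	long progress;

	task_flags |= PF_MEMALLOC;      /* mark: we are the reclaimer */
	progress = do_reclaim();
	task_flags &= ~PF_MEMALLOC;     /* unconditional clear, mirroring the code above */
	return progress;
}

int main(void)
{
	printf("reclaimed: %ld\n", perform_reclaim());
	return 0;
}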
2321 2320
2322 /* The really slow allocator path where we enter direct reclaim */ 2321 /* The really slow allocator path where we enter direct reclaim */
2323 static inline struct page * 2322 static inline struct page *
2324 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2323 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2325 struct zonelist *zonelist, enum zone_type high_zoneidx, 2324 struct zonelist *zonelist, enum zone_type high_zoneidx,
2326 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2325 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2327 int migratetype, unsigned long *did_some_progress) 2326 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2328 { 2327 {
2329 struct page *page = NULL; 2328 struct page *page = NULL;
2330 bool drained = false; 2329 bool drained = false;
2331 2330
2332 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2331 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2333 nodemask); 2332 nodemask);
2334 if (unlikely(!(*did_some_progress))) 2333 if (unlikely(!(*did_some_progress)))
2335 return NULL; 2334 return NULL;
2336 2335
2337 /* After successful reclaim, reconsider all zones for allocation */ 2336 /* After successful reclaim, reconsider all zones for allocation */
2338 if (IS_ENABLED(CONFIG_NUMA)) 2337 if (IS_ENABLED(CONFIG_NUMA))
2339 zlc_clear_zones_full(zonelist); 2338 zlc_clear_zones_full(zonelist);
2340 2339
2341 retry: 2340 retry:
2342 page = get_page_from_freelist(gfp_mask, nodemask, order, 2341 page = get_page_from_freelist(gfp_mask, nodemask, order,
2343 zonelist, high_zoneidx, 2342 zonelist, high_zoneidx,
2344 alloc_flags & ~ALLOC_NO_WATERMARKS, 2343 alloc_flags & ~ALLOC_NO_WATERMARKS,
2345 preferred_zone, migratetype); 2344 preferred_zone, classzone_idx,
2345 migratetype);
2346 2346
2347 /* 2347 /*
2348 * If an allocation failed after direct reclaim, it could be because 2348 * If an allocation failed after direct reclaim, it could be because
2349 * pages are pinned on the per-cpu lists. Drain them and try again 2349 * pages are pinned on the per-cpu lists. Drain them and try again
2350 */ 2350 */
2351 if (!page && !drained) { 2351 if (!page && !drained) {
2352 drain_all_pages(); 2352 drain_all_pages();
2353 drained = true; 2353 drained = true;
2354 goto retry; 2354 goto retry;
2355 } 2355 }
2356 2356
2357 return page; 2357 return page;
2358 } 2358 }
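__alloc_pages_direct_reclaim() uses a "try once, drain the per-CPU lists, then try exactly once more" shape after reclaim has made progress. A minimal sketch of that control flow with hypothetical allocation and drain stubs:

#include <stdbool.h>
#include <stdio.h>

static int pcp_cached = 3;      /* pages sitting on a per-cpu list (stand-in) */
static int free_pages;          /* pages visible to the allocator */

static bool try_alloc(void)
{
	if (free_pages > 0) {
		free_pages--;
		return true;
	}
	return false;
}

static void drain_all_pages_stub(void)
{
	free_pages += pcp_cached;
	pcp_cached = 0;
}

static bool alloc_after_reclaim(void)
{
	bool drained = false;
	bool ok;

retry:
	ok = try_alloc();
	if (!ok && !drained) {  /* first failure: drain pinned per-cpu pages, retry once */
		drain_all_pages_stub();
		drained = true;
		goto retry;
	}
	return ok;
}

int main(void)
{
	printf("allocation after reclaim succeeded: %d\n", alloc_after_reclaim());
	return 0;
}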
2359 2359
2360 /* 2360 /*
2361 * This is called in the allocator slow-path if the allocation request is of 2361 * This is called in the allocator slow-path if the allocation request is of
2362 * sufficient urgency to ignore watermarks and take other desperate measures 2362 * sufficient urgency to ignore watermarks and take other desperate measures
2363 */ 2363 */
2364 static inline struct page * 2364 static inline struct page *
2365 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2365 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2366 struct zonelist *zonelist, enum zone_type high_zoneidx, 2366 struct zonelist *zonelist, enum zone_type high_zoneidx,
2367 nodemask_t *nodemask, struct zone *preferred_zone, 2367 nodemask_t *nodemask, struct zone *preferred_zone,
2368 int migratetype) 2368 int classzone_idx, int migratetype)
2369 { 2369 {
2370 struct page *page; 2370 struct page *page;
2371 2371
2372 do { 2372 do {
2373 page = get_page_from_freelist(gfp_mask, nodemask, order, 2373 page = get_page_from_freelist(gfp_mask, nodemask, order,
2374 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2374 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2375 preferred_zone, migratetype); 2375 preferred_zone, classzone_idx, migratetype);
2376 2376
2377 if (!page && gfp_mask & __GFP_NOFAIL) 2377 if (!page && gfp_mask & __GFP_NOFAIL)
2378 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2378 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2379 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2379 } while (!page && (gfp_mask & __GFP_NOFAIL));
2380 2380
2381 return page; 2381 return page;
2382 } 2382 }
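__alloc_pages_high_priority() is the __GFP_NOFAIL backstop: it keeps retrying the no-watermark allocation, throttling on congestion between attempts, and only returns NULL when the caller tolerates failure. A small sketch of that retry-with-backoff shape, where a plain sleep stands in for wait_iff_congested():

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static int attempts_left = 3;   /* pretend the third try succeeds */

static bool try_no_watermark_alloc(void)
{
	return --attempts_left == 0;
}

static bool high_priority_alloc(bool nofail)
{
	bool ok;

	do {
		ok = try_no_watermark_alloc();
		if (!ok && nofail)
			usleep(20000);  /* stand-in for wait_iff_congested(..., HZ/50) */
	} while (!ok && nofail);

	return ok;
}

int main(void)
{
	printf("__GFP_NOFAIL-style allocation succeeded: %d\n", high_priority_alloc(true));
	return 0;
}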
2383 2383
2384 static void reset_alloc_batches(struct zonelist *zonelist, 2384 static void reset_alloc_batches(struct zonelist *zonelist,
2385 enum zone_type high_zoneidx, 2385 enum zone_type high_zoneidx,
2386 struct zone *preferred_zone) 2386 struct zone *preferred_zone)
2387 { 2387 {
2388 struct zoneref *z; 2388 struct zoneref *z;
2389 struct zone *zone; 2389 struct zone *zone;
2390 2390
2391 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2391 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2392 /* 2392 /*
2393 * Only reset the batches of zones that were actually 2393 * Only reset the batches of zones that were actually
2394 * considered in the fairness pass, we don't want to 2394 * considered in the fairness pass, we don't want to
2395 * trash fairness information for zones that are not 2395 * trash fairness information for zones that are not
2396 * actually part of this zonelist's round-robin cycle. 2396 * actually part of this zonelist's round-robin cycle.
2397 */ 2397 */
2398 if (!zone_local(preferred_zone, zone)) 2398 if (!zone_local(preferred_zone, zone))
2399 continue; 2399 continue;
2400 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2400 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2401 high_wmark_pages(zone) - low_wmark_pages(zone) - 2401 high_wmark_pages(zone) - low_wmark_pages(zone) -
2402 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2402 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2403 } 2403 }
2404 } 2404 }
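reset_alloc_batches() simply tops NR_ALLOC_BATCH back up to (high watermark - low watermark): whatever is left in the counter is subtracted out so the new value lands exactly on that target. A one-shot worked example of the arithmetic, with made-up numbers:

#include <stdio.h>

int main(void)
{
	long high_wmark = 1200, low_wmark = 1000;   /* illustrative per-zone watermarks */
	long batch = -37;                           /* NR_ALLOC_BATCH after the fair pass */

	/* delta = target - current, so batch + delta == high - low afterwards. */
	long delta = (high_wmark - low_wmark) - batch;

	batch += delta;
	printf("delta applied: %ld, NR_ALLOC_BATCH now: %ld\n", delta, batch);  /* 237, 200 */
	return 0;
}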
2405 2405
2406 static void wake_all_kswapds(unsigned int order, 2406 static void wake_all_kswapds(unsigned int order,
2407 struct zonelist *zonelist, 2407 struct zonelist *zonelist,
2408 enum zone_type high_zoneidx, 2408 enum zone_type high_zoneidx,
2409 struct zone *preferred_zone) 2409 struct zone *preferred_zone)
2410 { 2410 {
2411 struct zoneref *z; 2411 struct zoneref *z;
2412 struct zone *zone; 2412 struct zone *zone;
2413 2413
2414 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2414 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2415 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2415 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2416 } 2416 }
2417 2417
2418 static inline int 2418 static inline int
2419 gfp_to_alloc_flags(gfp_t gfp_mask) 2419 gfp_to_alloc_flags(gfp_t gfp_mask)
2420 { 2420 {
2421 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2421 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2422 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); 2422 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
2423 2423
2424 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2424 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2425 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2425 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2426 2426
2427 /* 2427 /*
2428 * The caller may dip into page reserves a bit more if the caller 2428 * The caller may dip into page reserves a bit more if the caller
2429 * cannot run direct reclaim, or if the caller has realtime scheduling 2429 * cannot run direct reclaim, or if the caller has realtime scheduling
2430 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2430 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2431 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 2431 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
2432 */ 2432 */
2433 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2433 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2434 2434
2435 if (atomic) { 2435 if (atomic) {
2436 /* 2436 /*
2437 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 2437 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
2438 * if it can't schedule. 2438 * if it can't schedule.
2439 */ 2439 */
2440 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2440 if (!(gfp_mask & __GFP_NOMEMALLOC))
2441 alloc_flags |= ALLOC_HARDER; 2441 alloc_flags |= ALLOC_HARDER;
2442 /* 2442 /*
2443 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2443 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2444 * comment for __cpuset_node_allowed_softwall(). 2444 * comment for __cpuset_node_allowed_softwall().
2445 */ 2445 */
2446 alloc_flags &= ~ALLOC_CPUSET; 2446 alloc_flags &= ~ALLOC_CPUSET;
2447 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2447 } else if (unlikely(rt_task(current)) && !in_interrupt())
2448 alloc_flags |= ALLOC_HARDER; 2448 alloc_flags |= ALLOC_HARDER;
2449 2449
2450 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2450 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2451 if (gfp_mask & __GFP_MEMALLOC) 2451 if (gfp_mask & __GFP_MEMALLOC)
2452 alloc_flags |= ALLOC_NO_WATERMARKS; 2452 alloc_flags |= ALLOC_NO_WATERMARKS;
2453 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2453 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2454 alloc_flags |= ALLOC_NO_WATERMARKS; 2454 alloc_flags |= ALLOC_NO_WATERMARKS;
2455 else if (!in_interrupt() && 2455 else if (!in_interrupt() &&
2456 ((current->flags & PF_MEMALLOC) || 2456 ((current->flags & PF_MEMALLOC) ||
2457 unlikely(test_thread_flag(TIF_MEMDIE)))) 2457 unlikely(test_thread_flag(TIF_MEMDIE))))
2458 alloc_flags |= ALLOC_NO_WATERMARKS; 2458 alloc_flags |= ALLOC_NO_WATERMARKS;
2459 } 2459 }
2460 #ifdef CONFIG_CMA 2460 #ifdef CONFIG_CMA
2461 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2461 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2462 alloc_flags |= ALLOC_CMA; 2462 alloc_flags |= ALLOC_CMA;
2463 #endif 2463 #endif
2464 return alloc_flags; 2464 return alloc_flags;
2465 } 2465 }
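Read as a mapping, gfp_to_alloc_flags() turns a GFP mask into watermark and behaviour bits; for instance a GFP_ATOMIC-style request (__GFP_HIGH set, __GFP_WAIT clear) ends up with ALLOC_WMARK_MIN, ALLOC_HIGH and ALLOC_HARDER, and drops ALLOC_CPUSET. The stripped-down sketch below mirrors only that core mapping; the bit values are illustrative and differ from the kernel's real ALLOC_* and GFP constants.

#include <stdio.h>

#define ALLOC_WMARK_MIN 0x01u
#define ALLOC_CPUSET    0x02u
#define ALLOC_HIGH      0x04u
#define ALLOC_HARDER    0x08u

#define GFP_WAIT        0x10u
#define GFP_HIGH        0x20u
#define GFP_NOMEMALLOC  0x40u

static unsigned int to_alloc_flags(unsigned int gfp)
{
	unsigned int flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
	int atomic = !(gfp & GFP_WAIT);

	if (gfp & GFP_HIGH)
		flags |= ALLOC_HIGH;
	if (atomic) {
		if (!(gfp & GFP_NOMEMALLOC))
			flags |= ALLOC_HARDER;
		flags &= ~ALLOC_CPUSET;         /* ignore cpuset mems rather than fail */
	}
	return flags;
}

int main(void)
{
	printf("atomic-style request -> alloc_flags 0x%x\n", to_alloc_flags(GFP_HIGH));
	printf("sleepable request    -> alloc_flags 0x%x\n", to_alloc_flags(GFP_WAIT));
	return 0;
}

The omitted cases (realtime tasks, __GFP_MEMALLOC, PF_MEMALLOC and TIF_MEMDIE granting ALLOC_NO_WATERMARKS, and CMA) follow the same pattern of adding or stripping bits based on the caller's context.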
2466 2466
2467 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2467 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2468 { 2468 {
2469 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2469 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2470 } 2470 }
2471 2471
2472 static inline struct page * 2472 static inline struct page *
2473 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2473 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2474 struct zonelist *zonelist, enum zone_type high_zoneidx, 2474 struct zonelist *zonelist, enum zone_type high_zoneidx,
2475 nodemask_t *nodemask, struct zone *preferred_zone, 2475 nodemask_t *nodemask, struct zone *preferred_zone,
2476 int migratetype) 2476 int classzone_idx, int migratetype)
2477 { 2477 {
2478 const gfp_t wait = gfp_mask & __GFP_WAIT; 2478 const gfp_t wait = gfp_mask & __GFP_WAIT;
2479 struct page *page = NULL; 2479 struct page *page = NULL;
2480 int alloc_flags; 2480 int alloc_flags;
2481 unsigned long pages_reclaimed = 0; 2481 unsigned long pages_reclaimed = 0;
2482 unsigned long did_some_progress; 2482 unsigned long did_some_progress;
2483 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2483 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2484 bool deferred_compaction = false; 2484 bool deferred_compaction = false;
2485 bool contended_compaction = false; 2485 bool contended_compaction = false;
2486 2486
2487 /* 2487 /*
2488 * In the slowpath, we sanity check order to avoid ever trying to 2488 * In the slowpath, we sanity check order to avoid ever trying to
2489 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2489 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2490 * be using allocators in order of preference for an area that is 2490 * be using allocators in order of preference for an area that is
2491 * too large. 2491 * too large.
2492 */ 2492 */
2493 if (order >= MAX_ORDER) { 2493 if (order >= MAX_ORDER) {
2494 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2494 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2495 return NULL; 2495 return NULL;
2496 } 2496 }
2497 2497
2498 /* 2498 /*
2499 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2499 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2500 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2500 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2501 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2501 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2502 * using a larger set of nodes after it has established that the 2502 * using a larger set of nodes after it has established that the
2503 * allowed per node queues are empty and that nodes are 2503 * allowed per node queues are empty and that nodes are
2504 * over allocated. 2504 * over allocated.
2505 */ 2505 */
2506 if (IS_ENABLED(CONFIG_NUMA) && 2506 if (IS_ENABLED(CONFIG_NUMA) &&
2507 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2507 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2508 goto nopage; 2508 goto nopage;
2509 2509
2510 restart: 2510 restart:
2511 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2511 if (!(gfp_mask & __GFP_NO_KSWAPD))
2512 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2512 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
2513 2513
2514 /* 2514 /*
2515 * OK, we're below the kswapd watermark and have kicked background 2515 * OK, we're below the kswapd watermark and have kicked background
2516 * reclaim. Now things get more complex, so set up alloc_flags according 2516 * reclaim. Now things get more complex, so set up alloc_flags according
2517 * to how we want to proceed. 2517 * to how we want to proceed.
2518 */ 2518 */
2519 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2519 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2520 2520
2521 /* 2521 /*
2522 * Find the true preferred zone if the allocation is unconstrained by 2522 * Find the true preferred zone if the allocation is unconstrained by
2523 * cpusets. 2523 * cpusets.
2524 */ 2524 */
2525 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2525 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
2526 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2526 struct zoneref *preferred_zoneref;
2527 &preferred_zone); 2527 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2528 NULL,
2529 &preferred_zone);
2530 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2531 }
2528 2532
2529 rebalance: 2533 rebalance:
2530 /* This is the last chance, in general, before the goto nopage. */ 2534 /* This is the last chance, in general, before the goto nopage. */
2531 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2535 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2532 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2536 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2533 preferred_zone, migratetype); 2537 preferred_zone, classzone_idx, migratetype);
2534 if (page) 2538 if (page)
2535 goto got_pg; 2539 goto got_pg;
2536 2540
2537 /* Allocate without watermarks if the context allows */ 2541 /* Allocate without watermarks if the context allows */
2538 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2542 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2539 /* 2543 /*
2540 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2544 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2541 * the allocation is high priority and these types of 2545 * the allocation is high priority and these types of
2542 * allocations are system rather than user oriented 2546 * allocations are system rather than user oriented
2543 */ 2547 */
2544 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2548 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2545 2549
2546 page = __alloc_pages_high_priority(gfp_mask, order, 2550 page = __alloc_pages_high_priority(gfp_mask, order,
2547 zonelist, high_zoneidx, nodemask, 2551 zonelist, high_zoneidx, nodemask,
2548 preferred_zone, migratetype); 2552 preferred_zone, classzone_idx, migratetype);
2549 if (page) { 2553 if (page) {
2550 goto got_pg; 2554 goto got_pg;
2551 } 2555 }
2552 } 2556 }
2553 2557
2554 /* Atomic allocations - we can't balance anything */ 2558 /* Atomic allocations - we can't balance anything */
2555 if (!wait) 2559 if (!wait)
2556 goto nopage; 2560 goto nopage;
2557 2561
2558 /* Avoid recursion of direct reclaim */ 2562 /* Avoid recursion of direct reclaim */
2559 if (current->flags & PF_MEMALLOC) 2563 if (current->flags & PF_MEMALLOC)
2560 goto nopage; 2564 goto nopage;
2561 2565
2562 /* Avoid allocations with no watermarks from looping endlessly */ 2566 /* Avoid allocations with no watermarks from looping endlessly */
2563 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2567 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2564 goto nopage; 2568 goto nopage;
2565 2569
2566 /* 2570 /*
2567 * Try direct compaction. The first pass is asynchronous. Subsequent 2571 * Try direct compaction. The first pass is asynchronous. Subsequent
2568 * attempts after direct reclaim are synchronous 2572 * attempts after direct reclaim are synchronous
2569 */ 2573 */
2570 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2574 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2571 high_zoneidx, nodemask, alloc_flags, 2575 high_zoneidx, nodemask, alloc_flags,
2572 preferred_zone, migratetype, 2576 preferred_zone,
2577 classzone_idx, migratetype,
2573 migration_mode, &contended_compaction, 2578 migration_mode, &contended_compaction,
2574 &deferred_compaction, 2579 &deferred_compaction,
2575 &did_some_progress); 2580 &did_some_progress);
2576 if (page) 2581 if (page)
2577 goto got_pg; 2582 goto got_pg;
2578 migration_mode = MIGRATE_SYNC_LIGHT; 2583 migration_mode = MIGRATE_SYNC_LIGHT;
2579 2584
2580 /* 2585 /*
2581 * If compaction is deferred for high-order allocations, it is because 2586 * If compaction is deferred for high-order allocations, it is because
2582 * sync compaction recently failed. If this is the case and the caller 2587 * sync compaction recently failed. If this is the case and the caller
2583 * requested a movable allocation that does not heavily disrupt the 2588 * requested a movable allocation that does not heavily disrupt the
2584 * system then fail the allocation instead of entering direct reclaim. 2589 * system then fail the allocation instead of entering direct reclaim.
2585 */ 2590 */
2586 if ((deferred_compaction || contended_compaction) && 2591 if ((deferred_compaction || contended_compaction) &&
2587 (gfp_mask & __GFP_NO_KSWAPD)) 2592 (gfp_mask & __GFP_NO_KSWAPD))
2588 goto nopage; 2593 goto nopage;
2589 2594
2590 /* Try direct reclaim and then allocating */ 2595 /* Try direct reclaim and then allocating */
2591 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2596 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2592 zonelist, high_zoneidx, 2597 zonelist, high_zoneidx,
2593 nodemask, 2598 nodemask,
2594 alloc_flags, preferred_zone, 2599 alloc_flags, preferred_zone,
2595 migratetype, &did_some_progress); 2600 classzone_idx, migratetype,
2601 &did_some_progress);
2596 if (page) 2602 if (page)
2597 goto got_pg; 2603 goto got_pg;
2598 2604
2599 /* 2605 /*
2600 * If we failed to make any progress reclaiming, then we are 2606 * If we failed to make any progress reclaiming, then we are
2601 * running out of options and have to consider going OOM 2607 * running out of options and have to consider going OOM
2602 */ 2608 */
2603 if (!did_some_progress) { 2609 if (!did_some_progress) {
2604 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2610 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2605 if (oom_killer_disabled) 2611 if (oom_killer_disabled)
2606 goto nopage; 2612 goto nopage;
2607 /* Coredumps can quickly deplete all memory reserves */ 2613 /* Coredumps can quickly deplete all memory reserves */
2608 if ((current->flags & PF_DUMPCORE) && 2614 if ((current->flags & PF_DUMPCORE) &&
2609 !(gfp_mask & __GFP_NOFAIL)) 2615 !(gfp_mask & __GFP_NOFAIL))
2610 goto nopage; 2616 goto nopage;
2611 page = __alloc_pages_may_oom(gfp_mask, order, 2617 page = __alloc_pages_may_oom(gfp_mask, order,
2612 zonelist, high_zoneidx, 2618 zonelist, high_zoneidx,
2613 nodemask, preferred_zone, 2619 nodemask, preferred_zone,
2614 migratetype); 2620 classzone_idx, migratetype);
2615 if (page) 2621 if (page)
2616 goto got_pg; 2622 goto got_pg;
2617 2623
2618 if (!(gfp_mask & __GFP_NOFAIL)) { 2624 if (!(gfp_mask & __GFP_NOFAIL)) {
2619 /* 2625 /*
2620 * The oom killer is not called for high-order 2626 * The oom killer is not called for high-order
2621 * allocations that may fail, so if no progress 2627 * allocations that may fail, so if no progress
2622 * is being made, there are no other options and 2628 * is being made, there are no other options and
2623 * retrying is unlikely to help. 2629 * retrying is unlikely to help.
2624 */ 2630 */
2625 if (order > PAGE_ALLOC_COSTLY_ORDER) 2631 if (order > PAGE_ALLOC_COSTLY_ORDER)
2626 goto nopage; 2632 goto nopage;
2627 /* 2633 /*
2628 * The oom killer is not called for lowmem 2634 * The oom killer is not called for lowmem
2629 * allocations to prevent needlessly killing 2635 * allocations to prevent needlessly killing
2630 * innocent tasks. 2636 * innocent tasks.
2631 */ 2637 */
2632 if (high_zoneidx < ZONE_NORMAL) 2638 if (high_zoneidx < ZONE_NORMAL)
2633 goto nopage; 2639 goto nopage;
2634 } 2640 }
2635 2641
2636 goto restart; 2642 goto restart;
2637 } 2643 }
2638 } 2644 }
2639 2645
2640 /* Check if we should retry the allocation */ 2646 /* Check if we should retry the allocation */
2641 pages_reclaimed += did_some_progress; 2647 pages_reclaimed += did_some_progress;
2642 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2648 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2643 pages_reclaimed)) { 2649 pages_reclaimed)) {
2644 /* Wait for some write requests to complete then retry */ 2650 /* Wait for some write requests to complete then retry */
2645 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2651 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2646 goto rebalance; 2652 goto rebalance;
2647 } else { 2653 } else {
2648 /* 2654 /*
2649 * High-order allocations do not necessarily loop after 2655 * High-order allocations do not necessarily loop after
2650 * direct reclaim and reclaim/compaction depends on compaction 2656 * direct reclaim and reclaim/compaction depends on compaction
2651 * being called after reclaim so call directly if necessary 2657 * being called after reclaim so call directly if necessary
2652 */ 2658 */
2653 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2659 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2654 high_zoneidx, nodemask, alloc_flags, 2660 high_zoneidx, nodemask, alloc_flags,
2655 preferred_zone, migratetype, 2661 preferred_zone,
2662 classzone_idx, migratetype,
2656 migration_mode, &contended_compaction, 2663 migration_mode, &contended_compaction,
2657 &deferred_compaction, 2664 &deferred_compaction,
2658 &did_some_progress); 2665 &did_some_progress);
2659 if (page) 2666 if (page)
2660 goto got_pg; 2667 goto got_pg;
2661 } 2668 }
2662 2669
2663 nopage: 2670 nopage:
2664 warn_alloc_failed(gfp_mask, order, NULL); 2671 warn_alloc_failed(gfp_mask, order, NULL);
2665 return page; 2672 return page;
2666 got_pg: 2673 got_pg:
2667 if (kmemcheck_enabled) 2674 if (kmemcheck_enabled)
2668 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2675 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2669 2676
2670 return page; 2677 return page;
2671 } 2678 }
2672 2679
2673 /* 2680 /*
2674 * This is the 'heart' of the zoned buddy allocator. 2681 * This is the 'heart' of the zoned buddy allocator.
2675 */ 2682 */
2676 struct page * 2683 struct page *
2677 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2684 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2678 struct zonelist *zonelist, nodemask_t *nodemask) 2685 struct zonelist *zonelist, nodemask_t *nodemask)
2679 { 2686 {
2680 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2687 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2681 struct zone *preferred_zone; 2688 struct zone *preferred_zone;
2689 struct zoneref *preferred_zoneref;
2682 struct page *page = NULL; 2690 struct page *page = NULL;
2683 int migratetype = allocflags_to_migratetype(gfp_mask); 2691 int migratetype = allocflags_to_migratetype(gfp_mask);
2684 unsigned int cpuset_mems_cookie; 2692 unsigned int cpuset_mems_cookie;
2685 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2693 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2686 struct mem_cgroup *memcg = NULL; 2694 struct mem_cgroup *memcg = NULL;
2695 int classzone_idx;
2687 2696
2688 gfp_mask &= gfp_allowed_mask; 2697 gfp_mask &= gfp_allowed_mask;
2689 2698
2690 lockdep_trace_alloc(gfp_mask); 2699 lockdep_trace_alloc(gfp_mask);
2691 2700
2692 might_sleep_if(gfp_mask & __GFP_WAIT); 2701 might_sleep_if(gfp_mask & __GFP_WAIT);
2693 2702
2694 if (should_fail_alloc_page(gfp_mask, order)) 2703 if (should_fail_alloc_page(gfp_mask, order))
2695 return NULL; 2704 return NULL;
2696 2705
2697 /* 2706 /*
2698 * Check the zones suitable for the gfp_mask contain at least one 2707 * Check the zones suitable for the gfp_mask contain at least one
2699 * valid zone. It's possible to have an empty zonelist as a result 2708 * valid zone. It's possible to have an empty zonelist as a result
2700 * of GFP_THISNODE and a memoryless node 2709 * of GFP_THISNODE and a memoryless node
2701 */ 2710 */
2702 if (unlikely(!zonelist->_zonerefs->zone)) 2711 if (unlikely(!zonelist->_zonerefs->zone))
2703 return NULL; 2712 return NULL;
2704 2713
2705 /* 2714 /*
2706 * Will only have any effect when __GFP_KMEMCG is set. This is 2715 * Will only have any effect when __GFP_KMEMCG is set. This is
2707 * verified in the (always inline) callee 2716 * verified in the (always inline) callee
2708 */ 2717 */
2709 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2718 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2710 return NULL; 2719 return NULL;
2711 2720
2712 retry_cpuset: 2721 retry_cpuset:
2713 cpuset_mems_cookie = read_mems_allowed_begin(); 2722 cpuset_mems_cookie = read_mems_allowed_begin();
2714 2723
2715 /* The preferred zone is used for statistics later */ 2724 /* The preferred zone is used for statistics later */
2716 first_zones_zonelist(zonelist, high_zoneidx, 2725 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2717 nodemask ? : &cpuset_current_mems_allowed, 2726 nodemask ? : &cpuset_current_mems_allowed,
2718 &preferred_zone); 2727 &preferred_zone);
2719 if (!preferred_zone) 2728 if (!preferred_zone)
2720 goto out; 2729 goto out;
2730 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2721 2731
2722 #ifdef CONFIG_CMA 2732 #ifdef CONFIG_CMA
2723 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2733 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2724 alloc_flags |= ALLOC_CMA; 2734 alloc_flags |= ALLOC_CMA;
2725 #endif 2735 #endif
2726 retry: 2736 retry:
2727 /* First allocation attempt */ 2737 /* First allocation attempt */
2728 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2738 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2729 zonelist, high_zoneidx, alloc_flags, 2739 zonelist, high_zoneidx, alloc_flags,
2730 preferred_zone, migratetype); 2740 preferred_zone, classzone_idx, migratetype);
2731 if (unlikely(!page)) { 2741 if (unlikely(!page)) {
2732 /* 2742 /*
2733 * The first pass makes sure allocations are spread 2743 * The first pass makes sure allocations are spread
2734 * fairly within the local node. However, the local 2744 * fairly within the local node. However, the local
2735 * node might have free pages left after the fairness 2745 * node might have free pages left after the fairness
2736 * batches are exhausted, and remote zones haven't 2746 * batches are exhausted, and remote zones haven't
2737 * even been considered yet. Try once more without 2747 * even been considered yet. Try once more without
2738 * fairness, and include remote zones now, before 2748 * fairness, and include remote zones now, before
2739 * entering the slowpath and waking kswapd: prefer 2749 * entering the slowpath and waking kswapd: prefer
2740 * spilling to a remote zone over swapping locally. 2750 * spilling to a remote zone over swapping locally.
2741 */ 2751 */
2742 if (alloc_flags & ALLOC_FAIR) { 2752 if (alloc_flags & ALLOC_FAIR) {
2743 reset_alloc_batches(zonelist, high_zoneidx, 2753 reset_alloc_batches(zonelist, high_zoneidx,
2744 preferred_zone); 2754 preferred_zone);
2745 alloc_flags &= ~ALLOC_FAIR; 2755 alloc_flags &= ~ALLOC_FAIR;
2746 goto retry; 2756 goto retry;
2747 } 2757 }
2748 /* 2758 /*
2749 * Runtime PM, block IO and its error handling path 2759 * Runtime PM, block IO and its error handling path
2750 * can deadlock because I/O on the device might not 2760 * can deadlock because I/O on the device might not
2751 * complete. 2761 * complete.
2752 */ 2762 */
2753 gfp_mask = memalloc_noio_flags(gfp_mask); 2763 gfp_mask = memalloc_noio_flags(gfp_mask);
2754 page = __alloc_pages_slowpath(gfp_mask, order, 2764 page = __alloc_pages_slowpath(gfp_mask, order,
2755 zonelist, high_zoneidx, nodemask, 2765 zonelist, high_zoneidx, nodemask,
2756 preferred_zone, migratetype); 2766 preferred_zone, classzone_idx, migratetype);
2757 } 2767 }
2758 2768
2759 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2769 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2760 2770
2761 out: 2771 out:
2762 /* 2772 /*
2763 * When updating a task's mems_allowed, it is possible to race with 2773 * When updating a task's mems_allowed, it is possible to race with
2764 * parallel threads in such a way that an allocation can fail while 2774 * parallel threads in such a way that an allocation can fail while
2765 * the mask is being updated. If a page allocation is about to fail, 2775 * the mask is being updated. If a page allocation is about to fail,
2766 * check if the cpuset changed during allocation and if so, retry. 2776 * check if the cpuset changed during allocation and if so, retry.
2767 */ 2777 */
2768 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2778 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2769 goto retry_cpuset; 2779 goto retry_cpuset;
2770 2780
2771 memcg_kmem_commit_charge(page, memcg, order); 2781 memcg_kmem_commit_charge(page, memcg, order);
2772 2782
2773 return page; 2783 return page;
2774 } 2784 }
2775 EXPORT_SYMBOL(__alloc_pages_nodemask); 2785 EXPORT_SYMBOL(__alloc_pages_nodemask);
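The hunks above carry the point of this patch: the classzone index is computed once from the zoneref returned by first_zones_zonelist() and passed down to get_page_from_freelist() and __alloc_pages_slowpath(), instead of each path recomputing zone_idx(preferred_zone). A minimal sketch of the pattern, using only the helpers already visible in this function:

	struct zoneref *preferred_zoneref;
	struct zone *preferred_zone;
	int classzone_idx;

	preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
						 nodemask, &preferred_zone);
	classzone_idx = zonelist_zone_idx(preferred_zoneref);
	/* classzone_idx is then handed unchanged to the fast and slow paths */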
2776 2786
2777 /* 2787 /*
2778 * Common helper functions. 2788 * Common helper functions.
2779 */ 2789 */
2780 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2790 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2781 { 2791 {
2782 struct page *page; 2792 struct page *page;
2783 2793
2784 /* 2794 /*
2785 * __get_free_pages() returns a virtual address, which cannot represent 2795 * __get_free_pages() returns a virtual address, which cannot represent
2786 * a highmem page 2796 * a highmem page
2787 */ 2797 */
2788 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2798 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2789 2799
2790 page = alloc_pages(gfp_mask, order); 2800 page = alloc_pages(gfp_mask, order);
2791 if (!page) 2801 if (!page)
2792 return 0; 2802 return 0;
2793 return (unsigned long) page_address(page); 2803 return (unsigned long) page_address(page);
2794 } 2804 }
2795 EXPORT_SYMBOL(__get_free_pages); 2805 EXPORT_SYMBOL(__get_free_pages);
2796 2806
2797 unsigned long get_zeroed_page(gfp_t gfp_mask) 2807 unsigned long get_zeroed_page(gfp_t gfp_mask)
2798 { 2808 {
2799 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2809 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2800 } 2810 }
2801 EXPORT_SYMBOL(get_zeroed_page); 2811 EXPORT_SYMBOL(get_zeroed_page);
2802 2812
2803 void __free_pages(struct page *page, unsigned int order) 2813 void __free_pages(struct page *page, unsigned int order)
2804 { 2814 {
2805 if (put_page_testzero(page)) { 2815 if (put_page_testzero(page)) {
2806 if (order == 0) 2816 if (order == 0)
2807 free_hot_cold_page(page, 0); 2817 free_hot_cold_page(page, 0);
2808 else 2818 else
2809 __free_pages_ok(page, order); 2819 __free_pages_ok(page, order);
2810 } 2820 }
2811 } 2821 }
2812 2822
2813 EXPORT_SYMBOL(__free_pages); 2823 EXPORT_SYMBOL(__free_pages);
2814 2824
2815 void free_pages(unsigned long addr, unsigned int order) 2825 void free_pages(unsigned long addr, unsigned int order)
2816 { 2826 {
2817 if (addr != 0) { 2827 if (addr != 0) {
2818 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2828 VM_BUG_ON(!virt_addr_valid((void *)addr));
2819 __free_pages(virt_to_page((void *)addr), order); 2829 __free_pages(virt_to_page((void *)addr), order);
2820 } 2830 }
2821 } 2831 }
2822 2832
2823 EXPORT_SYMBOL(free_pages); 2833 EXPORT_SYMBOL(free_pages);
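As a usage sketch (hypothetical caller, kernel context assumed), a single zeroed lowmem page can be obtained and released with the helpers above:

	unsigned long addr = get_zeroed_page(GFP_KERNEL);

	if (addr) {
		/* ... use the zero-filled, page-sized buffer at addr ... */
		free_pages(addr, 0);	/* order 0: one page */
	}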
2824 2834
2825 /* 2835 /*
2826 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2836 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2827 * pages allocated with __GFP_KMEMCG. 2837 * pages allocated with __GFP_KMEMCG.
2828 * 2838 *
2829 * Those pages are accounted to a particular memcg, embedded in the 2839 * Those pages are accounted to a particular memcg, embedded in the
2830 * corresponding page_cgroup. To avoid taking a hit in the allocator searching 2840 * corresponding page_cgroup. To avoid taking a hit in the allocator searching
2831 * for that information only to find that it is NULL for users who have no 2841 * for that information only to find that it is NULL for users who have no
2832 * interest in it whatsoever, we provide these functions. 2842 * interest in it whatsoever, we provide these functions.
2833 * 2843 *
2834 * The caller knows better which flags it relies on. 2844 * The caller knows better which flags it relies on.
2835 */ 2845 */
2836 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2846 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2837 { 2847 {
2838 memcg_kmem_uncharge_pages(page, order); 2848 memcg_kmem_uncharge_pages(page, order);
2839 __free_pages(page, order); 2849 __free_pages(page, order);
2840 } 2850 }
2841 2851
2842 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2852 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2843 { 2853 {
2844 if (addr != 0) { 2854 if (addr != 0) {
2845 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2855 VM_BUG_ON(!virt_addr_valid((void *)addr));
2846 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2856 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2847 } 2857 }
2848 } 2858 }
2849 2859
2850 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2860 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2851 { 2861 {
2852 if (addr) { 2862 if (addr) {
2853 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2863 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2854 unsigned long used = addr + PAGE_ALIGN(size); 2864 unsigned long used = addr + PAGE_ALIGN(size);
2855 2865
2856 split_page(virt_to_page((void *)addr), order); 2866 split_page(virt_to_page((void *)addr), order);
2857 while (used < alloc_end) { 2867 while (used < alloc_end) {
2858 free_page(used); 2868 free_page(used);
2859 used += PAGE_SIZE; 2869 used += PAGE_SIZE;
2860 } 2870 }
2861 } 2871 }
2862 return (void *)addr; 2872 return (void *)addr;
2863 } 2873 }
2864 2874
2865 /** 2875 /**
2866 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2876 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2867 * @size: the number of bytes to allocate 2877 * @size: the number of bytes to allocate
2868 * @gfp_mask: GFP flags for the allocation 2878 * @gfp_mask: GFP flags for the allocation
2869 * 2879 *
2870 * This function is similar to alloc_pages(), except that it allocates the 2880 * This function is similar to alloc_pages(), except that it allocates the
2871 * minimum number of pages to satisfy the request. alloc_pages() can only 2881 * minimum number of pages to satisfy the request. alloc_pages() can only
2872 * allocate memory in power-of-two numbers of pages. 2882 * allocate memory in power-of-two numbers of pages.
2873 * 2883 *
2874 * This function is also limited by MAX_ORDER. 2884 * This function is also limited by MAX_ORDER.
2875 * 2885 *
2876 * Memory allocated by this function must be released by free_pages_exact(). 2886 * Memory allocated by this function must be released by free_pages_exact().
2877 */ 2887 */
2878 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2888 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2879 { 2889 {
2880 unsigned int order = get_order(size); 2890 unsigned int order = get_order(size);
2881 unsigned long addr; 2891 unsigned long addr;
2882 2892
2883 addr = __get_free_pages(gfp_mask, order); 2893 addr = __get_free_pages(gfp_mask, order);
2884 return make_alloc_exact(addr, order, size); 2894 return make_alloc_exact(addr, order, size);
2885 } 2895 }
2886 EXPORT_SYMBOL(alloc_pages_exact); 2896 EXPORT_SYMBOL(alloc_pages_exact);
2887 2897
2888 /** 2898 /**
2889 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2899 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2890 * pages on a node. 2900 * pages on a node.
2891 * @nid: the preferred node ID where memory should be allocated 2901 * @nid: the preferred node ID where memory should be allocated
2892 * @size: the number of bytes to allocate 2902 * @size: the number of bytes to allocate
2893 * @gfp_mask: GFP flags for the allocation 2903 * @gfp_mask: GFP flags for the allocation
2894 * 2904 *
2895 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2905 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2896 * back. 2906 * back.
2897 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2907 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2898 * but is not exact. 2908 * but is not exact.
2899 */ 2909 */
2900 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2910 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2901 { 2911 {
2902 unsigned order = get_order(size); 2912 unsigned order = get_order(size);
2903 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2913 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2904 if (!p) 2914 if (!p)
2905 return NULL; 2915 return NULL;
2906 return make_alloc_exact((unsigned long)page_address(p), order, size); 2916 return make_alloc_exact((unsigned long)page_address(p), order, size);
2907 } 2917 }
2908 EXPORT_SYMBOL(alloc_pages_exact_nid); 2918 EXPORT_SYMBOL(alloc_pages_exact_nid);
2909 2919
2910 /** 2920 /**
2911 * free_pages_exact - release memory allocated via alloc_pages_exact() 2921 * free_pages_exact - release memory allocated via alloc_pages_exact()
2912 * @virt: the value returned by alloc_pages_exact. 2922 * @virt: the value returned by alloc_pages_exact.
2913 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2923 * @size: size of allocation, same value as passed to alloc_pages_exact().
2914 * 2924 *
2915 * Release the memory allocated by a previous call to alloc_pages_exact. 2925 * Release the memory allocated by a previous call to alloc_pages_exact.
2916 */ 2926 */
2917 void free_pages_exact(void *virt, size_t size) 2927 void free_pages_exact(void *virt, size_t size)
2918 { 2928 {
2919 unsigned long addr = (unsigned long)virt; 2929 unsigned long addr = (unsigned long)virt;
2920 unsigned long end = addr + PAGE_ALIGN(size); 2930 unsigned long end = addr + PAGE_ALIGN(size);
2921 2931
2922 while (addr < end) { 2932 while (addr < end) {
2923 free_page(addr); 2933 free_page(addr);
2924 addr += PAGE_SIZE; 2934 addr += PAGE_SIZE;
2925 } 2935 }
2926 } 2936 }
2927 EXPORT_SYMBOL(free_pages_exact); 2937 EXPORT_SYMBOL(free_pages_exact);
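A usage sketch with hypothetical sizes: alloc_pages_exact() rounds the request up to whole pages, make_alloc_exact() returns the tail pages beyond PAGE_ALIGN(size) to the allocator, and the buffer is later released with free_pages_exact() using the same size:

	size_t size = 5 * PAGE_SIZE + 100;	/* not a power-of-two number of pages */
	void *buf = alloc_pages_exact(size, GFP_KERNEL);

	if (buf) {
		/* ... PAGE_ALIGN(size) bytes of physically contiguous memory ... */
		free_pages_exact(buf, size);
	}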
2928 2938
2929 /** 2939 /**
2930 * nr_free_zone_pages - count number of pages beyond high watermark 2940 * nr_free_zone_pages - count number of pages beyond high watermark
2931 * @offset: The zone index of the highest zone 2941 * @offset: The zone index of the highest zone
2932 * 2942 *
2933 * nr_free_zone_pages() counts the number of pages which are beyond the 2943 * nr_free_zone_pages() counts the number of pages which are beyond the
2934 * high watermark within all zones at or below a given zone index. For each 2944 * high watermark within all zones at or below a given zone index. For each
2935 * zone, the number of pages is calculated as: 2945 * zone, the number of pages is calculated as:
2936 * managed_pages - high_pages 2946 * managed_pages - high_pages
2937 */ 2947 */
2938 static unsigned long nr_free_zone_pages(int offset) 2948 static unsigned long nr_free_zone_pages(int offset)
2939 { 2949 {
2940 struct zoneref *z; 2950 struct zoneref *z;
2941 struct zone *zone; 2951 struct zone *zone;
2942 2952
2943 /* Just pick one node, since fallback list is circular */ 2953 /* Just pick one node, since fallback list is circular */
2944 unsigned long sum = 0; 2954 unsigned long sum = 0;
2945 2955
2946 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2956 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2947 2957
2948 for_each_zone_zonelist(zone, z, zonelist, offset) { 2958 for_each_zone_zonelist(zone, z, zonelist, offset) {
2949 unsigned long size = zone->managed_pages; 2959 unsigned long size = zone->managed_pages;
2950 unsigned long high = high_wmark_pages(zone); 2960 unsigned long high = high_wmark_pages(zone);
2951 if (size > high) 2961 if (size > high)
2952 sum += size - high; 2962 sum += size - high;
2953 } 2963 }
2954 2964
2955 return sum; 2965 return sum;
2956 } 2966 }
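A worked illustration with hypothetical numbers: for a zonelist containing ZONE_DMA (managed_pages = 4096, high watermark = 128) and ZONE_NORMAL (managed_pages = 262144, high watermark = 4096),

	nr_free_zone_pages(ZONE_NORMAL) = (4096 - 128) + (262144 - 4096)
	                                = 262016 pages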
2957 2967
2958 /** 2968 /**
2959 * nr_free_buffer_pages - count number of pages beyond high watermark 2969 * nr_free_buffer_pages - count number of pages beyond high watermark
2960 * 2970 *
2961 * nr_free_buffer_pages() counts the number of pages which are beyond the high 2971 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2962 * watermark within ZONE_DMA and ZONE_NORMAL. 2972 * watermark within ZONE_DMA and ZONE_NORMAL.
2963 */ 2973 */
2964 unsigned long nr_free_buffer_pages(void) 2974 unsigned long nr_free_buffer_pages(void)
2965 { 2975 {
2966 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2976 return nr_free_zone_pages(gfp_zone(GFP_USER));
2967 } 2977 }
2968 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2978 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2969 2979
2970 /** 2980 /**
2971 * nr_free_pagecache_pages - count number of pages beyond high watermark 2981 * nr_free_pagecache_pages - count number of pages beyond high watermark
2972 * 2982 *
2973 * nr_free_pagecache_pages() counts the number of pages which are beyond the 2983 * nr_free_pagecache_pages() counts the number of pages which are beyond the
2974 * high watermark within all zones. 2984 * high watermark within all zones.
2975 */ 2985 */
2976 unsigned long nr_free_pagecache_pages(void) 2986 unsigned long nr_free_pagecache_pages(void)
2977 { 2987 {
2978 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2988 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2979 } 2989 }
2980 2990
2981 static inline void show_node(struct zone *zone) 2991 static inline void show_node(struct zone *zone)
2982 { 2992 {
2983 if (IS_ENABLED(CONFIG_NUMA)) 2993 if (IS_ENABLED(CONFIG_NUMA))
2984 printk("Node %d ", zone_to_nid(zone)); 2994 printk("Node %d ", zone_to_nid(zone));
2985 } 2995 }
2986 2996
2987 void si_meminfo(struct sysinfo *val) 2997 void si_meminfo(struct sysinfo *val)
2988 { 2998 {
2989 val->totalram = totalram_pages; 2999 val->totalram = totalram_pages;
2990 val->sharedram = 0; 3000 val->sharedram = 0;
2991 val->freeram = global_page_state(NR_FREE_PAGES); 3001 val->freeram = global_page_state(NR_FREE_PAGES);
2992 val->bufferram = nr_blockdev_pages(); 3002 val->bufferram = nr_blockdev_pages();
2993 val->totalhigh = totalhigh_pages; 3003 val->totalhigh = totalhigh_pages;
2994 val->freehigh = nr_free_highpages(); 3004 val->freehigh = nr_free_highpages();
2995 val->mem_unit = PAGE_SIZE; 3005 val->mem_unit = PAGE_SIZE;
2996 } 3006 }
2997 3007
2998 EXPORT_SYMBOL(si_meminfo); 3008 EXPORT_SYMBOL(si_meminfo);
2999 3009
3000 #ifdef CONFIG_NUMA 3010 #ifdef CONFIG_NUMA
3001 void si_meminfo_node(struct sysinfo *val, int nid) 3011 void si_meminfo_node(struct sysinfo *val, int nid)
3002 { 3012 {
3003 int zone_type; /* needs to be signed */ 3013 int zone_type; /* needs to be signed */
3004 unsigned long managed_pages = 0; 3014 unsigned long managed_pages = 0;
3005 pg_data_t *pgdat = NODE_DATA(nid); 3015 pg_data_t *pgdat = NODE_DATA(nid);
3006 3016
3007 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3017 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3008 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3018 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3009 val->totalram = managed_pages; 3019 val->totalram = managed_pages;
3010 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3020 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3011 #ifdef CONFIG_HIGHMEM 3021 #ifdef CONFIG_HIGHMEM
3012 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3022 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
3013 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 3023 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
3014 NR_FREE_PAGES); 3024 NR_FREE_PAGES);
3015 #else 3025 #else
3016 val->totalhigh = 0; 3026 val->totalhigh = 0;
3017 val->freehigh = 0; 3027 val->freehigh = 0;
3018 #endif 3028 #endif
3019 val->mem_unit = PAGE_SIZE; 3029 val->mem_unit = PAGE_SIZE;
3020 } 3030 }
3021 #endif 3031 #endif
3022 3032
3023 /* 3033 /*
3024 * Determine whether the node should be displayed or not, depending on whether 3034 * Determine whether the node should be displayed or not, depending on whether
3025 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 3035 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
3026 */ 3036 */
3027 bool skip_free_areas_node(unsigned int flags, int nid) 3037 bool skip_free_areas_node(unsigned int flags, int nid)
3028 { 3038 {
3029 bool ret = false; 3039 bool ret = false;
3030 unsigned int cpuset_mems_cookie; 3040 unsigned int cpuset_mems_cookie;
3031 3041
3032 if (!(flags & SHOW_MEM_FILTER_NODES)) 3042 if (!(flags & SHOW_MEM_FILTER_NODES))
3033 goto out; 3043 goto out;
3034 3044
3035 do { 3045 do {
3036 cpuset_mems_cookie = read_mems_allowed_begin(); 3046 cpuset_mems_cookie = read_mems_allowed_begin();
3037 ret = !node_isset(nid, cpuset_current_mems_allowed); 3047 ret = !node_isset(nid, cpuset_current_mems_allowed);
3038 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 3048 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3039 out: 3049 out:
3040 return ret; 3050 return ret;
3041 } 3051 }
3042 3052
3043 #define K(x) ((x) << (PAGE_SHIFT-10)) 3053 #define K(x) ((x) << (PAGE_SHIFT-10))
3044 3054
3045 static void show_migration_types(unsigned char type) 3055 static void show_migration_types(unsigned char type)
3046 { 3056 {
3047 static const char types[MIGRATE_TYPES] = { 3057 static const char types[MIGRATE_TYPES] = {
3048 [MIGRATE_UNMOVABLE] = 'U', 3058 [MIGRATE_UNMOVABLE] = 'U',
3049 [MIGRATE_RECLAIMABLE] = 'E', 3059 [MIGRATE_RECLAIMABLE] = 'E',
3050 [MIGRATE_MOVABLE] = 'M', 3060 [MIGRATE_MOVABLE] = 'M',
3051 [MIGRATE_RESERVE] = 'R', 3061 [MIGRATE_RESERVE] = 'R',
3052 #ifdef CONFIG_CMA 3062 #ifdef CONFIG_CMA
3053 [MIGRATE_CMA] = 'C', 3063 [MIGRATE_CMA] = 'C',
3054 #endif 3064 #endif
3055 #ifdef CONFIG_MEMORY_ISOLATION 3065 #ifdef CONFIG_MEMORY_ISOLATION
3056 [MIGRATE_ISOLATE] = 'I', 3066 [MIGRATE_ISOLATE] = 'I',
3057 #endif 3067 #endif
3058 }; 3068 };
3059 char tmp[MIGRATE_TYPES + 1]; 3069 char tmp[MIGRATE_TYPES + 1];
3060 char *p = tmp; 3070 char *p = tmp;
3061 int i; 3071 int i;
3062 3072
3063 for (i = 0; i < MIGRATE_TYPES; i++) { 3073 for (i = 0; i < MIGRATE_TYPES; i++) {
3064 if (type & (1 << i)) 3074 if (type & (1 << i))
3065 *p++ = types[i]; 3075 *p++ = types[i];
3066 } 3076 }
3067 3077
3068 *p = '\0'; 3078 *p = '\0';
3069 printk("(%s) ", tmp); 3079 printk("(%s) ", tmp);
3070 } 3080 }
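For example, assuming MIGRATE_UNMOVABLE precedes MIGRATE_MOVABLE in the migratetype enum (as the table above suggests), a bitmask with both of those bits set is printed as "(UM) ":

	unsigned char type = (1 << MIGRATE_UNMOVABLE) | (1 << MIGRATE_MOVABLE);

	show_migration_types(type);	/* prints "(UM) " */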
3071 3081
3072 /* 3082 /*
3073 * Show free area list (used inside shift_scroll-lock stuff) 3083 * Show free area list (used inside shift_scroll-lock stuff)
3074 * We also calculate the percentage fragmentation. We do this by counting the 3084 * We also calculate the percentage fragmentation. We do this by counting the
3075 * memory on each free list with the exception of the first item on the list. 3085 * memory on each free list with the exception of the first item on the list.
3076 * Suppresses nodes that are not allowed by current's cpuset if 3086 * Suppresses nodes that are not allowed by current's cpuset if
3077 * SHOW_MEM_FILTER_NODES is passed. 3087 * SHOW_MEM_FILTER_NODES is passed.
3078 */ 3088 */
3079 void show_free_areas(unsigned int filter) 3089 void show_free_areas(unsigned int filter)
3080 { 3090 {
3081 int cpu; 3091 int cpu;
3082 struct zone *zone; 3092 struct zone *zone;
3083 3093
3084 for_each_populated_zone(zone) { 3094 for_each_populated_zone(zone) {
3085 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3095 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3086 continue; 3096 continue;
3087 show_node(zone); 3097 show_node(zone);
3088 printk("%s per-cpu:\n", zone->name); 3098 printk("%s per-cpu:\n", zone->name);
3089 3099
3090 for_each_online_cpu(cpu) { 3100 for_each_online_cpu(cpu) {
3091 struct per_cpu_pageset *pageset; 3101 struct per_cpu_pageset *pageset;
3092 3102
3093 pageset = per_cpu_ptr(zone->pageset, cpu); 3103 pageset = per_cpu_ptr(zone->pageset, cpu);
3094 3104
3095 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 3105 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3096 cpu, pageset->pcp.high, 3106 cpu, pageset->pcp.high,
3097 pageset->pcp.batch, pageset->pcp.count); 3107 pageset->pcp.batch, pageset->pcp.count);
3098 } 3108 }
3099 } 3109 }
3100 3110
3101 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3111 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3102 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3112 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3103 " unevictable:%lu" 3113 " unevictable:%lu"
3104 " dirty:%lu writeback:%lu unstable:%lu\n" 3114 " dirty:%lu writeback:%lu unstable:%lu\n"
3105 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 3115 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3106 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3116 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3107 " free_cma:%lu\n", 3117 " free_cma:%lu\n",
3108 global_page_state(NR_ACTIVE_ANON), 3118 global_page_state(NR_ACTIVE_ANON),
3109 global_page_state(NR_INACTIVE_ANON), 3119 global_page_state(NR_INACTIVE_ANON),
3110 global_page_state(NR_ISOLATED_ANON), 3120 global_page_state(NR_ISOLATED_ANON),
3111 global_page_state(NR_ACTIVE_FILE), 3121 global_page_state(NR_ACTIVE_FILE),
3112 global_page_state(NR_INACTIVE_FILE), 3122 global_page_state(NR_INACTIVE_FILE),
3113 global_page_state(NR_ISOLATED_FILE), 3123 global_page_state(NR_ISOLATED_FILE),
3114 global_page_state(NR_UNEVICTABLE), 3124 global_page_state(NR_UNEVICTABLE),
3115 global_page_state(NR_FILE_DIRTY), 3125 global_page_state(NR_FILE_DIRTY),
3116 global_page_state(NR_WRITEBACK), 3126 global_page_state(NR_WRITEBACK),
3117 global_page_state(NR_UNSTABLE_NFS), 3127 global_page_state(NR_UNSTABLE_NFS),
3118 global_page_state(NR_FREE_PAGES), 3128 global_page_state(NR_FREE_PAGES),
3119 global_page_state(NR_SLAB_RECLAIMABLE), 3129 global_page_state(NR_SLAB_RECLAIMABLE),
3120 global_page_state(NR_SLAB_UNRECLAIMABLE), 3130 global_page_state(NR_SLAB_UNRECLAIMABLE),
3121 global_page_state(NR_FILE_MAPPED), 3131 global_page_state(NR_FILE_MAPPED),
3122 global_page_state(NR_SHMEM), 3132 global_page_state(NR_SHMEM),
3123 global_page_state(NR_PAGETABLE), 3133 global_page_state(NR_PAGETABLE),
3124 global_page_state(NR_BOUNCE), 3134 global_page_state(NR_BOUNCE),
3125 global_page_state(NR_FREE_CMA_PAGES)); 3135 global_page_state(NR_FREE_CMA_PAGES));
3126 3136
3127 for_each_populated_zone(zone) { 3137 for_each_populated_zone(zone) {
3128 int i; 3138 int i;
3129 3139
3130 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3140 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3131 continue; 3141 continue;
3132 show_node(zone); 3142 show_node(zone);
3133 printk("%s" 3143 printk("%s"
3134 " free:%lukB" 3144 " free:%lukB"
3135 " min:%lukB" 3145 " min:%lukB"
3136 " low:%lukB" 3146 " low:%lukB"
3137 " high:%lukB" 3147 " high:%lukB"
3138 " active_anon:%lukB" 3148 " active_anon:%lukB"
3139 " inactive_anon:%lukB" 3149 " inactive_anon:%lukB"
3140 " active_file:%lukB" 3150 " active_file:%lukB"
3141 " inactive_file:%lukB" 3151 " inactive_file:%lukB"
3142 " unevictable:%lukB" 3152 " unevictable:%lukB"
3143 " isolated(anon):%lukB" 3153 " isolated(anon):%lukB"
3144 " isolated(file):%lukB" 3154 " isolated(file):%lukB"
3145 " present:%lukB" 3155 " present:%lukB"
3146 " managed:%lukB" 3156 " managed:%lukB"
3147 " mlocked:%lukB" 3157 " mlocked:%lukB"
3148 " dirty:%lukB" 3158 " dirty:%lukB"
3149 " writeback:%lukB" 3159 " writeback:%lukB"
3150 " mapped:%lukB" 3160 " mapped:%lukB"
3151 " shmem:%lukB" 3161 " shmem:%lukB"
3152 " slab_reclaimable:%lukB" 3162 " slab_reclaimable:%lukB"
3153 " slab_unreclaimable:%lukB" 3163 " slab_unreclaimable:%lukB"
3154 " kernel_stack:%lukB" 3164 " kernel_stack:%lukB"
3155 " pagetables:%lukB" 3165 " pagetables:%lukB"
3156 " unstable:%lukB" 3166 " unstable:%lukB"
3157 " bounce:%lukB" 3167 " bounce:%lukB"
3158 " free_cma:%lukB" 3168 " free_cma:%lukB"
3159 " writeback_tmp:%lukB" 3169 " writeback_tmp:%lukB"
3160 " pages_scanned:%lu" 3170 " pages_scanned:%lu"
3161 " all_unreclaimable? %s" 3171 " all_unreclaimable? %s"
3162 "\n", 3172 "\n",
3163 zone->name, 3173 zone->name,
3164 K(zone_page_state(zone, NR_FREE_PAGES)), 3174 K(zone_page_state(zone, NR_FREE_PAGES)),
3165 K(min_wmark_pages(zone)), 3175 K(min_wmark_pages(zone)),
3166 K(low_wmark_pages(zone)), 3176 K(low_wmark_pages(zone)),
3167 K(high_wmark_pages(zone)), 3177 K(high_wmark_pages(zone)),
3168 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3178 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3169 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3179 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3170 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3180 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3171 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3181 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3172 K(zone_page_state(zone, NR_UNEVICTABLE)), 3182 K(zone_page_state(zone, NR_UNEVICTABLE)),
3173 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3183 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3174 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3184 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3175 K(zone->present_pages), 3185 K(zone->present_pages),
3176 K(zone->managed_pages), 3186 K(zone->managed_pages),
3177 K(zone_page_state(zone, NR_MLOCK)), 3187 K(zone_page_state(zone, NR_MLOCK)),
3178 K(zone_page_state(zone, NR_FILE_DIRTY)), 3188 K(zone_page_state(zone, NR_FILE_DIRTY)),
3179 K(zone_page_state(zone, NR_WRITEBACK)), 3189 K(zone_page_state(zone, NR_WRITEBACK)),
3180 K(zone_page_state(zone, NR_FILE_MAPPED)), 3190 K(zone_page_state(zone, NR_FILE_MAPPED)),
3181 K(zone_page_state(zone, NR_SHMEM)), 3191 K(zone_page_state(zone, NR_SHMEM)),
3182 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3192 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3183 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3193 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3184 zone_page_state(zone, NR_KERNEL_STACK) * 3194 zone_page_state(zone, NR_KERNEL_STACK) *
3185 THREAD_SIZE / 1024, 3195 THREAD_SIZE / 1024,
3186 K(zone_page_state(zone, NR_PAGETABLE)), 3196 K(zone_page_state(zone, NR_PAGETABLE)),
3187 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3197 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3188 K(zone_page_state(zone, NR_BOUNCE)), 3198 K(zone_page_state(zone, NR_BOUNCE)),
3189 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3199 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3190 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3200 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3191 zone->pages_scanned, 3201 zone->pages_scanned,
3192 (!zone_reclaimable(zone) ? "yes" : "no") 3202 (!zone_reclaimable(zone) ? "yes" : "no")
3193 ); 3203 );
3194 printk("lowmem_reserve[]:"); 3204 printk("lowmem_reserve[]:");
3195 for (i = 0; i < MAX_NR_ZONES; i++) 3205 for (i = 0; i < MAX_NR_ZONES; i++)
3196 printk(" %lu", zone->lowmem_reserve[i]); 3206 printk(" %lu", zone->lowmem_reserve[i]);
3197 printk("\n"); 3207 printk("\n");
3198 } 3208 }
3199 3209
3200 for_each_populated_zone(zone) { 3210 for_each_populated_zone(zone) {
3201 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3211 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3202 unsigned char types[MAX_ORDER]; 3212 unsigned char types[MAX_ORDER];
3203 3213
3204 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3214 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3205 continue; 3215 continue;
3206 show_node(zone); 3216 show_node(zone);
3207 printk("%s: ", zone->name); 3217 printk("%s: ", zone->name);
3208 3218
3209 spin_lock_irqsave(&zone->lock, flags); 3219 spin_lock_irqsave(&zone->lock, flags);
3210 for (order = 0; order < MAX_ORDER; order++) { 3220 for (order = 0; order < MAX_ORDER; order++) {
3211 struct free_area *area = &zone->free_area[order]; 3221 struct free_area *area = &zone->free_area[order];
3212 int type; 3222 int type;
3213 3223
3214 nr[order] = area->nr_free; 3224 nr[order] = area->nr_free;
3215 total += nr[order] << order; 3225 total += nr[order] << order;
3216 3226
3217 types[order] = 0; 3227 types[order] = 0;
3218 for (type = 0; type < MIGRATE_TYPES; type++) { 3228 for (type = 0; type < MIGRATE_TYPES; type++) {
3219 if (!list_empty(&area->free_list[type])) 3229 if (!list_empty(&area->free_list[type]))
3220 types[order] |= 1 << type; 3230 types[order] |= 1 << type;
3221 } 3231 }
3222 } 3232 }
3223 spin_unlock_irqrestore(&zone->lock, flags); 3233 spin_unlock_irqrestore(&zone->lock, flags);
3224 for (order = 0; order < MAX_ORDER; order++) { 3234 for (order = 0; order < MAX_ORDER; order++) {
3225 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3235 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3226 if (nr[order]) 3236 if (nr[order])
3227 show_migration_types(types[order]); 3237 show_migration_types(types[order]);
3228 } 3238 }
3229 printk("= %lukB\n", K(total)); 3239 printk("= %lukB\n", K(total));
3230 } 3240 }
3231 3241
3232 hugetlb_show_meminfo(); 3242 hugetlb_show_meminfo();
3233 3243
3234 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3244 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3235 3245
3236 show_swap_cache_info(); 3246 show_swap_cache_info();
3237 } 3247 }
3238 3248
3239 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3249 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3240 { 3250 {
3241 zoneref->zone = zone; 3251 zoneref->zone = zone;
3242 zoneref->zone_idx = zone_idx(zone); 3252 zoneref->zone_idx = zone_idx(zone);
3243 } 3253 }
3244 3254
3245 /* 3255 /*
3246 * Builds allocation fallback zone lists. 3256 * Builds allocation fallback zone lists.
3247 * 3257 *
3248 * Add all populated zones of a node to the zonelist. 3258 * Add all populated zones of a node to the zonelist.
3249 */ 3259 */
3250 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3260 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3251 int nr_zones) 3261 int nr_zones)
3252 { 3262 {
3253 struct zone *zone; 3263 struct zone *zone;
3254 enum zone_type zone_type = MAX_NR_ZONES; 3264 enum zone_type zone_type = MAX_NR_ZONES;
3255 3265
3256 do { 3266 do {
3257 zone_type--; 3267 zone_type--;
3258 zone = pgdat->node_zones + zone_type; 3268 zone = pgdat->node_zones + zone_type;
3259 if (populated_zone(zone)) { 3269 if (populated_zone(zone)) {
3260 zoneref_set_zone(zone, 3270 zoneref_set_zone(zone,
3261 &zonelist->_zonerefs[nr_zones++]); 3271 &zonelist->_zonerefs[nr_zones++]);
3262 check_highest_zone(zone_type); 3272 check_highest_zone(zone_type);
3263 } 3273 }
3264 } while (zone_type); 3274 } while (zone_type);
3265 3275
3266 return nr_zones; 3276 return nr_zones;
3267 } 3277 }
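Because the loop above walks zone_type from MAX_NR_ZONES - 1 down to 0 and records only populated zones, the zonerefs end up ordered highest zone first. A hypothetical node with only ZONE_DMA and ZONE_NORMAL populated contributes:

	/*
	 * _zonerefs[nr_zones + 0] -> node_zones[ZONE_NORMAL]	(added first)
	 * _zonerefs[nr_zones + 1] -> node_zones[ZONE_DMA]
	 * so allocations prefer Normal and fall back to DMA last.
	 */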
3268 3278
3269 3279
3270 /* 3280 /*
3271 * zonelist_order: 3281 * zonelist_order:
3272 * 0 = automatic detection of better ordering. 3282 * 0 = automatic detection of better ordering.
3273 * 1 = order by ([node] distance, -zonetype) 3283 * 1 = order by ([node] distance, -zonetype)
3274 * 2 = order by (-zonetype, [node] distance) 3284 * 2 = order by (-zonetype, [node] distance)
3275 * 3285 *
3276 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3286 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3277 * the same zonelist. So only NUMA can configure this param. 3287 * the same zonelist. So only NUMA can configure this param.
3278 */ 3288 */
3279 #define ZONELIST_ORDER_DEFAULT 0 3289 #define ZONELIST_ORDER_DEFAULT 0
3280 #define ZONELIST_ORDER_NODE 1 3290 #define ZONELIST_ORDER_NODE 1
3281 #define ZONELIST_ORDER_ZONE 2 3291 #define ZONELIST_ORDER_ZONE 2
3282 3292
3283 /* zonelist order in the kernel. 3293 /* zonelist order in the kernel.
3284 * set_zonelist_order() will set this to NODE or ZONE. 3294 * set_zonelist_order() will set this to NODE or ZONE.
3285 */ 3295 */
3286 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3296 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3287 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3297 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3288 3298
3289 3299
3290 #ifdef CONFIG_NUMA 3300 #ifdef CONFIG_NUMA
3291 /* The value the user specified ... changed by config */ 3301 /* The value the user specified ... changed by config */
3292 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3302 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3293 /* string for sysctl */ 3303 /* string for sysctl */
3294 #define NUMA_ZONELIST_ORDER_LEN 16 3304 #define NUMA_ZONELIST_ORDER_LEN 16
3295 char numa_zonelist_order[16] = "default"; 3305 char numa_zonelist_order[16] = "default";
3296 3306
3297 /* 3307 /*
3298 * interface to configure zonelist ordering. 3308 * interface to configure zonelist ordering.
3299 * command line option "numa_zonelist_order" 3309 * command line option "numa_zonelist_order"
3300 * = "[dD]efault" - default, automatic configuration. 3310 * = "[dD]efault" - default, automatic configuration.
3301 * = "[nN]ode" - order by node locality, then by zone within node 3311 * = "[nN]ode" - order by node locality, then by zone within node
3302 * = "[zZ]one" - order by zone, then by locality within zone 3312 * = "[zZ]one" - order by zone, then by locality within zone
3303 */ 3313 */
3304 3314
3305 static int __parse_numa_zonelist_order(char *s) 3315 static int __parse_numa_zonelist_order(char *s)
3306 { 3316 {
3307 if (*s == 'd' || *s == 'D') { 3317 if (*s == 'd' || *s == 'D') {
3308 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3318 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3309 } else if (*s == 'n' || *s == 'N') { 3319 } else if (*s == 'n' || *s == 'N') {
3310 user_zonelist_order = ZONELIST_ORDER_NODE; 3320 user_zonelist_order = ZONELIST_ORDER_NODE;
3311 } else if (*s == 'z' || *s == 'Z') { 3321 } else if (*s == 'z' || *s == 'Z') {
3312 user_zonelist_order = ZONELIST_ORDER_ZONE; 3322 user_zonelist_order = ZONELIST_ORDER_ZONE;
3313 } else { 3323 } else {
3314 printk(KERN_WARNING 3324 printk(KERN_WARNING
3315 "Ignoring invalid numa_zonelist_order value: " 3325 "Ignoring invalid numa_zonelist_order value: "
3316 "%s\n", s); 3326 "%s\n", s);
3317 return -EINVAL; 3327 return -EINVAL;
3318 } 3328 }
3319 return 0; 3329 return 0;
3320 } 3330 }
3321 3331
3322 static __init int setup_numa_zonelist_order(char *s) 3332 static __init int setup_numa_zonelist_order(char *s)
3323 { 3333 {
3324 int ret; 3334 int ret;
3325 3335
3326 if (!s) 3336 if (!s)
3327 return 0; 3337 return 0;
3328 3338
3329 ret = __parse_numa_zonelist_order(s); 3339 ret = __parse_numa_zonelist_order(s);
3330 if (ret == 0) 3340 if (ret == 0)
3331 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3341 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3332 3342
3333 return ret; 3343 return ret;
3334 } 3344 }
3335 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3345 early_param("numa_zonelist_order", setup_numa_zonelist_order);
3336 3346
3337 /* 3347 /*
3338 * sysctl handler for numa_zonelist_order 3348 * sysctl handler for numa_zonelist_order
3339 */ 3349 */
3340 int numa_zonelist_order_handler(ctl_table *table, int write, 3350 int numa_zonelist_order_handler(ctl_table *table, int write,
3341 void __user *buffer, size_t *length, 3351 void __user *buffer, size_t *length,
3342 loff_t *ppos) 3352 loff_t *ppos)
3343 { 3353 {
3344 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3354 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3345 int ret; 3355 int ret;
3346 static DEFINE_MUTEX(zl_order_mutex); 3356 static DEFINE_MUTEX(zl_order_mutex);
3347 3357
3348 mutex_lock(&zl_order_mutex); 3358 mutex_lock(&zl_order_mutex);
3349 if (write) { 3359 if (write) {
3350 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 3360 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3351 ret = -EINVAL; 3361 ret = -EINVAL;
3352 goto out; 3362 goto out;
3353 } 3363 }
3354 strcpy(saved_string, (char *)table->data); 3364 strcpy(saved_string, (char *)table->data);
3355 } 3365 }
3356 ret = proc_dostring(table, write, buffer, length, ppos); 3366 ret = proc_dostring(table, write, buffer, length, ppos);
3357 if (ret) 3367 if (ret)
3358 goto out; 3368 goto out;
3359 if (write) { 3369 if (write) {
3360 int oldval = user_zonelist_order; 3370 int oldval = user_zonelist_order;
3361 3371
3362 ret = __parse_numa_zonelist_order((char *)table->data); 3372 ret = __parse_numa_zonelist_order((char *)table->data);
3363 if (ret) { 3373 if (ret) {
3364 /* 3374 /*
3365 * bogus value. restore saved string 3375 * bogus value. restore saved string
3366 */ 3376 */
3367 strncpy((char *)table->data, saved_string, 3377 strncpy((char *)table->data, saved_string,
3368 NUMA_ZONELIST_ORDER_LEN); 3378 NUMA_ZONELIST_ORDER_LEN);
3369 user_zonelist_order = oldval; 3379 user_zonelist_order = oldval;
3370 } else if (oldval != user_zonelist_order) { 3380 } else if (oldval != user_zonelist_order) {
3371 mutex_lock(&zonelists_mutex); 3381 mutex_lock(&zonelists_mutex);
3372 build_all_zonelists(NULL, NULL); 3382 build_all_zonelists(NULL, NULL);
3373 mutex_unlock(&zonelists_mutex); 3383 mutex_unlock(&zonelists_mutex);
3374 } 3384 }
3375 } 3385 }
3376 out: 3386 out:
3377 mutex_unlock(&zl_order_mutex); 3387 mutex_unlock(&zl_order_mutex);
3378 return ret; 3388 return ret;
3379 } 3389 }
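Both interfaces take the same strings parsed by __parse_numa_zonelist_order() above. An example configuration (the sysctl path is the usual vm location and is stated here as an assumption):

	/*
	 * boot:    numa_zonelist_order=zone
	 * runtime: echo node > /proc/sys/vm/numa_zonelist_order
	 */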
3380 3390
3381 3391
3382 #define MAX_NODE_LOAD (nr_online_nodes) 3392 #define MAX_NODE_LOAD (nr_online_nodes)
3383 static int node_load[MAX_NUMNODES]; 3393 static int node_load[MAX_NUMNODES];
3384 3394
3385 /** 3395 /**
3386 * find_next_best_node - find the next node that should appear in a given node's fallback list 3396 * find_next_best_node - find the next node that should appear in a given node's fallback list
3387 * @node: node whose fallback list we're appending 3397 * @node: node whose fallback list we're appending
3388 * @used_node_mask: nodemask_t of already used nodes 3398 * @used_node_mask: nodemask_t of already used nodes
3389 * 3399 *
3390 * We use a number of factors to determine which is the next node that should 3400 * We use a number of factors to determine which is the next node that should
3391 * appear on a given node's fallback list. The node should not have appeared 3401 * appear on a given node's fallback list. The node should not have appeared
3392 * already in @node's fallback list, and it should be the next closest node 3402 * already in @node's fallback list, and it should be the next closest node
3393 * according to the distance array (which contains arbitrary distance values 3403 * according to the distance array (which contains arbitrary distance values
3394 * from each node to each node in the system), and should also prefer nodes 3404 * from each node to each node in the system), and should also prefer nodes
3395 * with no CPUs, since presumably they'll have very little allocation pressure 3405 * with no CPUs, since presumably they'll have very little allocation pressure
3396 * on them otherwise. 3406 * on them otherwise.
3397 * It returns -1 if no node is found. 3407 * It returns -1 if no node is found.
3398 */ 3408 */
3399 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3409 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3400 { 3410 {
3401 int n, val; 3411 int n, val;
3402 int min_val = INT_MAX; 3412 int min_val = INT_MAX;
3403 int best_node = NUMA_NO_NODE; 3413 int best_node = NUMA_NO_NODE;
3404 const struct cpumask *tmp = cpumask_of_node(0); 3414 const struct cpumask *tmp = cpumask_of_node(0);
3405 3415
3406 /* Use the local node if we haven't already */ 3416 /* Use the local node if we haven't already */
3407 if (!node_isset(node, *used_node_mask)) { 3417 if (!node_isset(node, *used_node_mask)) {
3408 node_set(node, *used_node_mask); 3418 node_set(node, *used_node_mask);
3409 return node; 3419 return node;
3410 } 3420 }
3411 3421
3412 for_each_node_state(n, N_MEMORY) { 3422 for_each_node_state(n, N_MEMORY) {
3413 3423
3414 /* Don't want a node to appear more than once */ 3424 /* Don't want a node to appear more than once */
3415 if (node_isset(n, *used_node_mask)) 3425 if (node_isset(n, *used_node_mask))
3416 continue; 3426 continue;
3417 3427
3418 /* Use the distance array to find the distance */ 3428 /* Use the distance array to find the distance */
3419 val = node_distance(node, n); 3429 val = node_distance(node, n);
3420 3430
3421 /* Penalize nodes under us ("prefer the next node") */ 3431 /* Penalize nodes under us ("prefer the next node") */
3422 val += (n < node); 3432 val += (n < node);
3423 3433
3424 /* Give preference to headless and unused nodes */ 3434 /* Give preference to headless and unused nodes */
3425 tmp = cpumask_of_node(n); 3435 tmp = cpumask_of_node(n);
3426 if (!cpumask_empty(tmp)) 3436 if (!cpumask_empty(tmp))
3427 val += PENALTY_FOR_NODE_WITH_CPUS; 3437 val += PENALTY_FOR_NODE_WITH_CPUS;
3428 3438
3429 /* Slight preference for less loaded node */ 3439 /* Slight preference for less loaded node */
3430 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3440 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3431 val += node_load[n]; 3441 val += node_load[n];
3432 3442
3433 if (val < min_val) { 3443 if (val < min_val) {
3434 min_val = val; 3444 min_val = val;
3435 best_node = n; 3445 best_node = n;
3436 } 3446 }
3437 } 3447 }
3438 3448
3439 if (best_node >= 0) 3449 if (best_node >= 0)
3440 node_set(best_node, *used_node_mask); 3450 node_set(best_node, *used_node_mask);
3441 3451
3442 return best_node; 3452 return best_node;
3443 } 3453 }
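Putting the scoring above together with hypothetical values: for a candidate n with node_distance(node, n) = 20, n > node (no "prefer the next node" penalty), CPUs present (PENALTY_FOR_NODE_WITH_CPUS taken as 1 here), MAX_NODE_LOAD * MAX_NUMNODES = 64 and node_load[n] = 3,

	val = (20 + 0 + 1) * 64 + 3 = 1347

The candidate with the smallest val wins and is set in *used_node_mask.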
3444 3454
3445 3455
3446 /* 3456 /*
3447 * Build zonelists ordered by node and zones within node. 3457 * Build zonelists ordered by node and zones within node.
3448 * This results in maximum locality--normal zone overflows into local 3458 * This results in maximum locality--normal zone overflows into local
3449 * DMA zone, if any--but risks exhausting DMA zone. 3459 * DMA zone, if any--but risks exhausting DMA zone.
3450 */ 3460 */
3451 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3461 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3452 { 3462 {
3453 int j; 3463 int j;
3454 struct zonelist *zonelist; 3464 struct zonelist *zonelist;
3455 3465
3456 zonelist = &pgdat->node_zonelists[0]; 3466 zonelist = &pgdat->node_zonelists[0];
3457 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3467 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3458 ; 3468 ;
3459 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3469 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3460 zonelist->_zonerefs[j].zone = NULL; 3470 zonelist->_zonerefs[j].zone = NULL;
3461 zonelist->_zonerefs[j].zone_idx = 0; 3471 zonelist->_zonerefs[j].zone_idx = 0;
3462 } 3472 }
3463 3473
3464 /* 3474 /*
3465 * Build gfp_thisnode zonelists 3475 * Build gfp_thisnode zonelists
3466 */ 3476 */
3467 static void build_thisnode_zonelists(pg_data_t *pgdat) 3477 static void build_thisnode_zonelists(pg_data_t *pgdat)
3468 { 3478 {
3469 int j; 3479 int j;
3470 struct zonelist *zonelist; 3480 struct zonelist *zonelist;
3471 3481
3472 zonelist = &pgdat->node_zonelists[1]; 3482 zonelist = &pgdat->node_zonelists[1];
3473 j = build_zonelists_node(pgdat, zonelist, 0); 3483 j = build_zonelists_node(pgdat, zonelist, 0);
3474 zonelist->_zonerefs[j].zone = NULL; 3484 zonelist->_zonerefs[j].zone = NULL;
3475 zonelist->_zonerefs[j].zone_idx = 0; 3485 zonelist->_zonerefs[j].zone_idx = 0;
3476 } 3486 }
3477 3487
3478 /* 3488 /*
3479 * Build zonelists ordered by zone and nodes within zones. 3489 * Build zonelists ordered by zone and nodes within zones.
3480 * This results in conserving DMA zone[s] until all Normal memory is 3490 * This results in conserving DMA zone[s] until all Normal memory is
3481 * exhausted, but results in overflowing to remote node while memory 3491 * exhausted, but results in overflowing to remote node while memory
3482 * may still exist in local DMA zone. 3492 * may still exist in local DMA zone.
3483 */ 3493 */
3484 static int node_order[MAX_NUMNODES]; 3494 static int node_order[MAX_NUMNODES];
3485 3495
3486 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3496 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3487 { 3497 {
3488 int pos, j, node; 3498 int pos, j, node;
3489 int zone_type; /* needs to be signed */ 3499 int zone_type; /* needs to be signed */
3490 struct zone *z; 3500 struct zone *z;
3491 struct zonelist *zonelist; 3501 struct zonelist *zonelist;
3492 3502
3493 zonelist = &pgdat->node_zonelists[0]; 3503 zonelist = &pgdat->node_zonelists[0];
3494 pos = 0; 3504 pos = 0;
3495 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3505 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3496 for (j = 0; j < nr_nodes; j++) { 3506 for (j = 0; j < nr_nodes; j++) {
3497 node = node_order[j]; 3507 node = node_order[j];
3498 z = &NODE_DATA(node)->node_zones[zone_type]; 3508 z = &NODE_DATA(node)->node_zones[zone_type];
3499 if (populated_zone(z)) { 3509 if (populated_zone(z)) {
3500 zoneref_set_zone(z, 3510 zoneref_set_zone(z,
3501 &zonelist->_zonerefs[pos++]); 3511 &zonelist->_zonerefs[pos++]);
3502 check_highest_zone(zone_type); 3512 check_highest_zone(zone_type);
3503 } 3513 }
3504 } 3514 }
3505 } 3515 }
3506 zonelist->_zonerefs[pos].zone = NULL; 3516 zonelist->_zonerefs[pos].zone = NULL;
3507 zonelist->_zonerefs[pos].zone_idx = 0; 3517 zonelist->_zonerefs[pos].zone_idx = 0;
3508 } 3518 }
3509 3519
3510 static int default_zonelist_order(void) 3520 static int default_zonelist_order(void)
3511 { 3521 {
3512 int nid, zone_type; 3522 int nid, zone_type;
3513 unsigned long low_kmem_size, total_size; 3523 unsigned long low_kmem_size, total_size;
3514 struct zone *z; 3524 struct zone *z;
3515 int average_size; 3525 int average_size;
3516 /* 3526 /*
3517 * ZONE_DMA and ZONE_DMA32 can be a very small area in the system. 3527 * ZONE_DMA and ZONE_DMA32 can be a very small area in the system.
3518 * If they are really small and used heavily, the system can fall 3528 * If they are really small and used heavily, the system can fall
3519 * into OOM very easily. 3529 * into OOM very easily.
3520 * This function detects ZONE_DMA/DMA32 size and configures zone order. 3530 * This function detects ZONE_DMA/DMA32 size and configures zone order.
3521 */ 3531 */
3522 /* Is there ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */ 3532 /* Is there ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */
3523 low_kmem_size = 0; 3533 low_kmem_size = 0;
3524 total_size = 0; 3534 total_size = 0;
3525 for_each_online_node(nid) { 3535 for_each_online_node(nid) {
3526 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3536 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3527 z = &NODE_DATA(nid)->node_zones[zone_type]; 3537 z = &NODE_DATA(nid)->node_zones[zone_type];
3528 if (populated_zone(z)) { 3538 if (populated_zone(z)) {
3529 if (zone_type < ZONE_NORMAL) 3539 if (zone_type < ZONE_NORMAL)
3530 low_kmem_size += z->managed_pages; 3540 low_kmem_size += z->managed_pages;
3531 total_size += z->managed_pages; 3541 total_size += z->managed_pages;
3532 } else if (zone_type == ZONE_NORMAL) { 3542 } else if (zone_type == ZONE_NORMAL) {
3533 /* 3543 /*
3534 * If any node has only lowmem, then node order 3544 * If any node has only lowmem, then node order
3535 * is preferred to allow kernel allocations 3545 * is preferred to allow kernel allocations
3536 * locally; otherwise, they can easily infringe 3546 * locally; otherwise, they can easily infringe
3537 * on other nodes when there is an abundance of 3547 * on other nodes when there is an abundance of
3538 * lowmem available to allocate from. 3548 * lowmem available to allocate from.
3539 */ 3549 */
3540 return ZONELIST_ORDER_NODE; 3550 return ZONELIST_ORDER_NODE;
3541 } 3551 }
3542 } 3552 }
3543 } 3553 }
3544 if (!low_kmem_size || /* there is no DMA area. */ 3554 if (!low_kmem_size || /* there is no DMA area. */
3545 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3555 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3546 return ZONELIST_ORDER_NODE; 3556 return ZONELIST_ORDER_NODE;
3547 /* 3557 /*
3548 * look into each node's config. 3558 * look into each node's config.
3549 * If there is a node whose DMA/DMA32 memory makes up a very big share of 3559 * If there is a node whose DMA/DMA32 memory makes up a very big share of
3550 * its local memory, NODE_ORDER may be suitable. 3560 * its local memory, NODE_ORDER may be suitable.
3551 */ 3561 */
3552 average_size = total_size / 3562 average_size = total_size /
3553 (nodes_weight(node_states[N_MEMORY]) + 1); 3563 (nodes_weight(node_states[N_MEMORY]) + 1);
3554 for_each_online_node(nid) { 3564 for_each_online_node(nid) {
3555 low_kmem_size = 0; 3565 low_kmem_size = 0;
3556 total_size = 0; 3566 total_size = 0;
3557 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3567 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3558 z = &NODE_DATA(nid)->node_zones[zone_type]; 3568 z = &NODE_DATA(nid)->node_zones[zone_type];
3559 if (populated_zone(z)) { 3569 if (populated_zone(z)) {
3560 if (zone_type < ZONE_NORMAL) 3570 if (zone_type < ZONE_NORMAL)
3561 low_kmem_size += z->present_pages; 3571 low_kmem_size += z->present_pages;
3562 total_size += z->present_pages; 3572 total_size += z->present_pages;
3563 } 3573 }
3564 } 3574 }
3565 if (low_kmem_size && 3575 if (low_kmem_size &&
3566 total_size > average_size && /* ignore small node */ 3576 total_size > average_size && /* ignore small node */
3567 low_kmem_size > total_size * 70/100) 3577 low_kmem_size > total_size * 70/100)
3568 return ZONELIST_ORDER_NODE; 3578 return ZONELIST_ORDER_NODE;
3569 } 3579 }
3570 return ZONELIST_ORDER_ZONE; 3580 return ZONELIST_ORDER_ZONE;
3571 } 3581 }
3572 3582
3573 static void set_zonelist_order(void) 3583 static void set_zonelist_order(void)
3574 { 3584 {
3575 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3585 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3576 current_zonelist_order = default_zonelist_order(); 3586 current_zonelist_order = default_zonelist_order();
3577 else 3587 else
3578 current_zonelist_order = user_zonelist_order; 3588 current_zonelist_order = user_zonelist_order;
3579 } 3589 }
3580 3590
3581 static void build_zonelists(pg_data_t *pgdat) 3591 static void build_zonelists(pg_data_t *pgdat)
3582 { 3592 {
3583 int j, node, load; 3593 int j, node, load;
3584 enum zone_type i; 3594 enum zone_type i;
3585 nodemask_t used_mask; 3595 nodemask_t used_mask;
3586 int local_node, prev_node; 3596 int local_node, prev_node;
3587 struct zonelist *zonelist; 3597 struct zonelist *zonelist;
3588 int order = current_zonelist_order; 3598 int order = current_zonelist_order;
3589 3599
3590 /* initialize zonelists */ 3600 /* initialize zonelists */
3591 for (i = 0; i < MAX_ZONELISTS; i++) { 3601 for (i = 0; i < MAX_ZONELISTS; i++) {
3592 zonelist = pgdat->node_zonelists + i; 3602 zonelist = pgdat->node_zonelists + i;
3593 zonelist->_zonerefs[0].zone = NULL; 3603 zonelist->_zonerefs[0].zone = NULL;
3594 zonelist->_zonerefs[0].zone_idx = 0; 3604 zonelist->_zonerefs[0].zone_idx = 0;
3595 } 3605 }
3596 3606
3597 /* NUMA-aware ordering of nodes */ 3607 /* NUMA-aware ordering of nodes */
3598 local_node = pgdat->node_id; 3608 local_node = pgdat->node_id;
3599 load = nr_online_nodes; 3609 load = nr_online_nodes;
3600 prev_node = local_node; 3610 prev_node = local_node;
3601 nodes_clear(used_mask); 3611 nodes_clear(used_mask);
3602 3612
3603 memset(node_order, 0, sizeof(node_order)); 3613 memset(node_order, 0, sizeof(node_order));
3604 j = 0; 3614 j = 0;
3605 3615
3606 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3616 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3607 /* 3617 /*
3608 * We don't want to pressure a particular node. 3618 * We don't want to pressure a particular node.
3609 * So add a penalty to the first node in the same 3619 * So add a penalty to the first node in the same
3610 * distance group to make it round-robin. 3620 * distance group to make it round-robin.
3611 */ 3621 */
3612 if (node_distance(local_node, node) != 3622 if (node_distance(local_node, node) !=
3613 node_distance(local_node, prev_node)) 3623 node_distance(local_node, prev_node))
3614 node_load[node] = load; 3624 node_load[node] = load;
3615 3625
3616 prev_node = node; 3626 prev_node = node;
3617 load--; 3627 load--;
3618 if (order == ZONELIST_ORDER_NODE) 3628 if (order == ZONELIST_ORDER_NODE)
3619 build_zonelists_in_node_order(pgdat, node); 3629 build_zonelists_in_node_order(pgdat, node);
3620 else 3630 else
3621 node_order[j++] = node; /* remember order */ 3631 node_order[j++] = node; /* remember order */
3622 } 3632 }
3623 3633
3624 if (order == ZONELIST_ORDER_ZONE) { 3634 if (order == ZONELIST_ORDER_ZONE) {
3625 /* calculate node order -- i.e., DMA last! */ 3635 /* calculate node order -- i.e., DMA last! */
3626 build_zonelists_in_zone_order(pgdat, j); 3636 build_zonelists_in_zone_order(pgdat, j);
3627 } 3637 }
3628 3638
3629 build_thisnode_zonelists(pgdat); 3639 build_thisnode_zonelists(pgdat);
3630 } 3640 }
3631 3641
3632 /* Construct the zonelist performance cache - see further mmzone.h */ 3642 /* Construct the zonelist performance cache - see further mmzone.h */
3633 static void build_zonelist_cache(pg_data_t *pgdat) 3643 static void build_zonelist_cache(pg_data_t *pgdat)
3634 { 3644 {
3635 struct zonelist *zonelist; 3645 struct zonelist *zonelist;
3636 struct zonelist_cache *zlc; 3646 struct zonelist_cache *zlc;
3637 struct zoneref *z; 3647 struct zoneref *z;
3638 3648
3639 zonelist = &pgdat->node_zonelists[0]; 3649 zonelist = &pgdat->node_zonelists[0];
3640 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3650 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3641 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3651 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3642 for (z = zonelist->_zonerefs; z->zone; z++) 3652 for (z = zonelist->_zonerefs; z->zone; z++)
3643 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3653 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3644 } 3654 }
3645 3655
3646 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3656 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3647 /* 3657 /*
3648 * Return node id of node used for "local" allocations. 3658 * Return node id of node used for "local" allocations.
3649 * I.e., first node id of first zone in arg node's generic zonelist. 3659 * I.e., first node id of first zone in arg node's generic zonelist.
3650 * Used for initializing percpu 'numa_mem', which is used primarily 3660 * Used for initializing percpu 'numa_mem', which is used primarily
3651 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3661 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3652 */ 3662 */
3653 int local_memory_node(int node) 3663 int local_memory_node(int node)
3654 { 3664 {
3655 struct zone *zone; 3665 struct zone *zone;
3656 3666
3657 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3667 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3658 gfp_zone(GFP_KERNEL), 3668 gfp_zone(GFP_KERNEL),
3659 NULL, 3669 NULL,
3660 &zone); 3670 &zone);
3661 return zone->node; 3671 return zone->node;
3662 } 3672 }
3663 #endif 3673 #endif
3664 3674
3665 #else /* CONFIG_NUMA */ 3675 #else /* CONFIG_NUMA */
3666 3676
3667 static void set_zonelist_order(void) 3677 static void set_zonelist_order(void)
3668 { 3678 {
3669 current_zonelist_order = ZONELIST_ORDER_ZONE; 3679 current_zonelist_order = ZONELIST_ORDER_ZONE;
3670 } 3680 }
3671 3681
3672 static void build_zonelists(pg_data_t *pgdat) 3682 static void build_zonelists(pg_data_t *pgdat)
3673 { 3683 {
3674 int node, local_node; 3684 int node, local_node;
3675 enum zone_type j; 3685 enum zone_type j;
3676 struct zonelist *zonelist; 3686 struct zonelist *zonelist;
3677 3687
3678 local_node = pgdat->node_id; 3688 local_node = pgdat->node_id;
3679 3689
3680 zonelist = &pgdat->node_zonelists[0]; 3690 zonelist = &pgdat->node_zonelists[0];
3681 j = build_zonelists_node(pgdat, zonelist, 0); 3691 j = build_zonelists_node(pgdat, zonelist, 0);
3682 3692
3683 /* 3693 /*
3684 * Now we build the zonelist so that it contains the zones 3694 * Now we build the zonelist so that it contains the zones
3685 * of all the other nodes. 3695 * of all the other nodes.
3686 * We don't want to pressure a particular node, so when 3696 * We don't want to pressure a particular node, so when
3687 * building the zones for node N, we make sure that the 3697 * building the zones for node N, we make sure that the
3688 * zones coming right after the local ones are those from 3698 * zones coming right after the local ones are those from
3689 * node N+1 (modulo N) 3699 * node N+1 (modulo N)
3690 */ 3700 */
3691 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3701 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3692 if (!node_online(node)) 3702 if (!node_online(node))
3693 continue; 3703 continue;
3694 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3704 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3695 } 3705 }
3696 for (node = 0; node < local_node; node++) { 3706 for (node = 0; node < local_node; node++) {
3697 if (!node_online(node)) 3707 if (!node_online(node))
3698 continue; 3708 continue;
3699 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3709 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3700 } 3710 }
3701 3711
3702 zonelist->_zonerefs[j].zone = NULL; 3712 zonelist->_zonerefs[j].zone = NULL;
3703 zonelist->_zonerefs[j].zone_idx = 0; 3713 zonelist->_zonerefs[j].zone_idx = 0;
3704 } 3714 }
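/*
 * Illustrative sketch (editor's addition, not part of this commit): the
 * two loops above visit the remote nodes in the order local_node + 1,
 * ..., MAX_NUMNODES - 1, then 0, ..., local_node - 1, so with contiguous
 * online nodes the zonelist effectively follows (local_node + k) % nr_nodes.
 * A minimal user-space demonstration, assuming 4 online nodes and local
 * node 2 (both values are made up for the example):
 */
#include <stdio.h>

int main(void)
{
	const int nr_nodes = 4, local_node = 2;
	int node;

	printf("zonelist node order:");
	printf(" %d", local_node);		/* the local node's zones come first */
	for (node = local_node + 1; node < nr_nodes; node++)
		printf(" %d", node);		/* then the nodes after it */
	for (node = 0; node < local_node; node++)
		printf(" %d", node);		/* then wrap around to the start */
	printf("\n");				/* prints: 2 3 0 1 */
	return 0;
}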
3705 3715
3706 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3716 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3707 static void build_zonelist_cache(pg_data_t *pgdat) 3717 static void build_zonelist_cache(pg_data_t *pgdat)
3708 { 3718 {
3709 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3719 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3710 } 3720 }
3711 3721
3712 #endif /* CONFIG_NUMA */ 3722 #endif /* CONFIG_NUMA */
3713 3723
3714 /* 3724 /*
3715 * Boot pageset table. One per cpu which is going to be used for all 3725 * Boot pageset table. One per cpu which is going to be used for all
3716 * zones and all nodes. The parameters will be set in such a way 3726 * zones and all nodes. The parameters will be set in such a way
3717 * that an item put on a list will immediately be handed over to 3727 * that an item put on a list will immediately be handed over to
3718 * the buddy list. This is safe since pageset manipulation is done 3728 * the buddy list. This is safe since pageset manipulation is done
3719 * with interrupts disabled. 3729 * with interrupts disabled.
3720 * 3730 *
3721 * The boot_pagesets must be kept even after bootup is complete for 3731 * The boot_pagesets must be kept even after bootup is complete for
3722 * unused processors and/or zones. They do play a role for bootstrapping 3732 * unused processors and/or zones. They do play a role for bootstrapping
3723 * hotplugged processors. 3733 * hotplugged processors.
3724 * 3734 *
3725 * zoneinfo_show() and maybe other functions do 3735 * zoneinfo_show() and maybe other functions do
3726 * not check if the processor is online before following the pageset pointer. 3736 * not check if the processor is online before following the pageset pointer.
3727 * Other parts of the kernel may not check if the zone is available. 3737 * Other parts of the kernel may not check if the zone is available.
3728 */ 3738 */
3729 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3739 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3730 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3740 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3731 static void setup_zone_pageset(struct zone *zone); 3741 static void setup_zone_pageset(struct zone *zone);
3732 3742
3733 /* 3743 /*
3734 * Global mutex to protect against size modification of zonelists 3744 * Global mutex to protect against size modification of zonelists
3735 * as well as to serialize pageset setup for the new populated zone. 3745 * as well as to serialize pageset setup for the new populated zone.
3736 */ 3746 */
3737 DEFINE_MUTEX(zonelists_mutex); 3747 DEFINE_MUTEX(zonelists_mutex);
3738 3748
3739 /* return values int ....just for stop_machine() */ 3749 /* return values int ....just for stop_machine() */
3740 static int __build_all_zonelists(void *data) 3750 static int __build_all_zonelists(void *data)
3741 { 3751 {
3742 int nid; 3752 int nid;
3743 int cpu; 3753 int cpu;
3744 pg_data_t *self = data; 3754 pg_data_t *self = data;
3745 3755
3746 #ifdef CONFIG_NUMA 3756 #ifdef CONFIG_NUMA
3747 memset(node_load, 0, sizeof(node_load)); 3757 memset(node_load, 0, sizeof(node_load));
3748 #endif 3758 #endif
3749 3759
3750 if (self && !node_online(self->node_id)) { 3760 if (self && !node_online(self->node_id)) {
3751 build_zonelists(self); 3761 build_zonelists(self);
3752 build_zonelist_cache(self); 3762 build_zonelist_cache(self);
3753 } 3763 }
3754 3764
3755 for_each_online_node(nid) { 3765 for_each_online_node(nid) {
3756 pg_data_t *pgdat = NODE_DATA(nid); 3766 pg_data_t *pgdat = NODE_DATA(nid);
3757 3767
3758 build_zonelists(pgdat); 3768 build_zonelists(pgdat);
3759 build_zonelist_cache(pgdat); 3769 build_zonelist_cache(pgdat);
3760 } 3770 }
3761 3771
3762 /* 3772 /*
3763 * Initialize the boot_pagesets that are going to be used 3773 * Initialize the boot_pagesets that are going to be used
3764 * for bootstrapping processors. The real pagesets for 3774 * for bootstrapping processors. The real pagesets for
3765 * each zone will be allocated later when the per cpu 3775 * each zone will be allocated later when the per cpu
3766 * allocator is available. 3776 * allocator is available.
3767 * 3777 *
3768 * boot_pagesets are used also for bootstrapping offline 3778 * boot_pagesets are used also for bootstrapping offline
3769 * cpus if the system is already booted because the pagesets 3779 * cpus if the system is already booted because the pagesets
3770 * are needed to initialize allocators on a specific cpu too. 3780 * are needed to initialize allocators on a specific cpu too.
3771 * F.e. the percpu allocator needs the page allocator which 3781 * F.e. the percpu allocator needs the page allocator which
3772 * needs the percpu allocator in order to allocate its pagesets 3782 * needs the percpu allocator in order to allocate its pagesets
3773 * (a chicken-egg dilemma). 3783 * (a chicken-egg dilemma).
3774 */ 3784 */
3775 for_each_possible_cpu(cpu) { 3785 for_each_possible_cpu(cpu) {
3776 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3786 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3777 3787
3778 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3788 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3779 /* 3789 /*
3780 * We now know the "local memory node" for each node-- 3790 * We now know the "local memory node" for each node--
3781 * i.e., the node of the first zone in the generic zonelist. 3791 * i.e., the node of the first zone in the generic zonelist.
3782 * Set up numa_mem percpu variable for on-line cpus. During 3792 * Set up numa_mem percpu variable for on-line cpus. During
3783 * boot, only the boot cpu should be on-line; we'll init the 3793 * boot, only the boot cpu should be on-line; we'll init the
3784 * secondary cpus' numa_mem as they come on-line. During 3794 * secondary cpus' numa_mem as they come on-line. During
3785 * node/memory hotplug, we'll fixup all on-line cpus. 3795 * node/memory hotplug, we'll fixup all on-line cpus.
3786 */ 3796 */
3787 if (cpu_online(cpu)) 3797 if (cpu_online(cpu))
3788 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3798 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3789 #endif 3799 #endif
3790 } 3800 }
3791 3801
3792 return 0; 3802 return 0;
3793 } 3803 }
3794 3804
3795 /* 3805 /*
3796 * Called with zonelists_mutex held always 3806 * Called with zonelists_mutex held always
3797 * unless system_state == SYSTEM_BOOTING. 3807 * unless system_state == SYSTEM_BOOTING.
3798 */ 3808 */
3799 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3809 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3800 { 3810 {
3801 set_zonelist_order(); 3811 set_zonelist_order();
3802 3812
3803 if (system_state == SYSTEM_BOOTING) { 3813 if (system_state == SYSTEM_BOOTING) {
3804 __build_all_zonelists(NULL); 3814 __build_all_zonelists(NULL);
3805 mminit_verify_zonelist(); 3815 mminit_verify_zonelist();
3806 cpuset_init_current_mems_allowed(); 3816 cpuset_init_current_mems_allowed();
3807 } else { 3817 } else {
3808 #ifdef CONFIG_MEMORY_HOTPLUG 3818 #ifdef CONFIG_MEMORY_HOTPLUG
3809 if (zone) 3819 if (zone)
3810 setup_zone_pageset(zone); 3820 setup_zone_pageset(zone);
3811 #endif 3821 #endif
3812 /* we have to stop all cpus to guarantee there is no user 3822 /* we have to stop all cpus to guarantee there is no user
3813 of zonelist */ 3823 of zonelist */
3814 stop_machine(__build_all_zonelists, pgdat, NULL); 3824 stop_machine(__build_all_zonelists, pgdat, NULL);
3815 /* cpuset refresh routine should be here */ 3825 /* cpuset refresh routine should be here */
3816 } 3826 }
3817 vm_total_pages = nr_free_pagecache_pages(); 3827 vm_total_pages = nr_free_pagecache_pages();
3818 /* 3828 /*
3819 * Disable grouping by mobility if the number of pages in the 3829 * Disable grouping by mobility if the number of pages in the
3820 * system is too low to allow the mechanism to work. It would be 3830 * system is too low to allow the mechanism to work. It would be
3821 * more accurate, but expensive to check per-zone. This check is 3831 * more accurate, but expensive to check per-zone. This check is
3822 * made on memory-hotadd so a system can start with mobility 3832 * made on memory-hotadd so a system can start with mobility
3823 * disabled and enable it later 3833 * disabled and enable it later
3824 */ 3834 */
3825 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3835 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3826 page_group_by_mobility_disabled = 1; 3836 page_group_by_mobility_disabled = 1;
3827 else 3837 else
3828 page_group_by_mobility_disabled = 0; 3838 page_group_by_mobility_disabled = 0;
3829 3839
3830 printk("Built %i zonelists in %s order, mobility grouping %s. " 3840 printk("Built %i zonelists in %s order, mobility grouping %s. "
3831 "Total pages: %ld\n", 3841 "Total pages: %ld\n",
3832 nr_online_nodes, 3842 nr_online_nodes,
3833 zonelist_order_name[current_zonelist_order], 3843 zonelist_order_name[current_zonelist_order],
3834 page_group_by_mobility_disabled ? "off" : "on", 3844 page_group_by_mobility_disabled ? "off" : "on",
3835 vm_total_pages); 3845 vm_total_pages);
3836 #ifdef CONFIG_NUMA 3846 #ifdef CONFIG_NUMA
3837 printk("Policy zone: %s\n", zone_names[policy_zone]); 3847 printk("Policy zone: %s\n", zone_names[policy_zone]);
3838 #endif 3848 #endif
3839 } 3849 }
3840 3850
3841 /* 3851 /*
3842 * Helper functions to size the waitqueue hash table. 3852 * Helper functions to size the waitqueue hash table.
3843 * Essentially these want to choose hash table sizes sufficiently 3853 * Essentially these want to choose hash table sizes sufficiently
3844 * large so that collisions trying to wait on pages are rare. 3854 * large so that collisions trying to wait on pages are rare.
3845 * But in fact, the number of active page waitqueues on typical 3855 * But in fact, the number of active page waitqueues on typical
3846 * systems is ridiculously low, less than 200. So this is 3856 * systems is ridiculously low, less than 200. So this is
3847 * conservative, even though it seems large. 3857 * conservative, even though it seems large.
3848 * 3858 *
3849 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3859 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3850 * waitqueues, i.e. the size of the waitq table given the number of pages. 3860 * waitqueues, i.e. the size of the waitq table given the number of pages.
3851 */ 3861 */
3852 #define PAGES_PER_WAITQUEUE 256 3862 #define PAGES_PER_WAITQUEUE 256
3853 3863
3854 #ifndef CONFIG_MEMORY_HOTPLUG 3864 #ifndef CONFIG_MEMORY_HOTPLUG
3855 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3865 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3856 { 3866 {
3857 unsigned long size = 1; 3867 unsigned long size = 1;
3858 3868
3859 pages /= PAGES_PER_WAITQUEUE; 3869 pages /= PAGES_PER_WAITQUEUE;
3860 3870
3861 while (size < pages) 3871 while (size < pages)
3862 size <<= 1; 3872 size <<= 1;
3863 3873
3864 /* 3874 /*
3865 * Once we have dozens or even hundreds of threads sleeping 3875 * Once we have dozens or even hundreds of threads sleeping
3866 * on IO we've got bigger problems than wait queue collision. 3876 * on IO we've got bigger problems than wait queue collision.
3867 * Limit the size of the wait table to a reasonable size. 3877 * Limit the size of the wait table to a reasonable size.
3868 */ 3878 */
3869 size = min(size, 4096UL); 3879 size = min(size, 4096UL);
3870 3880
3871 return max(size, 4UL); 3881 return max(size, 4UL);
3872 } 3882 }
3873 #else 3883 #else
3874 /* 3884 /*
3875 * A zone's size might be changed by hot-add, so it is not possible to determine 3885 * A zone's size might be changed by hot-add, so it is not possible to determine
3876 * a suitable size for its wait_table. So we use the maximum size now. 3886 * a suitable size for its wait_table. So we use the maximum size now.
3877 * 3887 *
3878 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3888 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3879 * 3889 *
3880 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3890 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3881 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3891 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3882 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3892 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3883 * 3893 *
3884 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3894 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3885 * or more by the traditional way. (See above). It equals: 3895 * or more by the traditional way. (See above). It equals:
3886 * 3896 *
3887 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3897 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3888 * ia64(16K page size) : = ( 8G + 4M)byte. 3898 * ia64(16K page size) : = ( 8G + 4M)byte.
3889 * powerpc (64K page size) : = (32G +16M)byte. 3899 * powerpc (64K page size) : = (32G +16M)byte.
3890 */ 3900 */
3891 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3901 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3892 { 3902 {
3893 return 4096UL; 3903 return 4096UL;
3894 } 3904 }
3895 #endif 3905 #endif
3896 3906
3897 /* 3907 /*
3898 * This is an integer logarithm so that shifts can be used later 3908 * This is an integer logarithm so that shifts can be used later
3899 * to extract the more random high bits from the multiplicative 3909 * to extract the more random high bits from the multiplicative
3900 * hash function before the remainder is taken. 3910 * hash function before the remainder is taken.
3901 */ 3911 */
3902 static inline unsigned long wait_table_bits(unsigned long size) 3912 static inline unsigned long wait_table_bits(unsigned long size)
3903 { 3913 {
3904 return ffz(~size); 3914 return ffz(~size);
3905 } 3915 }
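/*
 * Illustrative sketch (editor's addition, not part of this commit): a
 * user-space replay of the sizing logic above for the
 * !CONFIG_MEMORY_HOTPLUG variant, to show the numbers it produces.
 * hash_nr_entries() is a local stand-in, not a kernel API, and the page
 * counts are made up. ffz(~size) is the kernel's "find first zero bit";
 * for a power-of-two size it equals log2(size), so __builtin_ctzl() is
 * used here as an equivalent.
 */
#include <stdio.h>

static unsigned long hash_nr_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= 256;			/* PAGES_PER_WAITQUEUE */
	while (size < pages)
		size <<= 1;		/* round up to a power of two */
	if (size > 4096)
		size = 4096;		/* upper clamp */
	if (size < 4)
		size = 4;		/* lower clamp */
	return size;
}

int main(void)
{
	unsigned long pages[] = { 500, 100000, 1000000 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long size = hash_nr_entries(pages[i]);

		/* prints 4/2, 512/9 and 4096/12 respectively */
		printf("%lu pages -> %lu entries, %d bits\n",
		       pages[i], size, __builtin_ctzl(size));
	}
	return 0;
}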
3906 3916
3907 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3917 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3908 3918
3909 /* 3919 /*
3910 * Check if a pageblock contains reserved pages 3920 * Check if a pageblock contains reserved pages
3911 */ 3921 */
3912 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3922 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3913 { 3923 {
3914 unsigned long pfn; 3924 unsigned long pfn;
3915 3925
3916 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3926 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3917 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3927 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3918 return 1; 3928 return 1;
3919 } 3929 }
3920 return 0; 3930 return 0;
3921 } 3931 }
3922 3932
3923 /* 3933 /*
3924 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3934 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3925 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3935 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3926 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3936 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3927 * higher will lead to a bigger reserve which will get freed as contiguous 3937 * higher will lead to a bigger reserve which will get freed as contiguous
3928 * blocks as reclaim kicks in 3938 * blocks as reclaim kicks in
3929 */ 3939 */
3930 static void setup_zone_migrate_reserve(struct zone *zone) 3940 static void setup_zone_migrate_reserve(struct zone *zone)
3931 { 3941 {
3932 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3942 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3933 struct page *page; 3943 struct page *page;
3934 unsigned long block_migratetype; 3944 unsigned long block_migratetype;
3935 int reserve; 3945 int reserve;
3936 int old_reserve; 3946 int old_reserve;
3937 3947
3938 /* 3948 /*
3939 * Get the start pfn, end pfn and the number of blocks to reserve 3949 * Get the start pfn, end pfn and the number of blocks to reserve
3940 * We have to be careful to be aligned to pageblock_nr_pages to 3950 * We have to be careful to be aligned to pageblock_nr_pages to
3941 * make sure that we always check pfn_valid for the first page in 3951 * make sure that we always check pfn_valid for the first page in
3942 * the block. 3952 * the block.
3943 */ 3953 */
3944 start_pfn = zone->zone_start_pfn; 3954 start_pfn = zone->zone_start_pfn;
3945 end_pfn = zone_end_pfn(zone); 3955 end_pfn = zone_end_pfn(zone);
3946 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3956 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3947 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3957 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3948 pageblock_order; 3958 pageblock_order;
3949 3959
3950 /* 3960 /*
3951 * Reserve blocks are generally in place to help high-order atomic 3961 * Reserve blocks are generally in place to help high-order atomic
3952 * allocations that are short-lived. A min_free_kbytes value that 3962 * allocations that are short-lived. A min_free_kbytes value that
3953 * would result in more than 2 reserve blocks for atomic allocations 3963 * would result in more than 2 reserve blocks for atomic allocations
3954 * is assumed to be in place to help anti-fragmentation for the 3964 * is assumed to be in place to help anti-fragmentation for the
3955 * future allocation of hugepages at runtime. 3965 * future allocation of hugepages at runtime.
3956 */ 3966 */
3957 reserve = min(2, reserve); 3967 reserve = min(2, reserve);
3958 old_reserve = zone->nr_migrate_reserve_block; 3968 old_reserve = zone->nr_migrate_reserve_block;
3959 3969
3960 /* On memory hot-add, we almost always need to do nothing */ 3970 /* On memory hot-add, we almost always need to do nothing */
3961 if (reserve == old_reserve) 3971 if (reserve == old_reserve)
3962 return; 3972 return;
3963 zone->nr_migrate_reserve_block = reserve; 3973 zone->nr_migrate_reserve_block = reserve;
3964 3974
3965 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3975 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3966 if (!pfn_valid(pfn)) 3976 if (!pfn_valid(pfn))
3967 continue; 3977 continue;
3968 page = pfn_to_page(pfn); 3978 page = pfn_to_page(pfn);
3969 3979
3970 /* Watch out for overlapping nodes */ 3980 /* Watch out for overlapping nodes */
3971 if (page_to_nid(page) != zone_to_nid(zone)) 3981 if (page_to_nid(page) != zone_to_nid(zone))
3972 continue; 3982 continue;
3973 3983
3974 block_migratetype = get_pageblock_migratetype(page); 3984 block_migratetype = get_pageblock_migratetype(page);
3975 3985
3976 /* Only test what is necessary when the reserves are not met */ 3986 /* Only test what is necessary when the reserves are not met */
3977 if (reserve > 0) { 3987 if (reserve > 0) {
3978 /* 3988 /*
3979 * Blocks with reserved pages will never free, skip 3989 * Blocks with reserved pages will never free, skip
3980 * them. 3990 * them.
3981 */ 3991 */
3982 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3992 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3983 if (pageblock_is_reserved(pfn, block_end_pfn)) 3993 if (pageblock_is_reserved(pfn, block_end_pfn))
3984 continue; 3994 continue;
3985 3995
3986 /* If this block is reserved, account for it */ 3996 /* If this block is reserved, account for it */
3987 if (block_migratetype == MIGRATE_RESERVE) { 3997 if (block_migratetype == MIGRATE_RESERVE) {
3988 reserve--; 3998 reserve--;
3989 continue; 3999 continue;
3990 } 4000 }
3991 4001
3992 /* Suitable for reserving if this block is movable */ 4002 /* Suitable for reserving if this block is movable */
3993 if (block_migratetype == MIGRATE_MOVABLE) { 4003 if (block_migratetype == MIGRATE_MOVABLE) {
3994 set_pageblock_migratetype(page, 4004 set_pageblock_migratetype(page,
3995 MIGRATE_RESERVE); 4005 MIGRATE_RESERVE);
3996 move_freepages_block(zone, page, 4006 move_freepages_block(zone, page,
3997 MIGRATE_RESERVE); 4007 MIGRATE_RESERVE);
3998 reserve--; 4008 reserve--;
3999 continue; 4009 continue;
4000 } 4010 }
4001 } else if (!old_reserve) { 4011 } else if (!old_reserve) {
4002 /* 4012 /*
4003 * At boot time we don't need to scan the whole zone 4013 * At boot time we don't need to scan the whole zone
4004 * for turning off MIGRATE_RESERVE. 4014 * for turning off MIGRATE_RESERVE.
4005 */ 4015 */
4006 break; 4016 break;
4007 } 4017 }
4008 4018
4009 /* 4019 /*
4010 * If the reserve is met and this is a previous reserved block, 4020 * If the reserve is met and this is a previous reserved block,
4011 * take it back 4021 * take it back
4012 */ 4022 */
4013 if (block_migratetype == MIGRATE_RESERVE) { 4023 if (block_migratetype == MIGRATE_RESERVE) {
4014 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4024 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4015 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4025 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4016 } 4026 }
4017 } 4027 }
4018 } 4028 }
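/*
 * Illustrative sketch (editor's addition, not part of this commit): how
 * the number of MIGRATE_RESERVE pageblocks above is derived. It assumes
 * typical x86 values of pageblock_order = 9, i.e. 2 MiB pageblocks of
 * 512 4-KiB pages, and made-up watermarks; roundup_ul() is a local
 * stand-in for the kernel's roundup(). The cap of 2 mirrors
 * "reserve = min(2, reserve)" in setup_zone_migrate_reserve().
 */
#include <stdio.h>

#define PAGEBLOCK_ORDER		9
#define PAGEBLOCK_NR_PAGES	(1UL << PAGEBLOCK_ORDER)

static unsigned long roundup_ul(unsigned long x, unsigned long y)
{
	return ((x + y - 1) / y) * y;	/* round x up to a multiple of y */
}

int main(void)
{
	unsigned long min_wmark[] = { 300, 1000, 5000 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long blocks =
			roundup_ul(min_wmark[i], PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;

		if (blocks > 2)
			blocks = 2;	/* reserve = min(2, reserve) */
		/* prints 1, 2 and 2 reserve blocks respectively */
		printf("min_wmark=%lu pages -> %lu reserve block(s)\n",
		       min_wmark[i], blocks);
	}
	return 0;
}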
4019 4029
4020 /* 4030 /*
4021 * Initially all pages are reserved - free ones are freed 4031 * Initially all pages are reserved - free ones are freed
4022 * up by free_all_bootmem() once the early boot process is 4032 * up by free_all_bootmem() once the early boot process is
4023 * done. Non-atomic initialization, single-pass. 4033 * done. Non-atomic initialization, single-pass.
4024 */ 4034 */
4025 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4035 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4026 unsigned long start_pfn, enum memmap_context context) 4036 unsigned long start_pfn, enum memmap_context context)
4027 { 4037 {
4028 struct page *page; 4038 struct page *page;
4029 unsigned long end_pfn = start_pfn + size; 4039 unsigned long end_pfn = start_pfn + size;
4030 unsigned long pfn; 4040 unsigned long pfn;
4031 struct zone *z; 4041 struct zone *z;
4032 4042
4033 if (highest_memmap_pfn < end_pfn - 1) 4043 if (highest_memmap_pfn < end_pfn - 1)
4034 highest_memmap_pfn = end_pfn - 1; 4044 highest_memmap_pfn = end_pfn - 1;
4035 4045
4036 z = &NODE_DATA(nid)->node_zones[zone]; 4046 z = &NODE_DATA(nid)->node_zones[zone];
4037 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4047 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4038 /* 4048 /*
4039 * There can be holes in boot-time mem_map[]s 4049 * There can be holes in boot-time mem_map[]s
4040 * handed to this function. They do not 4050 * handed to this function. They do not
4041 * exist on hotplugged memory. 4051 * exist on hotplugged memory.
4042 */ 4052 */
4043 if (context == MEMMAP_EARLY) { 4053 if (context == MEMMAP_EARLY) {
4044 if (!early_pfn_valid(pfn)) 4054 if (!early_pfn_valid(pfn))
4045 continue; 4055 continue;
4046 if (!early_pfn_in_nid(pfn, nid)) 4056 if (!early_pfn_in_nid(pfn, nid))
4047 continue; 4057 continue;
4048 } 4058 }
4049 page = pfn_to_page(pfn); 4059 page = pfn_to_page(pfn);
4050 set_page_links(page, zone, nid, pfn); 4060 set_page_links(page, zone, nid, pfn);
4051 mminit_verify_page_links(page, zone, nid, pfn); 4061 mminit_verify_page_links(page, zone, nid, pfn);
4052 init_page_count(page); 4062 init_page_count(page);
4053 page_mapcount_reset(page); 4063 page_mapcount_reset(page);
4054 page_nid_reset_last(page); 4064 page_nid_reset_last(page);
4055 SetPageReserved(page); 4065 SetPageReserved(page);
4056 /* 4066 /*
4057 * Mark the block movable so that blocks are reserved for 4067 * Mark the block movable so that blocks are reserved for
4058 * movable at startup. This will force kernel allocations 4068 * movable at startup. This will force kernel allocations
4059 * to reserve their blocks rather than leaking throughout 4069 * to reserve their blocks rather than leaking throughout
4060 * the address space during boot when many long-lived 4070 * the address space during boot when many long-lived
4061 * kernel allocations are made. Later some blocks near 4071 * kernel allocations are made. Later some blocks near
4062 * the start are marked MIGRATE_RESERVE by 4072 * the start are marked MIGRATE_RESERVE by
4063 * setup_zone_migrate_reserve() 4073 * setup_zone_migrate_reserve()
4064 * 4074 *
4065 * The bitmap is created for the zone's valid pfn range, but the memmap 4075 * The bitmap is created for the zone's valid pfn range, but the memmap
4066 * can be created for invalid pages (for alignment), so 4076 * can be created for invalid pages (for alignment), so
4067 * check here that set_pageblock_migratetype() is not called against a 4077 * check here that set_pageblock_migratetype() is not called against a
4068 * pfn outside the zone. 4078 * pfn outside the zone.
4069 */ 4079 */
4070 if ((z->zone_start_pfn <= pfn) 4080 if ((z->zone_start_pfn <= pfn)
4071 && (pfn < zone_end_pfn(z)) 4081 && (pfn < zone_end_pfn(z))
4072 && !(pfn & (pageblock_nr_pages - 1))) 4082 && !(pfn & (pageblock_nr_pages - 1)))
4073 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4083 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4074 4084
4075 INIT_LIST_HEAD(&page->lru); 4085 INIT_LIST_HEAD(&page->lru);
4076 #ifdef WANT_PAGE_VIRTUAL 4086 #ifdef WANT_PAGE_VIRTUAL
4077 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4087 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4078 if (!is_highmem_idx(zone)) 4088 if (!is_highmem_idx(zone))
4079 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4089 set_page_address(page, __va(pfn << PAGE_SHIFT));
4080 #endif 4090 #endif
4081 } 4091 }
4082 } 4092 }
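/*
 * Illustrative sketch (editor's addition, not part of this commit): the
 * "!(pfn & (pageblock_nr_pages - 1))" test above is a power-of-two
 * alignment check, so set_pageblock_migratetype() runs only once per
 * pageblock. The 512-page pageblock size and the pfn range below are
 * assumptions for the example.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long pageblock_nr_pages = 512;
	unsigned long pfn;

	for (pfn = 1024; pfn < 1024 + 1536; pfn++) {
		if (!(pfn & (pageblock_nr_pages - 1)))
			/* prints 1024, 1536 and 2048: one hit per pageblock */
			printf("pageblock starts at pfn %lu\n", pfn);
	}
	return 0;
}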
4083 4093
4084 static void __meminit zone_init_free_lists(struct zone *zone) 4094 static void __meminit zone_init_free_lists(struct zone *zone)
4085 { 4095 {
4086 int order, t; 4096 int order, t;
4087 for_each_migratetype_order(order, t) { 4097 for_each_migratetype_order(order, t) {
4088 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4098 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4089 zone->free_area[order].nr_free = 0; 4099 zone->free_area[order].nr_free = 0;
4090 } 4100 }
4091 } 4101 }
4092 4102
4093 #ifndef __HAVE_ARCH_MEMMAP_INIT 4103 #ifndef __HAVE_ARCH_MEMMAP_INIT
4094 #define memmap_init(size, nid, zone, start_pfn) \ 4104 #define memmap_init(size, nid, zone, start_pfn) \
4095 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4105 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4096 #endif 4106 #endif
4097 4107
4098 static int zone_batchsize(struct zone *zone) 4108 static int zone_batchsize(struct zone *zone)
4099 { 4109 {
4100 #ifdef CONFIG_MMU 4110 #ifdef CONFIG_MMU
4101 int batch; 4111 int batch;
4102 4112
4103 /* 4113 /*
4104 * The per-cpu-pages pools are set to around 1000th of the 4114 * The per-cpu-pages pools are set to around 1000th of the
4105 * size of the zone. But no more than 1/2 of a meg. 4115 * size of the zone. But no more than 1/2 of a meg.
4106 * 4116 *
4107 * OK, so we don't know how big the cache is. So guess. 4117 * OK, so we don't know how big the cache is. So guess.
4108 */ 4118 */
4109 batch = zone->managed_pages / 1024; 4119 batch = zone->managed_pages / 1024;
4110 if (batch * PAGE_SIZE > 512 * 1024) 4120 if (batch * PAGE_SIZE > 512 * 1024)
4111 batch = (512 * 1024) / PAGE_SIZE; 4121 batch = (512 * 1024) / PAGE_SIZE;
4112 batch /= 4; /* We effectively *= 4 below */ 4122 batch /= 4; /* We effectively *= 4 below */
4113 if (batch < 1) 4123 if (batch < 1)
4114 batch = 1; 4124 batch = 1;
4115 4125
4116 /* 4126 /*
4117 * Clamp the batch to a 2^n - 1 value. Having a power 4127 * Clamp the batch to a 2^n - 1 value. Having a power
4118 * of 2 value was found to be more likely to have 4128 * of 2 value was found to be more likely to have
4119 * suboptimal cache aliasing properties in some cases. 4129 * suboptimal cache aliasing properties in some cases.
4120 * 4130 *
4121 * For example if 2 tasks are alternately allocating 4131 * For example if 2 tasks are alternately allocating
4122 * batches of pages, one task can end up with a lot 4132 * batches of pages, one task can end up with a lot
4123 * of pages of one half of the possible page colors 4133 * of pages of one half of the possible page colors
4124 * and the other with pages of the other colors. 4134 * and the other with pages of the other colors.
4125 */ 4135 */
4126 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4136 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4127 4137
4128 return batch; 4138 return batch;
4129 4139
4130 #else 4140 #else
4131 /* The deferral and batching of frees should be suppressed under NOMMU 4141 /* The deferral and batching of frees should be suppressed under NOMMU
4132 * conditions. 4142 * conditions.
4133 * 4143 *
4134 * The problem is that NOMMU needs to be able to allocate large chunks 4144 * The problem is that NOMMU needs to be able to allocate large chunks
4135 * of contiguous memory as there's no hardware page translation to 4145 * of contiguous memory as there's no hardware page translation to
4136 * assemble apparent contiguous memory from discontiguous pages. 4146 * assemble apparent contiguous memory from discontiguous pages.
4137 * 4147 *
4138 * Queueing large contiguous runs of pages for batching, however, 4148 * Queueing large contiguous runs of pages for batching, however,
4139 * causes the pages to actually be freed in smaller chunks. As there 4149 * causes the pages to actually be freed in smaller chunks. As there
4140 * can be a significant delay between the individual batches being 4150 * can be a significant delay between the individual batches being
4141 * recycled, this leads to the once large chunks of space being 4151 * recycled, this leads to the once large chunks of space being
4142 * fragmented and becoming unavailable for high-order allocations. 4152 * fragmented and becoming unavailable for high-order allocations.
4143 */ 4153 */
4144 return 0; 4154 return 0;
4145 #endif 4155 #endif
4146 } 4156 }
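/*
 * Illustrative sketch (editor's addition, not part of this commit): the
 * batch value the CONFIG_MMU path above computes for a zone with 1 GiB
 * of managed memory, assuming 4 KiB pages (262144 managed pages).
 * rounddown_pow_of_two_ul() is a local stand-in for the kernel's
 * rounddown_pow_of_two() for the values used here.
 */
#include <stdio.h>

static unsigned long rounddown_pow_of_two_ul(unsigned long n)
{
	unsigned long p = 1;

	while (p * 2 <= n)
		p *= 2;			/* largest power of two <= n */
	return p;
}

int main(void)
{
	const unsigned long page_size = 4096;
	unsigned long managed_pages = 262144;	/* 1 GiB of 4 KiB pages */
	unsigned long batch;

	batch = managed_pages / 1024;		/* ~1/1000th of the zone: 256 */
	if (batch * page_size > 512 * 1024)
		batch = (512 * 1024) / page_size; /* cap at 512 KiB worth: 128 */
	batch /= 4;				/* 32 */
	if (batch < 1)
		batch = 1;
	batch = rounddown_pow_of_two_ul(batch + batch / 2) - 1; /* 2^n - 1: 31 */

	printf("batch = %lu\n", batch);		/* prints 31 */
	return 0;
}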
4147 4157
4148 /* 4158 /*
4149 * pcp->high and pcp->batch values are related and dependent on one another: 4159 * pcp->high and pcp->batch values are related and dependent on one another:
4150 * ->batch must never be higher than ->high. 4160 * ->batch must never be higher than ->high.
4151 * The following function updates them in a safe manner without read side 4161 * The following function updates them in a safe manner without read side
4152 * locking. 4162 * locking.
4153 * 4163 *
4154 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4164 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4155 * those fields changing asynchronously (according to the above rule). 4165 * those fields changing asynchronously (according to the above rule).
4156 * 4166 *
4157 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 4167 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4158 * outside of boot time (or some other assurance that no concurrent updaters 4168 * outside of boot time (or some other assurance that no concurrent updaters
4159 * exist). 4169 * exist).
4160 */ 4170 */
4161 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4171 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4162 unsigned long batch) 4172 unsigned long batch)
4163 { 4173 {
4164 /* start with a fail safe value for batch */ 4174 /* start with a fail safe value for batch */
4165 pcp->batch = 1; 4175 pcp->batch = 1;
4166 smp_wmb(); 4176 smp_wmb();
4167 4177
4168 /* Update high, then batch, in order */ 4178 /* Update high, then batch, in order */
4169 pcp->high = high; 4179 pcp->high = high;
4170 smp_wmb(); 4180 smp_wmb();
4171 4181
4172 pcp->batch = batch; 4182 pcp->batch = batch;
4173 } 4183 }
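/*
 * Illustrative sketch (editor's addition, not part of this commit): why
 * the store order above is safe for lockless readers. Between the writes
 * a reader may observe (old high, batch 1), (new high, batch 1) or
 * (new high, new batch), but never batch > high, provided the caller
 * passes a batch no larger than the new high and high is never set below
 * 1. The sketch replays the three stores and checks that invariant after
 * each one; it is single-threaded, so the smp_wmb() barriers are omitted,
 * and the concrete values are assumptions taken from the other examples.
 */
#include <assert.h>
#include <stdio.h>

struct pcp { unsigned long high, batch; };

static void check(const struct pcp *p)
{
	assert(p->batch <= p->high);	/* the invariant readers rely on */
}

int main(void)
{
	struct pcp p = { .high = 6 * 31, .batch = 31 };	/* old values */
	unsigned long new_high = 1000, new_batch = 96;

	p.batch = 1;		check(&p);	/* fail-safe batch first */
	p.high = new_high;	check(&p);	/* then the new high */
	p.batch = new_batch;	check(&p);	/* finally the new batch */

	printf("batch <= high held across the update\n");
	return 0;
}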
4174 4184
4175 /* a companion to pageset_set_high() */ 4185 /* a companion to pageset_set_high() */
4176 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4186 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4177 { 4187 {
4178 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4188 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4179 } 4189 }
4180 4190
4181 static void pageset_init(struct per_cpu_pageset *p) 4191 static void pageset_init(struct per_cpu_pageset *p)
4182 { 4192 {
4183 struct per_cpu_pages *pcp; 4193 struct per_cpu_pages *pcp;
4184 int migratetype; 4194 int migratetype;
4185 4195
4186 memset(p, 0, sizeof(*p)); 4196 memset(p, 0, sizeof(*p));
4187 4197
4188 pcp = &p->pcp; 4198 pcp = &p->pcp;
4189 pcp->count = 0; 4199 pcp->count = 0;
4190 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4200 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4191 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4201 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4192 } 4202 }
4193 4203
4194 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4204 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4195 { 4205 {
4196 pageset_init(p); 4206 pageset_init(p);
4197 pageset_set_batch(p, batch); 4207 pageset_set_batch(p, batch);
4198 } 4208 }
4199 4209
4200 /* 4210 /*
4201 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4211 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4202 * to the value high for the pageset p. 4212 * to the value high for the pageset p.
4203 */ 4213 */
4204 static void pageset_set_high(struct per_cpu_pageset *p, 4214 static void pageset_set_high(struct per_cpu_pageset *p,
4205 unsigned long high) 4215 unsigned long high)
4206 { 4216 {
4207 unsigned long batch = max(1UL, high / 4); 4217 unsigned long batch = max(1UL, high / 4);
4208 if ((high / 4) > (PAGE_SHIFT * 8)) 4218 if ((high / 4) > (PAGE_SHIFT * 8))
4209 batch = PAGE_SHIFT * 8; 4219 batch = PAGE_SHIFT * 8;
4210 4220
4211 pageset_update(&p->pcp, high, batch); 4221 pageset_update(&p->pcp, high, batch);
4212 } 4222 }
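/*
 * Illustrative sketch (editor's addition, not part of this commit): the
 * batch chosen by pageset_set_high() above, assuming PAGE_SHIFT is 12
 * (4 KiB pages) so the cap is 12 * 8 = 96; the high values are made up.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long page_shift = 12;
	unsigned long highs[] = { 2, 200, 1000 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long high = highs[i];
		unsigned long batch = high / 4 > 1 ? high / 4 : 1;

		if (high / 4 > page_shift * 8)
			batch = page_shift * 8;	/* cap at PAGE_SHIFT * 8 */
		/* prints batch 1, 50 and 96 respectively */
		printf("high=%lu -> batch=%lu\n", high, batch);
	}
	return 0;
}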
4213 4223
4214 static void pageset_set_high_and_batch(struct zone *zone, 4224 static void pageset_set_high_and_batch(struct zone *zone,
4215 struct per_cpu_pageset *pcp) 4225 struct per_cpu_pageset *pcp)
4216 { 4226 {
4217 if (percpu_pagelist_fraction) 4227 if (percpu_pagelist_fraction)
4218 pageset_set_high(pcp, 4228 pageset_set_high(pcp,
4219 (zone->managed_pages / 4229 (zone->managed_pages /
4220 percpu_pagelist_fraction)); 4230 percpu_pagelist_fraction));
4221 else 4231 else
4222 pageset_set_batch(pcp, zone_batchsize(zone)); 4232 pageset_set_batch(pcp, zone_batchsize(zone));
4223 } 4233 }
4224 4234
4225 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4235 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4226 { 4236 {
4227 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4237 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4228 4238
4229 pageset_init(pcp); 4239 pageset_init(pcp);
4230 pageset_set_high_and_batch(zone, pcp); 4240 pageset_set_high_and_batch(zone, pcp);
4231 } 4241 }
4232 4242
4233 static void __meminit setup_zone_pageset(struct zone *zone) 4243 static void __meminit setup_zone_pageset(struct zone *zone)
4234 { 4244 {
4235 int cpu; 4245 int cpu;
4236 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4246 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4237 for_each_possible_cpu(cpu) 4247 for_each_possible_cpu(cpu)
4238 zone_pageset_init(zone, cpu); 4248 zone_pageset_init(zone, cpu);
4239 } 4249 }
4240 4250
4241 /* 4251 /*
4242 * Allocate per cpu pagesets and initialize them. 4252 * Allocate per cpu pagesets and initialize them.
4243 * Before this call only boot pagesets were available. 4253 * Before this call only boot pagesets were available.
4244 */ 4254 */
4245 void __init setup_per_cpu_pageset(void) 4255 void __init setup_per_cpu_pageset(void)
4246 { 4256 {
4247 struct zone *zone; 4257 struct zone *zone;
4248 4258
4249 for_each_populated_zone(zone) 4259 for_each_populated_zone(zone)
4250 setup_zone_pageset(zone); 4260 setup_zone_pageset(zone);
4251 } 4261 }
4252 4262
4253 static noinline __init_refok 4263 static noinline __init_refok
4254 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4264 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4255 { 4265 {
4256 int i; 4266 int i;
4257 struct pglist_data *pgdat = zone->zone_pgdat; 4267 struct pglist_data *pgdat = zone->zone_pgdat;
4258 size_t alloc_size; 4268 size_t alloc_size;
4259 4269
4260 /* 4270 /*
4261 * The per-page waitqueue mechanism uses hashed waitqueues 4271 * The per-page waitqueue mechanism uses hashed waitqueues
4262 * per zone. 4272 * per zone.
4263 */ 4273 */
4264 zone->wait_table_hash_nr_entries = 4274 zone->wait_table_hash_nr_entries =
4265 wait_table_hash_nr_entries(zone_size_pages); 4275 wait_table_hash_nr_entries(zone_size_pages);
4266 zone->wait_table_bits = 4276 zone->wait_table_bits =
4267 wait_table_bits(zone->wait_table_hash_nr_entries); 4277 wait_table_bits(zone->wait_table_hash_nr_entries);
4268 alloc_size = zone->wait_table_hash_nr_entries 4278 alloc_size = zone->wait_table_hash_nr_entries
4269 * sizeof(wait_queue_head_t); 4279 * sizeof(wait_queue_head_t);
4270 4280
4271 if (!slab_is_available()) { 4281 if (!slab_is_available()) {
4272 zone->wait_table = (wait_queue_head_t *) 4282 zone->wait_table = (wait_queue_head_t *)
4273 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4283 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4274 } else { 4284 } else {
4275 /* 4285 /*
4276 * This case means that a zone whose size was 0 gets new memory 4286 * This case means that a zone whose size was 0 gets new memory
4277 * via memory hot-add. 4287 * via memory hot-add.
4278 * But it may be the case that a new node was hot-added. In 4288 * But it may be the case that a new node was hot-added. In
4279 * this case vmalloc() will not be able to use this new node's 4289 * this case vmalloc() will not be able to use this new node's
4280 * memory - this wait_table must be initialized to use this new 4290 * memory - this wait_table must be initialized to use this new
4281 * node itself as well. 4291 * node itself as well.
4282 * To use this new node's memory, further consideration will be 4292 * To use this new node's memory, further consideration will be
4283 * necessary. 4293 * necessary.
4284 */ 4294 */
4285 zone->wait_table = vmalloc(alloc_size); 4295 zone->wait_table = vmalloc(alloc_size);
4286 } 4296 }
4287 if (!zone->wait_table) 4297 if (!zone->wait_table)
4288 return -ENOMEM; 4298 return -ENOMEM;
4289 4299
4290 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4300 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4291 init_waitqueue_head(zone->wait_table + i); 4301 init_waitqueue_head(zone->wait_table + i);
4292 4302
4293 return 0; 4303 return 0;
4294 } 4304 }
4295 4305
4296 static __meminit void zone_pcp_init(struct zone *zone) 4306 static __meminit void zone_pcp_init(struct zone *zone)
4297 { 4307 {
4298 /* 4308 /*
4299 * per cpu subsystem is not up at this point. The following code 4309 * per cpu subsystem is not up at this point. The following code
4300 * relies on the ability of the linker to provide the 4310 * relies on the ability of the linker to provide the
4301 * offset of a (static) per cpu variable into the per cpu area. 4311 * offset of a (static) per cpu variable into the per cpu area.
4302 */ 4312 */
4303 zone->pageset = &boot_pageset; 4313 zone->pageset = &boot_pageset;
4304 4314
4305 if (zone->present_pages) 4315 if (zone->present_pages)
4306 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4316 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4307 zone->name, zone->present_pages, 4317 zone->name, zone->present_pages,
4308 zone_batchsize(zone)); 4318 zone_batchsize(zone));
4309 } 4319 }
4310 4320
4311 int __meminit init_currently_empty_zone(struct zone *zone, 4321 int __meminit init_currently_empty_zone(struct zone *zone,
4312 unsigned long zone_start_pfn, 4322 unsigned long zone_start_pfn,
4313 unsigned long size, 4323 unsigned long size,
4314 enum memmap_context context) 4324 enum memmap_context context)
4315 { 4325 {
4316 struct pglist_data *pgdat = zone->zone_pgdat; 4326 struct pglist_data *pgdat = zone->zone_pgdat;
4317 int ret; 4327 int ret;
4318 ret = zone_wait_table_init(zone, size); 4328 ret = zone_wait_table_init(zone, size);
4319 if (ret) 4329 if (ret)
4320 return ret; 4330 return ret;
4321 pgdat->nr_zones = zone_idx(zone) + 1; 4331 pgdat->nr_zones = zone_idx(zone) + 1;
4322 4332
4323 zone->zone_start_pfn = zone_start_pfn; 4333 zone->zone_start_pfn = zone_start_pfn;
4324 4334
4325 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4335 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4326 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4336 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4327 pgdat->node_id, 4337 pgdat->node_id,
4328 (unsigned long)zone_idx(zone), 4338 (unsigned long)zone_idx(zone),
4329 zone_start_pfn, (zone_start_pfn + size)); 4339 zone_start_pfn, (zone_start_pfn + size));
4330 4340
4331 zone_init_free_lists(zone); 4341 zone_init_free_lists(zone);
4332 4342
4333 return 0; 4343 return 0;
4334 } 4344 }
4335 4345
4336 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4346 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4337 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4347 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4338 /* 4348 /*
4339 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4349 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4340 * Architectures may implement their own version but if add_active_range() 4350 * Architectures may implement their own version but if add_active_range()
4341 * was used and there are no special requirements, this is a convenient 4351 * was used and there are no special requirements, this is a convenient
4342 * alternative 4352 * alternative
4343 */ 4353 */
4344 int __meminit __early_pfn_to_nid(unsigned long pfn) 4354 int __meminit __early_pfn_to_nid(unsigned long pfn)
4345 { 4355 {
4346 unsigned long start_pfn, end_pfn; 4356 unsigned long start_pfn, end_pfn;
4347 int nid; 4357 int nid;
4348 /* 4358 /*
4349 * NOTE: The following SMP-unsafe globals are only used early in boot 4359 * NOTE: The following SMP-unsafe globals are only used early in boot
4350 * when the kernel is running single-threaded. 4360 * when the kernel is running single-threaded.
4351 */ 4361 */
4352 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4362 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4353 static int __meminitdata last_nid; 4363 static int __meminitdata last_nid;
4354 4364
4355 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4365 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4356 return last_nid; 4366 return last_nid;
4357 4367
4358 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4368 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4359 if (nid != -1) { 4369 if (nid != -1) {
4360 last_start_pfn = start_pfn; 4370 last_start_pfn = start_pfn;
4361 last_end_pfn = end_pfn; 4371 last_end_pfn = end_pfn;
4362 last_nid = nid; 4372 last_nid = nid;
4363 } 4373 }
4364 4374
4365 return nid; 4375 return nid;
4366 } 4376 }
4367 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4377 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4368 4378
4369 int __meminit early_pfn_to_nid(unsigned long pfn) 4379 int __meminit early_pfn_to_nid(unsigned long pfn)
4370 { 4380 {
4371 int nid; 4381 int nid;
4372 4382
4373 nid = __early_pfn_to_nid(pfn); 4383 nid = __early_pfn_to_nid(pfn);
4374 if (nid >= 0) 4384 if (nid >= 0)
4375 return nid; 4385 return nid;
4376 /* just returns 0 */ 4386 /* just returns 0 */
4377 return 0; 4387 return 0;
4378 } 4388 }
4379 4389
4380 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4390 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4381 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4391 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4382 { 4392 {
4383 int nid; 4393 int nid;
4384 4394
4385 nid = __early_pfn_to_nid(pfn); 4395 nid = __early_pfn_to_nid(pfn);
4386 if (nid >= 0 && nid != node) 4396 if (nid >= 0 && nid != node)
4387 return false; 4397 return false;
4388 return true; 4398 return true;
4389 } 4399 }
4390 #endif 4400 #endif
4391 4401
4392 /** 4402 /**
4393 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4403 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4394 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4404 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4395 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4405 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4396 * 4406 *
4397 * If an architecture guarantees that all ranges registered with 4407 * If an architecture guarantees that all ranges registered with
4398 * add_active_ranges() contain no holes and may be freed, 4408 * add_active_ranges() contain no holes and may be freed,
4399 * this function may be used instead of calling free_bootmem() manually. 4409 * this function may be used instead of calling free_bootmem() manually.
4400 */ 4410 */
4401 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4411 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4402 { 4412 {
4403 unsigned long start_pfn, end_pfn; 4413 unsigned long start_pfn, end_pfn;
4404 int i, this_nid; 4414 int i, this_nid;
4405 4415
4406 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4416 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4407 start_pfn = min(start_pfn, max_low_pfn); 4417 start_pfn = min(start_pfn, max_low_pfn);
4408 end_pfn = min(end_pfn, max_low_pfn); 4418 end_pfn = min(end_pfn, max_low_pfn);
4409 4419
4410 if (start_pfn < end_pfn) 4420 if (start_pfn < end_pfn)
4411 free_bootmem_node(NODE_DATA(this_nid), 4421 free_bootmem_node(NODE_DATA(this_nid),
4412 PFN_PHYS(start_pfn), 4422 PFN_PHYS(start_pfn),
4413 (end_pfn - start_pfn) << PAGE_SHIFT); 4423 (end_pfn - start_pfn) << PAGE_SHIFT);
4414 } 4424 }
4415 } 4425 }
4416 4426
4417 /** 4427 /**
4418 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4428 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4419 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4429 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4420 * 4430 *
4421 * If an architecture guarantees that all ranges registered with 4431 * If an architecture guarantees that all ranges registered with
4422 * add_active_ranges() contain no holes and may be freed, this 4432 * add_active_ranges() contain no holes and may be freed, this
4423 * function may be used instead of calling memory_present() manually. 4433 * function may be used instead of calling memory_present() manually.
4424 */ 4434 */
4425 void __init sparse_memory_present_with_active_regions(int nid) 4435 void __init sparse_memory_present_with_active_regions(int nid)
4426 { 4436 {
4427 unsigned long start_pfn, end_pfn; 4437 unsigned long start_pfn, end_pfn;
4428 int i, this_nid; 4438 int i, this_nid;
4429 4439
4430 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4440 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4431 memory_present(this_nid, start_pfn, end_pfn); 4441 memory_present(this_nid, start_pfn, end_pfn);
4432 } 4442 }
4433 4443
4434 /** 4444 /**
4435 * get_pfn_range_for_nid - Return the start and end page frames for a node 4445 * get_pfn_range_for_nid - Return the start and end page frames for a node
4436 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4446 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4437 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4447 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4438 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4448 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4439 * 4449 *
4440 * It returns the start and end page frame of a node based on information 4450 * It returns the start and end page frame of a node based on information
4441 * provided by an arch calling add_active_range(). If called for a node 4451 * provided by an arch calling add_active_range(). If called for a node
4442 * with no available memory, a warning is printed and the start and end 4452 * with no available memory, a warning is printed and the start and end
4443 * PFNs will be 0. 4453 * PFNs will be 0.
4444 */ 4454 */
4445 void __meminit get_pfn_range_for_nid(unsigned int nid, 4455 void __meminit get_pfn_range_for_nid(unsigned int nid,
4446 unsigned long *start_pfn, unsigned long *end_pfn) 4456 unsigned long *start_pfn, unsigned long *end_pfn)
4447 { 4457 {
4448 unsigned long this_start_pfn, this_end_pfn; 4458 unsigned long this_start_pfn, this_end_pfn;
4449 int i; 4459 int i;
4450 4460
4451 *start_pfn = -1UL; 4461 *start_pfn = -1UL;
4452 *end_pfn = 0; 4462 *end_pfn = 0;
4453 4463
4454 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4464 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4455 *start_pfn = min(*start_pfn, this_start_pfn); 4465 *start_pfn = min(*start_pfn, this_start_pfn);
4456 *end_pfn = max(*end_pfn, this_end_pfn); 4466 *end_pfn = max(*end_pfn, this_end_pfn);
4457 } 4467 }
4458 4468
4459 if (*start_pfn == -1UL) 4469 if (*start_pfn == -1UL)
4460 *start_pfn = 0; 4470 *start_pfn = 0;
4461 } 4471 }
4462 4472
4463 /* 4473 /*
4464 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4474 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4465 * assumption is made that zones within a node are ordered in monotonically 4475 * assumption is made that zones within a node are ordered in monotonically
4466 * increasing memory addresses so that the "highest" populated zone is used 4476 * increasing memory addresses so that the "highest" populated zone is used
4467 */ 4477 */
4468 static void __init find_usable_zone_for_movable(void) 4478 static void __init find_usable_zone_for_movable(void)
4469 { 4479 {
4470 int zone_index; 4480 int zone_index;
4471 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4481 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4472 if (zone_index == ZONE_MOVABLE) 4482 if (zone_index == ZONE_MOVABLE)
4473 continue; 4483 continue;
4474 4484
4475 if (arch_zone_highest_possible_pfn[zone_index] > 4485 if (arch_zone_highest_possible_pfn[zone_index] >
4476 arch_zone_lowest_possible_pfn[zone_index]) 4486 arch_zone_lowest_possible_pfn[zone_index])
4477 break; 4487 break;
4478 } 4488 }
4479 4489
4480 VM_BUG_ON(zone_index == -1); 4490 VM_BUG_ON(zone_index == -1);
4481 movable_zone = zone_index; 4491 movable_zone = zone_index;
4482 } 4492 }
4483 4493
4484 /* 4494 /*
4485 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4495 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4486 * because it is sized independent of architecture. Unlike the other zones, 4496 * because it is sized independent of architecture. Unlike the other zones,
4487 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4497 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4488 * in each node depending on the size of each node and how evenly kernelcore 4498 * in each node depending on the size of each node and how evenly kernelcore
4489 * is distributed. This helper function adjusts the zone ranges 4499 * is distributed. This helper function adjusts the zone ranges
4490 * provided by the architecture for a given node by using the end of the 4500 * provided by the architecture for a given node by using the end of the
4491 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4501 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4492 * zones within a node are in order of monotonically increasing memory addresses 4502 * zones within a node are in order of monotonically increasing memory addresses
4493 */ 4503 */
4494 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4504 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4495 unsigned long zone_type, 4505 unsigned long zone_type,
4496 unsigned long node_start_pfn, 4506 unsigned long node_start_pfn,
4497 unsigned long node_end_pfn, 4507 unsigned long node_end_pfn,
4498 unsigned long *zone_start_pfn, 4508 unsigned long *zone_start_pfn,
4499 unsigned long *zone_end_pfn) 4509 unsigned long *zone_end_pfn)
4500 { 4510 {
4501 /* Only adjust if ZONE_MOVABLE is on this node */ 4511 /* Only adjust if ZONE_MOVABLE is on this node */
4502 if (zone_movable_pfn[nid]) { 4512 if (zone_movable_pfn[nid]) {
4503 /* Size ZONE_MOVABLE */ 4513 /* Size ZONE_MOVABLE */
4504 if (zone_type == ZONE_MOVABLE) { 4514 if (zone_type == ZONE_MOVABLE) {
4505 *zone_start_pfn = zone_movable_pfn[nid]; 4515 *zone_start_pfn = zone_movable_pfn[nid];
4506 *zone_end_pfn = min(node_end_pfn, 4516 *zone_end_pfn = min(node_end_pfn,
4507 arch_zone_highest_possible_pfn[movable_zone]); 4517 arch_zone_highest_possible_pfn[movable_zone]);
4508 4518
4509 /* Adjust for ZONE_MOVABLE starting within this range */ 4519 /* Adjust for ZONE_MOVABLE starting within this range */
4510 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4520 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4511 *zone_end_pfn > zone_movable_pfn[nid]) { 4521 *zone_end_pfn > zone_movable_pfn[nid]) {
4512 *zone_end_pfn = zone_movable_pfn[nid]; 4522 *zone_end_pfn = zone_movable_pfn[nid];
4513 4523
4514 /* Check if this whole range is within ZONE_MOVABLE */ 4524 /* Check if this whole range is within ZONE_MOVABLE */
4515 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4525 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4516 *zone_start_pfn = *zone_end_pfn; 4526 *zone_start_pfn = *zone_end_pfn;
4517 } 4527 }
4518 } 4528 }
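
The three branches above carve ZONE_MOVABLE out of the top of a node: the movable zone itself runs from zone_movable_pfn[nid] up to the end of the highest usable zone, a kernel zone that straddles that boundary is truncated at it, and a kernel zone lying entirely above it collapses to empty. A stand-alone sketch of the same cases with the per-node and arch arrays replaced by plain parameters; every PFN in main() is made up:

#include <stdio.h>

/* same three cases as above, with the arrays replaced by parameters */
static void adjust_for_movable(unsigned long movable_start, int is_movable_zone,
                               unsigned long node_end_pfn,
                               unsigned long movable_zone_limit,
                               unsigned long *zone_start_pfn,
                               unsigned long *zone_end_pfn)
{
        if (!movable_start)
                return;                         /* no ZONE_MOVABLE on this node */

        if (is_movable_zone) {                  /* size ZONE_MOVABLE itself */
                *zone_start_pfn = movable_start;
                *zone_end_pfn = node_end_pfn < movable_zone_limit ?
                                node_end_pfn : movable_zone_limit;
        } else if (*zone_start_pfn < movable_start &&
                   *zone_end_pfn > movable_start) {
                *zone_end_pfn = movable_start;  /* truncate a straddling zone */
        } else if (*zone_start_pfn >= movable_start) {
                *zone_start_pfn = *zone_end_pfn; /* zone lies wholly in movable */
        }
}

int main(void)
{
        /* ZONE_NORMAL 0x100000-0x400000 on a node whose movable area begins
         * at 0x300000; the normal zone gets truncated at that boundary */
        unsigned long start = 0x100000, end = 0x400000;

        adjust_for_movable(0x300000, 0, 0x400000, 0x400000, &start, &end);
        printf("ZONE_NORMAL now spans pfn %#lx-%#lx\n", start, end);
        return 0;
}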
4519 4529
4520 /* 4530 /*
4521 * Return the number of pages a zone spans in a node, including holes 4531 * Return the number of pages a zone spans in a node, including holes
4522 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4532 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4523 */ 4533 */
4524 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4534 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4525 unsigned long zone_type, 4535 unsigned long zone_type,
4526 unsigned long node_start_pfn, 4536 unsigned long node_start_pfn,
4527 unsigned long node_end_pfn, 4537 unsigned long node_end_pfn,
4528 unsigned long *ignored) 4538 unsigned long *ignored)
4529 { 4539 {
4530 unsigned long zone_start_pfn, zone_end_pfn; 4540 unsigned long zone_start_pfn, zone_end_pfn;
4531 4541
4532 /* Get the start and end of the zone */ 4542 /* Get the start and end of the zone */
4533 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4543 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4534 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4544 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4535 adjust_zone_range_for_zone_movable(nid, zone_type, 4545 adjust_zone_range_for_zone_movable(nid, zone_type,
4536 node_start_pfn, node_end_pfn, 4546 node_start_pfn, node_end_pfn,
4537 &zone_start_pfn, &zone_end_pfn); 4547 &zone_start_pfn, &zone_end_pfn);
4538 4548
4539 /* Check that this node has pages within the zone's required range */ 4549 /* Check that this node has pages within the zone's required range */
4540 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4550 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4541 return 0; 4551 return 0;
4542 4552
4543 /* Move the zone boundaries inside the node if necessary */ 4553 /* Move the zone boundaries inside the node if necessary */
4544 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4554 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4545 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4555 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4546 4556
4547 /* Return the spanned pages */ 4557 /* Return the spanned pages */
4548 return zone_end_pfn - zone_start_pfn; 4558 return zone_end_pfn - zone_start_pfn;
4549 } 4559 }
4550 4560
4551 /* 4561 /*
4552 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4562 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4553 * then all holes in the requested range will be accounted for. 4563 * then all holes in the requested range will be accounted for.
4554 */ 4564 */
4555 unsigned long __meminit __absent_pages_in_range(int nid, 4565 unsigned long __meminit __absent_pages_in_range(int nid,
4556 unsigned long range_start_pfn, 4566 unsigned long range_start_pfn,
4557 unsigned long range_end_pfn) 4567 unsigned long range_end_pfn)
4558 { 4568 {
4559 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4569 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4560 unsigned long start_pfn, end_pfn; 4570 unsigned long start_pfn, end_pfn;
4561 int i; 4571 int i;
4562 4572
4563 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4573 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4564 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4574 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4565 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4575 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4566 nr_absent -= end_pfn - start_pfn; 4576 nr_absent -= end_pfn - start_pfn;
4567 } 4577 }
4568 return nr_absent; 4578 return nr_absent;
4569 } 4579 }
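
The hole count above starts from the full span and subtracts each registered memory range after clamping it into [range_start_pfn, range_end_pfn); whatever is left over is holes. A stand-alone sketch with two present ranges and a gap between them (the ranges are invented for the example):

#include <stdio.h>

#define CLAMP(v, lo, hi)  ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

struct pfn_range { unsigned long start, end; };

/* span minus the clamped present ranges == pages in holes */
static unsigned long absent_pages(const struct pfn_range *r, int n,
                                  unsigned long range_start, unsigned long range_end)
{
        unsigned long nr_absent = range_end - range_start;
        int i;

        for (i = 0; i < n; i++) {
                unsigned long s = CLAMP(r[i].start, range_start, range_end);
                unsigned long e = CLAMP(r[i].end, range_start, range_end);

                nr_absent -= e - s;
        }
        return nr_absent;
}

int main(void)
{
        /* present: 0x0-0x9f000 and 0x100000-0x200000; hole of 0x61000 pages */
        struct pfn_range present[] = { { 0x0, 0x9f000 }, { 0x100000, 0x200000 } };

        printf("absent pages: %#lx\n", absent_pages(present, 2, 0x0, 0x200000));
        return 0;
}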
4570 4580
4571 /** 4581 /**
4572 * absent_pages_in_range - Return number of page frames in holes within a range 4582 * absent_pages_in_range - Return number of page frames in holes within a range
4573 * @start_pfn: The start PFN to start searching for holes 4583 * @start_pfn: The start PFN to start searching for holes
4574 * @end_pfn: The end PFN to stop searching for holes 4584 * @end_pfn: The end PFN to stop searching for holes
4575 * 4585 *
4576 * It returns the number of page frames in memory holes within a range. 4586 * It returns the number of page frames in memory holes within a range.
4577 */ 4587 */
4578 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4588 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4579 unsigned long end_pfn) 4589 unsigned long end_pfn)
4580 { 4590 {
4581 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4591 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4582 } 4592 }
4583 4593
4584 /* Return the number of page frames in holes in a zone on a node */ 4594 /* Return the number of page frames in holes in a zone on a node */
4585 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4595 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4586 unsigned long zone_type, 4596 unsigned long zone_type,
4587 unsigned long node_start_pfn, 4597 unsigned long node_start_pfn,
4588 unsigned long node_end_pfn, 4598 unsigned long node_end_pfn,
4589 unsigned long *ignored) 4599 unsigned long *ignored)
4590 { 4600 {
4591 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4601 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4592 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4602 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4593 unsigned long zone_start_pfn, zone_end_pfn; 4603 unsigned long zone_start_pfn, zone_end_pfn;
4594 4604
4595 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4605 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4596 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4606 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4597 4607
4598 adjust_zone_range_for_zone_movable(nid, zone_type, 4608 adjust_zone_range_for_zone_movable(nid, zone_type,
4599 node_start_pfn, node_end_pfn, 4609 node_start_pfn, node_end_pfn,
4600 &zone_start_pfn, &zone_end_pfn); 4610 &zone_start_pfn, &zone_end_pfn);
4601 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4611 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4602 } 4612 }
4603 4613
4604 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4614 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4605 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4615 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4606 unsigned long zone_type, 4616 unsigned long zone_type,
4607 unsigned long node_start_pfn, 4617 unsigned long node_start_pfn,
4608 unsigned long node_end_pfn, 4618 unsigned long node_end_pfn,
4609 unsigned long *zones_size) 4619 unsigned long *zones_size)
4610 { 4620 {
4611 return zones_size[zone_type]; 4621 return zones_size[zone_type];
4612 } 4622 }
4613 4623
4614 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4624 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4615 unsigned long zone_type, 4625 unsigned long zone_type,
4616 unsigned long node_start_pfn, 4626 unsigned long node_start_pfn,
4617 unsigned long node_end_pfn, 4627 unsigned long node_end_pfn,
4618 unsigned long *zholes_size) 4628 unsigned long *zholes_size)
4619 { 4629 {
4620 if (!zholes_size) 4630 if (!zholes_size)
4621 return 0; 4631 return 0;
4622 4632
4623 return zholes_size[zone_type]; 4633 return zholes_size[zone_type];
4624 } 4634 }
4625 4635
4626 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4636 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4627 4637
4628 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4638 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4629 unsigned long node_start_pfn, 4639 unsigned long node_start_pfn,
4630 unsigned long node_end_pfn, 4640 unsigned long node_end_pfn,
4631 unsigned long *zones_size, 4641 unsigned long *zones_size,
4632 unsigned long *zholes_size) 4642 unsigned long *zholes_size)
4633 { 4643 {
4634 unsigned long realtotalpages, totalpages = 0; 4644 unsigned long realtotalpages, totalpages = 0;
4635 enum zone_type i; 4645 enum zone_type i;
4636 4646
4637 for (i = 0; i < MAX_NR_ZONES; i++) 4647 for (i = 0; i < MAX_NR_ZONES; i++)
4638 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4648 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4639 node_start_pfn, 4649 node_start_pfn,
4640 node_end_pfn, 4650 node_end_pfn,
4641 zones_size); 4651 zones_size);
4642 pgdat->node_spanned_pages = totalpages; 4652 pgdat->node_spanned_pages = totalpages;
4643 4653
4644 realtotalpages = totalpages; 4654 realtotalpages = totalpages;
4645 for (i = 0; i < MAX_NR_ZONES; i++) 4655 for (i = 0; i < MAX_NR_ZONES; i++)
4646 realtotalpages -= 4656 realtotalpages -=
4647 zone_absent_pages_in_node(pgdat->node_id, i, 4657 zone_absent_pages_in_node(pgdat->node_id, i,
4648 node_start_pfn, node_end_pfn, 4658 node_start_pfn, node_end_pfn,
4649 zholes_size); 4659 zholes_size);
4650 pgdat->node_present_pages = realtotalpages; 4660 pgdat->node_present_pages = realtotalpages;
4651 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4661 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4652 realtotalpages); 4662 realtotalpages);
4653 } 4663 }
4654 4664
4655 #ifndef CONFIG_SPARSEMEM 4665 #ifndef CONFIG_SPARSEMEM
4656 /* 4666 /*
4657 * Calculate the size of the zone->blockflags rounded to an unsigned long 4667 * Calculate the size of the zone->blockflags rounded to an unsigned long
4658 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4668 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4659 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4669 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4660 * round what is now in bits to nearest long in bits, then return it in 4670 * round what is now in bits to nearest long in bits, then return it in
4661 * bytes. 4671 * bytes.
4662 */ 4672 */
4663 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4673 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4664 { 4674 {
4665 unsigned long usemapsize; 4675 unsigned long usemapsize;
4666 4676
4667 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4677 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4668 usemapsize = roundup(zonesize, pageblock_nr_pages); 4678 usemapsize = roundup(zonesize, pageblock_nr_pages);
4669 usemapsize = usemapsize >> pageblock_order; 4679 usemapsize = usemapsize >> pageblock_order;
4670 usemapsize *= NR_PAGEBLOCK_BITS; 4680 usemapsize *= NR_PAGEBLOCK_BITS;
4671 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4681 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4672 4682
4673 return usemapsize / 8; 4683 return usemapsize / 8;
4674 } 4684 }
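
The comment spells out the arithmetic: pad the zone out to whole pageblocks, allow NR_PAGEBLOCK_BITS per pageblock, round the bit count up to a whole unsigned long and convert to bytes. A stand-alone worked example, assuming 4KiB pages, pageblock_order == 9 (512-page pageblocks) and NR_PAGEBLOCK_BITS == 4; these are typical values, not taken from this file:

#include <stdio.h>

#define PAGEBLOCK_ORDER         9                       /* assumed */
#define PAGEBLOCK_NR_PAGES      (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS       4                       /* assumed */
#define ROUNDUP(x, y)           ((((x) + (y) - 1) / (y)) * (y))

/* same steps as usemap_size() above */
static unsigned long usemap_bytes(unsigned long zone_start_pfn, unsigned long zonesize)
{
        unsigned long bits;

        zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);  /* unaligned start */
        bits = ROUNDUP(zonesize, PAGEBLOCK_NR_PAGES);           /* whole pageblocks */
        bits >>= PAGEBLOCK_ORDER;                               /* -> pageblock count */
        bits *= NR_PAGEBLOCK_BITS;                              /* -> bits */
        bits = ROUNDUP(bits, 8 * sizeof(unsigned long));        /* -> whole longs */

        return bits / 8;                                        /* -> bytes */
}

int main(void)
{
        /* a 1GiB zone (0x40000 4KiB pages) starting on a pageblock boundary */
        printf("usemap: %lu bytes\n", usemap_bytes(0x100000, 0x40000));
        return 0;
}

Under those assumptions a 1GiB zone needs 512 pageblocks * 4 bits = 2048 bits, i.e. 256 bytes of pageblock_flags.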
4675 4685
4676 static void __init setup_usemap(struct pglist_data *pgdat, 4686 static void __init setup_usemap(struct pglist_data *pgdat,
4677 struct zone *zone, 4687 struct zone *zone,
4678 unsigned long zone_start_pfn, 4688 unsigned long zone_start_pfn,
4679 unsigned long zonesize) 4689 unsigned long zonesize)
4680 { 4690 {
4681 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4691 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4682 zone->pageblock_flags = NULL; 4692 zone->pageblock_flags = NULL;
4683 if (usemapsize) 4693 if (usemapsize)
4684 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4694 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4685 usemapsize); 4695 usemapsize);
4686 } 4696 }
4687 #else 4697 #else
4688 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4698 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4689 unsigned long zone_start_pfn, unsigned long zonesize) {} 4699 unsigned long zone_start_pfn, unsigned long zonesize) {}
4690 #endif /* CONFIG_SPARSEMEM */ 4700 #endif /* CONFIG_SPARSEMEM */
4691 4701
4692 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4702 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4693 4703
4694 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4704 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4695 void __paginginit set_pageblock_order(void) 4705 void __paginginit set_pageblock_order(void)
4696 { 4706 {
4697 unsigned int order; 4707 unsigned int order;
4698 4708
4699 /* Check that pageblock_nr_pages has not already been setup */ 4709 /* Check that pageblock_nr_pages has not already been setup */
4700 if (pageblock_order) 4710 if (pageblock_order)
4701 return; 4711 return;
4702 4712
4703 if (HPAGE_SHIFT > PAGE_SHIFT) 4713 if (HPAGE_SHIFT > PAGE_SHIFT)
4704 order = HUGETLB_PAGE_ORDER; 4714 order = HUGETLB_PAGE_ORDER;
4705 else 4715 else
4706 order = MAX_ORDER - 1; 4716 order = MAX_ORDER - 1;
4707 4717
4708 /* 4718 /*
4709 * Assume the largest contiguous order of interest is a huge page. 4719 * Assume the largest contiguous order of interest is a huge page.
4710 * This value may be variable depending on boot parameters on IA64 and 4720 * This value may be variable depending on boot parameters on IA64 and
4711 * powerpc. 4721 * powerpc.
4712 */ 4722 */
4713 pageblock_order = order; 4723 pageblock_order = order;
4714 } 4724 }
4715 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4725 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4716 4726
4717 /* 4727 /*
4718 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4728 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4719 * is unused as pageblock_order is set at compile-time. See 4729 * is unused as pageblock_order is set at compile-time. See
4720 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4730 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4721 * the kernel config 4731 * the kernel config
4722 */ 4732 */
4723 void __paginginit set_pageblock_order(void) 4733 void __paginginit set_pageblock_order(void)
4724 { 4734 {
4725 } 4735 }
4726 4736
4727 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4737 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4728 4738
4729 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4739 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4730 unsigned long present_pages) 4740 unsigned long present_pages)
4731 { 4741 {
4732 unsigned long pages = spanned_pages; 4742 unsigned long pages = spanned_pages;
4733 4743
4734 /* 4744 /*
4735 * Provide a more accurate estimation if there are holes within 4745 * Provide a more accurate estimation if there are holes within
4736 * the zone and SPARSEMEM is in use. If there are holes within the 4746 * the zone and SPARSEMEM is in use. If there are holes within the
4737 * zone, each populated memory region may cost us one or two extra 4747 * zone, each populated memory region may cost us one or two extra
4738 * memmap pages due to alignment because memmap pages for each 4748 * memmap pages due to alignment because memmap pages for each
4739 * populated region may not be naturally aligned on page boundaries. 4749 * populated region may not be naturally aligned on page boundaries.
4740 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4750 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4741 */ 4751 */
4742 if (spanned_pages > present_pages + (present_pages >> 4) && 4752 if (spanned_pages > present_pages + (present_pages >> 4) &&
4743 IS_ENABLED(CONFIG_SPARSEMEM)) 4753 IS_ENABLED(CONFIG_SPARSEMEM))
4744 pages = present_pages; 4754 pages = present_pages;
4745 4755
4746 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4756 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4747 } 4757 }
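
With SPARSEMEM, a zone that is mostly holes would otherwise be charged for memmap it never allocates, so once the holes exceed roughly 1/16th of present memory the estimate falls back to present_pages. A stand-alone worked example; sizeof(struct page) is assumed to be 64 bytes and the zone sizes are invented:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define STRUCT_PAGE_SZ  64UL            /* assumed sizeof(struct page) */

/* same heuristic as calc_memmap_size(), SPARSEMEM assumed enabled */
static unsigned long memmap_pages(unsigned long spanned, unsigned long present)
{
        unsigned long pages = spanned;

        if (spanned > present + (present >> 4))
                pages = present;        /* holes > ~1/16th: charge only present pages */

        return (pages * STRUCT_PAGE_SZ + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
        /* 4GiB spanned, 1GiB of it holes: memmap is charged for 3GiB only */
        unsigned long spanned = 0x100000, present = 0xC0000;

        printf("memmap: %lu pages\n", memmap_pages(spanned, present));
        return 0;
}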
4748 4758
4749 /* 4759 /*
4750 * Set up the zone data structures: 4760 * Set up the zone data structures:
4751 * - mark all pages reserved 4761 * - mark all pages reserved
4752 * - mark all memory queues empty 4762 * - mark all memory queues empty
4753 * - clear the memory bitmaps 4763 * - clear the memory bitmaps
4754 * 4764 *
4755 * NOTE: pgdat should get zeroed by caller. 4765 * NOTE: pgdat should get zeroed by caller.
4756 */ 4766 */
4757 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4767 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4758 unsigned long node_start_pfn, unsigned long node_end_pfn, 4768 unsigned long node_start_pfn, unsigned long node_end_pfn,
4759 unsigned long *zones_size, unsigned long *zholes_size) 4769 unsigned long *zones_size, unsigned long *zholes_size)
4760 { 4770 {
4761 enum zone_type j; 4771 enum zone_type j;
4762 int nid = pgdat->node_id; 4772 int nid = pgdat->node_id;
4763 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4773 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4764 int ret; 4774 int ret;
4765 4775
4766 pgdat_resize_init(pgdat); 4776 pgdat_resize_init(pgdat);
4767 #ifdef CONFIG_NUMA_BALANCING 4777 #ifdef CONFIG_NUMA_BALANCING
4768 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4778 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4769 pgdat->numabalancing_migrate_nr_pages = 0; 4779 pgdat->numabalancing_migrate_nr_pages = 0;
4770 pgdat->numabalancing_migrate_next_window = jiffies; 4780 pgdat->numabalancing_migrate_next_window = jiffies;
4771 #endif 4781 #endif
4772 init_waitqueue_head(&pgdat->kswapd_wait); 4782 init_waitqueue_head(&pgdat->kswapd_wait);
4773 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4783 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4774 pgdat_page_cgroup_init(pgdat); 4784 pgdat_page_cgroup_init(pgdat);
4775 4785
4776 for (j = 0; j < MAX_NR_ZONES; j++) { 4786 for (j = 0; j < MAX_NR_ZONES; j++) {
4777 struct zone *zone = pgdat->node_zones + j; 4787 struct zone *zone = pgdat->node_zones + j;
4778 unsigned long size, realsize, freesize, memmap_pages; 4788 unsigned long size, realsize, freesize, memmap_pages;
4779 4789
4780 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4790 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4781 node_end_pfn, zones_size); 4791 node_end_pfn, zones_size);
4782 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4792 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4783 node_start_pfn, 4793 node_start_pfn,
4784 node_end_pfn, 4794 node_end_pfn,
4785 zholes_size); 4795 zholes_size);
4786 4796
4787 /* 4797 /*
4788 * Adjust freesize so that it accounts for how much memory 4798 * Adjust freesize so that it accounts for how much memory
4789 * is used by this zone for memmap. This affects the watermark 4799 * is used by this zone for memmap. This affects the watermark
4790 * and per-cpu initialisations 4800 * and per-cpu initialisations
4791 */ 4801 */
4792 memmap_pages = calc_memmap_size(size, realsize); 4802 memmap_pages = calc_memmap_size(size, realsize);
4793 if (freesize >= memmap_pages) { 4803 if (freesize >= memmap_pages) {
4794 freesize -= memmap_pages; 4804 freesize -= memmap_pages;
4795 if (memmap_pages) 4805 if (memmap_pages)
4796 printk(KERN_DEBUG 4806 printk(KERN_DEBUG
4797 " %s zone: %lu pages used for memmap\n", 4807 " %s zone: %lu pages used for memmap\n",
4798 zone_names[j], memmap_pages); 4808 zone_names[j], memmap_pages);
4799 } else 4809 } else
4800 printk(KERN_WARNING 4810 printk(KERN_WARNING
4801 " %s zone: %lu pages exceeds freesize %lu\n", 4811 " %s zone: %lu pages exceeds freesize %lu\n",
4802 zone_names[j], memmap_pages, freesize); 4812 zone_names[j], memmap_pages, freesize);
4803 4813
4804 /* Account for reserved pages */ 4814 /* Account for reserved pages */
4805 if (j == 0 && freesize > dma_reserve) { 4815 if (j == 0 && freesize > dma_reserve) {
4806 freesize -= dma_reserve; 4816 freesize -= dma_reserve;
4807 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4817 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4808 zone_names[0], dma_reserve); 4818 zone_names[0], dma_reserve);
4809 } 4819 }
4810 4820
4811 if (!is_highmem_idx(j)) 4821 if (!is_highmem_idx(j))
4812 nr_kernel_pages += freesize; 4822 nr_kernel_pages += freesize;
4813 /* Charge for highmem memmap if there are enough kernel pages */ 4823 /* Charge for highmem memmap if there are enough kernel pages */
4814 else if (nr_kernel_pages > memmap_pages * 2) 4824 else if (nr_kernel_pages > memmap_pages * 2)
4815 nr_kernel_pages -= memmap_pages; 4825 nr_kernel_pages -= memmap_pages;
4816 nr_all_pages += freesize; 4826 nr_all_pages += freesize;
4817 4827
4818 zone->spanned_pages = size; 4828 zone->spanned_pages = size;
4819 zone->present_pages = realsize; 4829 zone->present_pages = realsize;
4820 /* 4830 /*
4821 * Set an approximate value for lowmem here, it will be adjusted 4831 * Set an approximate value for lowmem here, it will be adjusted
4822 * when the bootmem allocator frees pages into the buddy system. 4832 * when the bootmem allocator frees pages into the buddy system.
4823 * And all highmem pages will be managed by the buddy system. 4833 * And all highmem pages will be managed by the buddy system.
4824 */ 4834 */
4825 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4835 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4826 #ifdef CONFIG_NUMA 4836 #ifdef CONFIG_NUMA
4827 zone->node = nid; 4837 zone->node = nid;
4828 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4838 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4829 / 100; 4839 / 100;
4830 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4840 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4831 #endif 4841 #endif
4832 zone->name = zone_names[j]; 4842 zone->name = zone_names[j];
4833 spin_lock_init(&zone->lock); 4843 spin_lock_init(&zone->lock);
4834 spin_lock_init(&zone->lru_lock); 4844 spin_lock_init(&zone->lru_lock);
4835 zone_seqlock_init(zone); 4845 zone_seqlock_init(zone);
4836 zone->zone_pgdat = pgdat; 4846 zone->zone_pgdat = pgdat;
4837 zone_pcp_init(zone); 4847 zone_pcp_init(zone);
4838 4848
4839 /* For bootup, initialized properly in watermark setup */ 4849 /* For bootup, initialized properly in watermark setup */
4840 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4850 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4841 4851
4842 lruvec_init(&zone->lruvec); 4852 lruvec_init(&zone->lruvec);
4843 if (!size) 4853 if (!size)
4844 continue; 4854 continue;
4845 4855
4846 set_pageblock_order(); 4856 set_pageblock_order();
4847 setup_usemap(pgdat, zone, zone_start_pfn, size); 4857 setup_usemap(pgdat, zone, zone_start_pfn, size);
4848 ret = init_currently_empty_zone(zone, zone_start_pfn, 4858 ret = init_currently_empty_zone(zone, zone_start_pfn,
4849 size, MEMMAP_EARLY); 4859 size, MEMMAP_EARLY);
4850 BUG_ON(ret); 4860 BUG_ON(ret);
4851 memmap_init(size, nid, j, zone_start_pfn); 4861 memmap_init(size, nid, j, zone_start_pfn);
4852 zone_start_pfn += size; 4862 zone_start_pfn += size;
4853 } 4863 }
4854 } 4864 }
4855 4865
4856 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4866 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4857 { 4867 {
4858 /* Skip empty nodes */ 4868 /* Skip empty nodes */
4859 if (!pgdat->node_spanned_pages) 4869 if (!pgdat->node_spanned_pages)
4860 return; 4870 return;
4861 4871
4862 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4872 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4863 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4873 /* ia64 gets its own node_mem_map, before this, without bootmem */
4864 if (!pgdat->node_mem_map) { 4874 if (!pgdat->node_mem_map) {
4865 unsigned long size, start, end; 4875 unsigned long size, start, end;
4866 struct page *map; 4876 struct page *map;
4867 4877
4868 /* 4878 /*
4869 * The zone's endpoints aren't required to be MAX_ORDER 4879 * The zone's endpoints aren't required to be MAX_ORDER
4870 * aligned but the node_mem_map endpoints must be in order 4880 * aligned but the node_mem_map endpoints must be in order
4871 * for the buddy allocator to function correctly. 4881 * for the buddy allocator to function correctly.
4872 */ 4882 */
4873 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4883 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4874 end = pgdat_end_pfn(pgdat); 4884 end = pgdat_end_pfn(pgdat);
4875 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4885 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4876 size = (end - start) * sizeof(struct page); 4886 size = (end - start) * sizeof(struct page);
4877 map = alloc_remap(pgdat->node_id, size); 4887 map = alloc_remap(pgdat->node_id, size);
4878 if (!map) 4888 if (!map)
4879 map = alloc_bootmem_node_nopanic(pgdat, size); 4889 map = alloc_bootmem_node_nopanic(pgdat, size);
4880 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4890 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4881 } 4891 }
4882 #ifndef CONFIG_NEED_MULTIPLE_NODES 4892 #ifndef CONFIG_NEED_MULTIPLE_NODES
4883 /* 4893 /*
4884 * With no DISCONTIG, the global mem_map is just set as node 0's 4894 * With no DISCONTIG, the global mem_map is just set as node 0's
4885 */ 4895 */
4886 if (pgdat == NODE_DATA(0)) { 4896 if (pgdat == NODE_DATA(0)) {
4887 mem_map = NODE_DATA(0)->node_mem_map; 4897 mem_map = NODE_DATA(0)->node_mem_map;
4888 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4898 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4889 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4899 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4890 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4900 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4891 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4901 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4892 } 4902 }
4893 #endif 4903 #endif
4894 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4904 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4895 } 4905 }
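
node_mem_map has to cover whole MAX_ORDER blocks even when the node starts or ends mid-block, so the start is rounded down, the end rounded up, and node_mem_map is then pointed at the entry for the node's real first pfn. A stand-alone worked example; MAX_ORDER_NR_PAGES == 1024 and a 64-byte struct page are assumed, and the node span is made up:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES      (1UL << 10)     /* assumed MAX_ORDER - 1 == 10 */
#define STRUCT_PAGE_SZ          64UL            /* assumed sizeof(struct page) */
#define ALIGN_UP(x, a)          (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        /* made-up node: pfns 0x10234..0x20000 */
        unsigned long node_start_pfn = 0x10234, node_end_pfn = 0x20000;
        unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
        unsigned long end = ALIGN_UP(node_end_pfn, MAX_ORDER_NR_PAGES);
        unsigned long size = (end - start) * STRUCT_PAGE_SZ;

        printf("map covers pfn %#lx-%#lx, %lu bytes; node_mem_map offset %lu entries\n",
               start, end, size, node_start_pfn - start);
        return 0;
}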
4896 4906
4897 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4907 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4898 unsigned long node_start_pfn, unsigned long *zholes_size) 4908 unsigned long node_start_pfn, unsigned long *zholes_size)
4899 { 4909 {
4900 pg_data_t *pgdat = NODE_DATA(nid); 4910 pg_data_t *pgdat = NODE_DATA(nid);
4901 unsigned long start_pfn = 0; 4911 unsigned long start_pfn = 0;
4902 unsigned long end_pfn = 0; 4912 unsigned long end_pfn = 0;
4903 4913
4904 /* pg_data_t should be reset to zero when it's allocated */ 4914 /* pg_data_t should be reset to zero when it's allocated */
4905 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4915 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4906 4916
4907 pgdat->node_id = nid; 4917 pgdat->node_id = nid;
4908 pgdat->node_start_pfn = node_start_pfn; 4918 pgdat->node_start_pfn = node_start_pfn;
4909 if (node_state(nid, N_MEMORY)) 4919 if (node_state(nid, N_MEMORY))
4910 init_zone_allows_reclaim(nid); 4920 init_zone_allows_reclaim(nid);
4911 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4921 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4912 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4922 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4913 #endif 4923 #endif
4914 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4924 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4915 zones_size, zholes_size); 4925 zones_size, zholes_size);
4916 4926
4917 alloc_node_mem_map(pgdat); 4927 alloc_node_mem_map(pgdat);
4918 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4928 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4919 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4929 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4920 nid, (unsigned long)pgdat, 4930 nid, (unsigned long)pgdat,
4921 (unsigned long)pgdat->node_mem_map); 4931 (unsigned long)pgdat->node_mem_map);
4922 #endif 4932 #endif
4923 4933
4924 free_area_init_core(pgdat, start_pfn, end_pfn, 4934 free_area_init_core(pgdat, start_pfn, end_pfn,
4925 zones_size, zholes_size); 4935 zones_size, zholes_size);
4926 } 4936 }
4927 4937
4928 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4938 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4929 4939
4930 #if MAX_NUMNODES > 1 4940 #if MAX_NUMNODES > 1
4931 /* 4941 /*
4932 * Figure out the number of possible node ids. 4942 * Figure out the number of possible node ids.
4933 */ 4943 */
4934 void __init setup_nr_node_ids(void) 4944 void __init setup_nr_node_ids(void)
4935 { 4945 {
4936 unsigned int node; 4946 unsigned int node;
4937 unsigned int highest = 0; 4947 unsigned int highest = 0;
4938 4948
4939 for_each_node_mask(node, node_possible_map) 4949 for_each_node_mask(node, node_possible_map)
4940 highest = node; 4950 highest = node;
4941 nr_node_ids = highest + 1; 4951 nr_node_ids = highest + 1;
4942 } 4952 }
4943 #endif 4953 #endif
4944 4954
4945 /** 4955 /**
4946 * node_map_pfn_alignment - determine the maximum internode alignment 4956 * node_map_pfn_alignment - determine the maximum internode alignment
4947 * 4957 *
4948 * This function should be called after node map is populated and sorted. 4958 * This function should be called after node map is populated and sorted.
4949 * It calculates the maximum power of two alignment which can distinguish 4959 * It calculates the maximum power of two alignment which can distinguish
4950 * all the nodes. 4960 * all the nodes.
4951 * 4961 *
4952 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4962 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4953 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4963 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4954 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4964 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4955 * shifted, 1GiB is enough and this function will indicate so. 4965 * shifted, 1GiB is enough and this function will indicate so.
4956 * 4966 *
4957 * This is used to test whether pfn -> nid mapping of the chosen memory 4967 * This is used to test whether pfn -> nid mapping of the chosen memory
4958 * model has fine enough granularity to avoid incorrect mapping for the 4968 * model has fine enough granularity to avoid incorrect mapping for the
4959 * populated node map. 4969 * populated node map.
4960 * 4970 *
4961 * Returns the determined alignment in pfn's. 0 if there is no alignment 4971 * Returns the determined alignment in pfn's. 0 if there is no alignment
4962 * requirement (single node). 4972 * requirement (single node).
4963 */ 4973 */
4964 unsigned long __init node_map_pfn_alignment(void) 4974 unsigned long __init node_map_pfn_alignment(void)
4965 { 4975 {
4966 unsigned long accl_mask = 0, last_end = 0; 4976 unsigned long accl_mask = 0, last_end = 0;
4967 unsigned long start, end, mask; 4977 unsigned long start, end, mask;
4968 int last_nid = -1; 4978 int last_nid = -1;
4969 int i, nid; 4979 int i, nid;
4970 4980
4971 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4981 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4972 if (!start || last_nid < 0 || last_nid == nid) { 4982 if (!start || last_nid < 0 || last_nid == nid) {
4973 last_nid = nid; 4983 last_nid = nid;
4974 last_end = end; 4984 last_end = end;
4975 continue; 4985 continue;
4976 } 4986 }
4977 4987
4978 /* 4988 /*
4979 * Start with a mask granular enough to pin-point to the 4989 * Start with a mask granular enough to pin-point to the
4980 * start pfn and tick off bits one-by-one until it becomes 4990 * start pfn and tick off bits one-by-one until it becomes
4981 * too coarse to separate the current node from the last. 4991 * too coarse to separate the current node from the last.
4982 */ 4992 */
4983 mask = ~((1 << __ffs(start)) - 1); 4993 mask = ~((1 << __ffs(start)) - 1);
4984 while (mask && last_end <= (start & (mask << 1))) 4994 while (mask && last_end <= (start & (mask << 1)))
4985 mask <<= 1; 4995 mask <<= 1;
4986 4996
4987 /* accumulate all internode masks */ 4997 /* accumulate all internode masks */
4988 accl_mask |= mask; 4998 accl_mask |= mask;
4989 } 4999 }
4990 5000
4991 /* convert mask to number of pages */ 5001 /* convert mask to number of pages */
4992 return ~accl_mask + 1; 5002 return ~accl_mask + 1;
4993 } 5003 }
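
A stand-alone sketch of the same walk, with userspace stand-ins for for_each_mem_pfn_range() and __ffs() (and 1UL used for the shift); the node layout in main() is invented. With 4KiB pages, two 1GiB nodes aligned to 1GiB give an alignment of 0x40000 pfns, i.e. 1GiB, matching the example in the comment above:

#include <stdio.h>

struct pfn_range { unsigned long start, end; int nid; };

/* mirrors node_map_pfn_alignment() above */
static unsigned long pfn_alignment(const struct pfn_range *r, int n)
{
        unsigned long accl_mask = 0, last_end = 0, mask;
        int last_nid = -1, i;

        for (i = 0; i < n; i++) {
                unsigned long start = r[i].start;

                if (!start || last_nid < 0 || last_nid == r[i].nid) {
                        last_nid = r[i].nid;
                        last_end = r[i].end;
                        continue;
                }

                /* finest mask that pin-points start, then coarsen while it
                 * still separates this node from the previous one */
                mask = ~((1UL << __builtin_ctzl(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;

                accl_mask |= mask;
        }

        return ~accl_mask + 1;          /* mask -> number of pages */
}

int main(void)
{
        /* two 1GiB nodes, 1GiB aligned (4KiB pages: 1GiB == 0x40000 pfns) */
        struct pfn_range map[] = {
                { 0x00000, 0x40000, 0 },
                { 0x40000, 0x80000, 1 },
        };

        printf("internode alignment: %#lx pfns\n", pfn_alignment(map, 2));
        return 0;
}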
4994 5004
4995 /* Find the lowest pfn for a node */ 5005 /* Find the lowest pfn for a node */
4996 static unsigned long __init find_min_pfn_for_node(int nid) 5006 static unsigned long __init find_min_pfn_for_node(int nid)
4997 { 5007 {
4998 unsigned long min_pfn = ULONG_MAX; 5008 unsigned long min_pfn = ULONG_MAX;
4999 unsigned long start_pfn; 5009 unsigned long start_pfn;
5000 int i; 5010 int i;
5001 5011
5002 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 5012 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
5003 min_pfn = min(min_pfn, start_pfn); 5013 min_pfn = min(min_pfn, start_pfn);
5004 5014
5005 if (min_pfn == ULONG_MAX) { 5015 if (min_pfn == ULONG_MAX) {
5006 printk(KERN_WARNING 5016 printk(KERN_WARNING
5007 "Could not find start_pfn for node %d\n", nid); 5017 "Could not find start_pfn for node %d\n", nid);
5008 return 0; 5018 return 0;
5009 } 5019 }
5010 5020
5011 return min_pfn; 5021 return min_pfn;
5012 } 5022 }
5013 5023
5014 /** 5024 /**
5015 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5025 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5016 * 5026 *
5017 * It returns the minimum PFN based on information provided via 5027 * It returns the minimum PFN based on information provided via
5018 * add_active_range(). 5028 * add_active_range().
5019 */ 5029 */
5020 unsigned long __init find_min_pfn_with_active_regions(void) 5030 unsigned long __init find_min_pfn_with_active_regions(void)
5021 { 5031 {
5022 return find_min_pfn_for_node(MAX_NUMNODES); 5032 return find_min_pfn_for_node(MAX_NUMNODES);
5023 } 5033 }
5024 5034
5025 /* 5035 /*
5026 * early_calculate_totalpages() 5036 * early_calculate_totalpages()
5027 * Sum pages in active regions for movable zone. 5037 * Sum pages in active regions for movable zone.
5028 * Populate N_MEMORY for calculating usable_nodes. 5038 * Populate N_MEMORY for calculating usable_nodes.
5029 */ 5039 */
5030 static unsigned long __init early_calculate_totalpages(void) 5040 static unsigned long __init early_calculate_totalpages(void)
5031 { 5041 {
5032 unsigned long totalpages = 0; 5042 unsigned long totalpages = 0;
5033 unsigned long start_pfn, end_pfn; 5043 unsigned long start_pfn, end_pfn;
5034 int i, nid; 5044 int i, nid;
5035 5045
5036 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5046 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5037 unsigned long pages = end_pfn - start_pfn; 5047 unsigned long pages = end_pfn - start_pfn;
5038 5048
5039 totalpages += pages; 5049 totalpages += pages;
5040 if (pages) 5050 if (pages)
5041 node_set_state(nid, N_MEMORY); 5051 node_set_state(nid, N_MEMORY);
5042 } 5052 }
5043 return totalpages; 5053 return totalpages;
5044 } 5054 }
5045 5055
5046 /* 5056 /*
5047 * Find the PFN the Movable zone begins in each node. Kernel memory 5057 * Find the PFN the Movable zone begins in each node. Kernel memory
5048 * is spread evenly between nodes as long as the nodes have enough 5058 * is spread evenly between nodes as long as the nodes have enough
5049 * memory. When they don't, some nodes will have more kernelcore than 5059 * memory. When they don't, some nodes will have more kernelcore than
5050 * others 5060 * others
5051 */ 5061 */
5052 static void __init find_zone_movable_pfns_for_nodes(void) 5062 static void __init find_zone_movable_pfns_for_nodes(void)
5053 { 5063 {
5054 int i, nid; 5064 int i, nid;
5055 unsigned long usable_startpfn; 5065 unsigned long usable_startpfn;
5056 unsigned long kernelcore_node, kernelcore_remaining; 5066 unsigned long kernelcore_node, kernelcore_remaining;
5057 /* save the state before borrow the nodemask */ 5067 /* save the state before borrow the nodemask */
5058 nodemask_t saved_node_state = node_states[N_MEMORY]; 5068 nodemask_t saved_node_state = node_states[N_MEMORY];
5059 unsigned long totalpages = early_calculate_totalpages(); 5069 unsigned long totalpages = early_calculate_totalpages();
5060 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5070 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5061 5071
5062 /* 5072 /*
5063 * If movablecore was specified, calculate what size of 5073 * If movablecore was specified, calculate what size of
5064 * kernelcore that corresponds so that memory usable for 5074 * kernelcore that corresponds so that memory usable for
5065 * any allocation type is evenly spread. If both kernelcore 5075 * any allocation type is evenly spread. If both kernelcore
5066 * and movablecore are specified, then the value of kernelcore 5076 * and movablecore are specified, then the value of kernelcore
5067 * will be used for required_kernelcore if it's greater than 5077 * will be used for required_kernelcore if it's greater than
5068 * what movablecore would have allowed. 5078 * what movablecore would have allowed.
5069 */ 5079 */
5070 if (required_movablecore) { 5080 if (required_movablecore) {
5071 unsigned long corepages; 5081 unsigned long corepages;
5072 5082
5073 /* 5083 /*
5074 * Round-up so that ZONE_MOVABLE is at least as large as what 5084 * Round-up so that ZONE_MOVABLE is at least as large as what
5075 * was requested by the user 5085 * was requested by the user
5076 */ 5086 */
5077 required_movablecore = 5087 required_movablecore =
5078 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5088 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5079 corepages = totalpages - required_movablecore; 5089 corepages = totalpages - required_movablecore;
5080 5090
5081 required_kernelcore = max(required_kernelcore, corepages); 5091 required_kernelcore = max(required_kernelcore, corepages);
5082 } 5092 }
5083 5093
5084 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5094 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5085 if (!required_kernelcore) 5095 if (!required_kernelcore)
5086 goto out; 5096 goto out;
5087 5097
5088 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5098 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5089 find_usable_zone_for_movable(); 5099 find_usable_zone_for_movable();
5090 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5100 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5091 5101
5092 restart: 5102 restart:
5093 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5103 /* Spread kernelcore memory as evenly as possible throughout nodes */
5094 kernelcore_node = required_kernelcore / usable_nodes; 5104 kernelcore_node = required_kernelcore / usable_nodes;
5095 for_each_node_state(nid, N_MEMORY) { 5105 for_each_node_state(nid, N_MEMORY) {
5096 unsigned long start_pfn, end_pfn; 5106 unsigned long start_pfn, end_pfn;
5097 5107
5098 /* 5108 /*
5099 * Recalculate kernelcore_node if the division per node 5109 * Recalculate kernelcore_node if the division per node
5100 * now exceeds what is necessary to satisfy the requested 5110 * now exceeds what is necessary to satisfy the requested
5101 * amount of memory for the kernel 5111 * amount of memory for the kernel
5102 */ 5112 */
5103 if (required_kernelcore < kernelcore_node) 5113 if (required_kernelcore < kernelcore_node)
5104 kernelcore_node = required_kernelcore / usable_nodes; 5114 kernelcore_node = required_kernelcore / usable_nodes;
5105 5115
5106 /* 5116 /*
5107 * As the map is walked, we track how much memory is usable 5117 * As the map is walked, we track how much memory is usable
5108 * by the kernel using kernelcore_remaining. When it is 5118 * by the kernel using kernelcore_remaining. When it is
5109 * 0, the rest of the node is usable by ZONE_MOVABLE 5119 * 0, the rest of the node is usable by ZONE_MOVABLE
5110 */ 5120 */
5111 kernelcore_remaining = kernelcore_node; 5121 kernelcore_remaining = kernelcore_node;
5112 5122
5113 /* Go through each range of PFNs within this node */ 5123 /* Go through each range of PFNs within this node */
5114 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5124 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5115 unsigned long size_pages; 5125 unsigned long size_pages;
5116 5126
5117 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5127 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5118 if (start_pfn >= end_pfn) 5128 if (start_pfn >= end_pfn)
5119 continue; 5129 continue;
5120 5130
5121 /* Account for what is only usable for kernelcore */ 5131 /* Account for what is only usable for kernelcore */
5122 if (start_pfn < usable_startpfn) { 5132 if (start_pfn < usable_startpfn) {
5123 unsigned long kernel_pages; 5133 unsigned long kernel_pages;
5124 kernel_pages = min(end_pfn, usable_startpfn) 5134 kernel_pages = min(end_pfn, usable_startpfn)
5125 - start_pfn; 5135 - start_pfn;
5126 5136
5127 kernelcore_remaining -= min(kernel_pages, 5137 kernelcore_remaining -= min(kernel_pages,
5128 kernelcore_remaining); 5138 kernelcore_remaining);
5129 required_kernelcore -= min(kernel_pages, 5139 required_kernelcore -= min(kernel_pages,
5130 required_kernelcore); 5140 required_kernelcore);
5131 5141
5132 /* Continue if range is now fully accounted */ 5142 /* Continue if range is now fully accounted */
5133 if (end_pfn <= usable_startpfn) { 5143 if (end_pfn <= usable_startpfn) {
5134 5144
5135 /* 5145 /*
5136 * Push zone_movable_pfn to the end so 5146 * Push zone_movable_pfn to the end so
5137 * that if we have to rebalance 5147 * that if we have to rebalance
5138 * kernelcore across nodes, we will 5148 * kernelcore across nodes, we will
5139 * not double account here 5149 * not double account here
5140 */ 5150 */
5141 zone_movable_pfn[nid] = end_pfn; 5151 zone_movable_pfn[nid] = end_pfn;
5142 continue; 5152 continue;
5143 } 5153 }
5144 start_pfn = usable_startpfn; 5154 start_pfn = usable_startpfn;
5145 } 5155 }
5146 5156
5147 /* 5157 /*
5148 * The usable PFN range for ZONE_MOVABLE is from 5158 * The usable PFN range for ZONE_MOVABLE is from
5149 * start_pfn->end_pfn. Calculate size_pages as the 5159 * start_pfn->end_pfn. Calculate size_pages as the
5150 * number of pages used as kernelcore 5160 * number of pages used as kernelcore
5151 */ 5161 */
5152 size_pages = end_pfn - start_pfn; 5162 size_pages = end_pfn - start_pfn;
5153 if (size_pages > kernelcore_remaining) 5163 if (size_pages > kernelcore_remaining)
5154 size_pages = kernelcore_remaining; 5164 size_pages = kernelcore_remaining;
5155 zone_movable_pfn[nid] = start_pfn + size_pages; 5165 zone_movable_pfn[nid] = start_pfn + size_pages;
5156 5166
5157 /* 5167 /*
5158 * Some kernelcore has been met, update counts and 5168 * Some kernelcore has been met, update counts and
5159 * break if the kernelcore for this node has been 5169 * break if the kernelcore for this node has been
5160 * satisfied 5170 * satisfied
5161 */ 5171 */
5162 required_kernelcore -= min(required_kernelcore, 5172 required_kernelcore -= min(required_kernelcore,
5163 size_pages); 5173 size_pages);
5164 kernelcore_remaining -= size_pages; 5174 kernelcore_remaining -= size_pages;
5165 if (!kernelcore_remaining) 5175 if (!kernelcore_remaining)
5166 break; 5176 break;
5167 } 5177 }
5168 } 5178 }
5169 5179
5170 /* 5180 /*
5171 * If there is still required_kernelcore, we do another pass with one 5181 * If there is still required_kernelcore, we do another pass with one
5172 * less node in the count. This will push zone_movable_pfn[nid] further 5182 * less node in the count. This will push zone_movable_pfn[nid] further
5173 * along on the nodes that still have memory until kernelcore is 5183 * along on the nodes that still have memory until kernelcore is
5174 * satisfied 5184 * satisfied
5175 */ 5185 */
5176 usable_nodes--; 5186 usable_nodes--;
5177 if (usable_nodes && required_kernelcore > usable_nodes) 5187 if (usable_nodes && required_kernelcore > usable_nodes)
5178 goto restart; 5188 goto restart;
5179 5189
5180 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5190 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5181 for (nid = 0; nid < MAX_NUMNODES; nid++) 5191 for (nid = 0; nid < MAX_NUMNODES; nid++)
5182 zone_movable_pfn[nid] = 5192 zone_movable_pfn[nid] =
5183 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5193 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5184 5194
5185 out: 5195 out:
5186 /* restore the node_state */ 5196 /* restore the node_state */
5187 node_states[N_MEMORY] = saved_node_state; 5197 node_states[N_MEMORY] = saved_node_state;
5188 } 5198 }
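
The heart of the function is the restart loop: each pass hands every node with memory an equal share of the remaining kernelcore, and when some nodes are too small to absorb their share the pass is repeated with one fewer node in the divisor until the request is met. A deliberately simplified stand-alone sketch of just that redistribution; it ignores the usable_startpfn cutoff and the per-range walk, and the node sizes and kernelcore request are made up:

#include <stdio.h>

#define NR_NODES 3

int main(void)
{
        /* made-up node sizes (pages) and a kernelcore= request of 0x90000 pages */
        unsigned long node_pages[NR_NODES] = { 0x20000, 0x80000, 0x80000 };
        unsigned long kernel_pages[NR_NODES] = { 0 };
        unsigned long required = 0x90000, share;
        int usable_nodes = NR_NODES, nid;

restart:
        /* spread the remaining request evenly over the nodes still counted */
        share = required / usable_nodes;
        for (nid = 0; nid < NR_NODES; nid++) {
                unsigned long take = node_pages[nid] - kernel_pages[nid];

                if (required < share)
                        share = required / usable_nodes;
                if (take > share)
                        take = share;
                kernel_pages[nid] += take;
                required -= take;
        }
        usable_nodes--;
        if (usable_nodes && required > (unsigned long)usable_nodes)
                goto restart;

        for (nid = 0; nid < NR_NODES; nid++)
                printf("node %d: kernelcore %#lx, ZONE_MOVABLE %#lx pages\n", nid,
                       kernel_pages[nid], node_pages[nid] - kernel_pages[nid]);
        return 0;
}

In this example the small node is consumed entirely by kernelcore and the shortfall is made up on the two larger nodes in a second pass, which is exactly what the restart path above achieves by pushing zone_movable_pfn[nid] further along.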
5189 5199
5190 /* Any regular or high memory on that node ? */ 5200 /* Any regular or high memory on that node ? */
5191 static void check_for_memory(pg_data_t *pgdat, int nid) 5201 static void check_for_memory(pg_data_t *pgdat, int nid)
5192 { 5202 {
5193 enum zone_type zone_type; 5203 enum zone_type zone_type;
5194 5204
5195 if (N_MEMORY == N_NORMAL_MEMORY) 5205 if (N_MEMORY == N_NORMAL_MEMORY)
5196 return; 5206 return;
5197 5207
5198 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5208 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5199 struct zone *zone = &pgdat->node_zones[zone_type]; 5209 struct zone *zone = &pgdat->node_zones[zone_type];
5200 if (zone->present_pages) { 5210 if (zone->present_pages) {
5201 node_set_state(nid, N_HIGH_MEMORY); 5211 node_set_state(nid, N_HIGH_MEMORY);
5202 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5212 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5203 zone_type <= ZONE_NORMAL) 5213 zone_type <= ZONE_NORMAL)
5204 node_set_state(nid, N_NORMAL_MEMORY); 5214 node_set_state(nid, N_NORMAL_MEMORY);
5205 break; 5215 break;
5206 } 5216 }
5207 } 5217 }
5208 } 5218 }
5209 5219
5210 /** 5220 /**
5211 * free_area_init_nodes - Initialise all pg_data_t and zone data 5221 * free_area_init_nodes - Initialise all pg_data_t and zone data
5212 * @max_zone_pfn: an array of max PFNs for each zone 5222 * @max_zone_pfn: an array of max PFNs for each zone
5213 * 5223 *
5214 * This will call free_area_init_node() for each active node in the system. 5224 * This will call free_area_init_node() for each active node in the system.
5215 * Using the page ranges provided by add_active_range(), the size of each 5225 * Using the page ranges provided by add_active_range(), the size of each
5216 * zone in each node and their holes is calculated. If the maximum PFN 5226 * zone in each node and their holes is calculated. If the maximum PFN
5217 * between two adjacent zones matches, it is assumed that the zone is empty. 5227 * between two adjacent zones matches, it is assumed that the zone is empty.
5218 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5228 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5219 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5229 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5220 * starts where the previous one ended. For example, ZONE_DMA32 starts 5230 * starts where the previous one ended. For example, ZONE_DMA32 starts
5221 * at arch_max_dma_pfn. 5231 * at arch_max_dma_pfn.
5222 */ 5232 */
5223 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5233 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5224 { 5234 {
5225 unsigned long start_pfn, end_pfn; 5235 unsigned long start_pfn, end_pfn;
5226 int i, nid; 5236 int i, nid;
5227 5237
5228 /* Record where the zone boundaries are */ 5238 /* Record where the zone boundaries are */
5229 memset(arch_zone_lowest_possible_pfn, 0, 5239 memset(arch_zone_lowest_possible_pfn, 0,
5230 sizeof(arch_zone_lowest_possible_pfn)); 5240 sizeof(arch_zone_lowest_possible_pfn));
5231 memset(arch_zone_highest_possible_pfn, 0, 5241 memset(arch_zone_highest_possible_pfn, 0,
5232 sizeof(arch_zone_highest_possible_pfn)); 5242 sizeof(arch_zone_highest_possible_pfn));
5233 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5243 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5234 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5244 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5235 for (i = 1; i < MAX_NR_ZONES; i++) { 5245 for (i = 1; i < MAX_NR_ZONES; i++) {
5236 if (i == ZONE_MOVABLE) 5246 if (i == ZONE_MOVABLE)
5237 continue; 5247 continue;
5238 arch_zone_lowest_possible_pfn[i] = 5248 arch_zone_lowest_possible_pfn[i] =
5239 arch_zone_highest_possible_pfn[i-1]; 5249 arch_zone_highest_possible_pfn[i-1];
5240 arch_zone_highest_possible_pfn[i] = 5250 arch_zone_highest_possible_pfn[i] =
5241 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5251 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5242 } 5252 }
5243 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5253 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5244 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5254 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5245 5255
5246 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5256 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5247 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5257 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5248 find_zone_movable_pfns_for_nodes(); 5258 find_zone_movable_pfns_for_nodes();
5249 5259
5250 /* Print out the zone ranges */ 5260 /* Print out the zone ranges */
5251 printk("Zone ranges:\n"); 5261 printk("Zone ranges:\n");
5252 for (i = 0; i < MAX_NR_ZONES; i++) { 5262 for (i = 0; i < MAX_NR_ZONES; i++) {
5253 if (i == ZONE_MOVABLE) 5263 if (i == ZONE_MOVABLE)
5254 continue; 5264 continue;
5255 printk(KERN_CONT " %-8s ", zone_names[i]); 5265 printk(KERN_CONT " %-8s ", zone_names[i]);
5256 if (arch_zone_lowest_possible_pfn[i] == 5266 if (arch_zone_lowest_possible_pfn[i] ==
5257 arch_zone_highest_possible_pfn[i]) 5267 arch_zone_highest_possible_pfn[i])
5258 printk(KERN_CONT "empty\n"); 5268 printk(KERN_CONT "empty\n");
5259 else 5269 else
5260 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5270 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5261 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5271 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5262 (arch_zone_highest_possible_pfn[i] 5272 (arch_zone_highest_possible_pfn[i]
5263 << PAGE_SHIFT) - 1); 5273 << PAGE_SHIFT) - 1);
5264 } 5274 }
5265 5275
5266 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5276 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5267 printk("Movable zone start for each node\n"); 5277 printk("Movable zone start for each node\n");
5268 for (i = 0; i < MAX_NUMNODES; i++) { 5278 for (i = 0; i < MAX_NUMNODES; i++) {
5269 if (zone_movable_pfn[i]) 5279 if (zone_movable_pfn[i])
5270 printk(" Node %d: %#010lx\n", i, 5280 printk(" Node %d: %#010lx\n", i,
5271 zone_movable_pfn[i] << PAGE_SHIFT); 5281 zone_movable_pfn[i] << PAGE_SHIFT);
5272 } 5282 }
5273 5283
5274 /* Print out the early node map */ 5284 /* Print out the early node map */
5275 printk("Early memory node ranges\n"); 5285 printk("Early memory node ranges\n");
5276 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5286 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5277 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5287 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5278 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5288 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5279 5289
5280 /* Initialise every node */ 5290 /* Initialise every node */
5281 mminit_verify_pageflags_layout(); 5291 mminit_verify_pageflags_layout();
5282 setup_nr_node_ids(); 5292 setup_nr_node_ids();
5283 for_each_online_node(nid) { 5293 for_each_online_node(nid) {
5284 pg_data_t *pgdat = NODE_DATA(nid); 5294 pg_data_t *pgdat = NODE_DATA(nid);
5285 free_area_init_node(nid, NULL, 5295 free_area_init_node(nid, NULL,
5286 find_min_pfn_for_node(nid), NULL); 5296 find_min_pfn_for_node(nid), NULL);
5287 5297
5288 /* Any memory on that node */ 5298 /* Any memory on that node */
5289 if (pgdat->node_present_pages) 5299 if (pgdat->node_present_pages)
5290 node_set_state(nid, N_MEMORY); 5300 node_set_state(nid, N_MEMORY);
5291 check_for_memory(pgdat, nid); 5301 check_for_memory(pgdat, nid);
5292 } 5302 }
5293 } 5303 }
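
The first half of the function derives each kernel zone's possible PFN range from max_zone_pfn[]: zone 0 starts at the lowest registered PFN, every later zone starts where the previous one ends, and ZONE_MOVABLE is left at 0/0 because it is sized per node above. A stand-alone sketch with invented max_zone_pfn values (the 16MiB/4GiB/17GiB split is only an illustration):

#include <stdio.h>

enum { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

int main(void)
{
        /* made-up max_zone_pfn values for a ~17GiB box with 4KiB pages */
        unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0x1000, 0x100000, 0x440000, 0 };
        unsigned long lo[MAX_NR_ZONES] = { 0 }, hi[MAX_NR_ZONES] = { 0 };
        unsigned long min_pfn = 0x10;   /* stand-in for find_min_pfn_with_active_regions() */
        const char *name[] = { "DMA", "DMA32", "Normal", "Movable" };
        int i;

        lo[0] = min_pfn;
        hi[0] = max_zone_pfn[0];
        for (i = 1; i < MAX_NR_ZONES; i++) {
                if (i == ZONE_MOVABLE)
                        continue;
                lo[i] = hi[i - 1];              /* zone starts where the previous ended */
                hi[i] = max_zone_pfn[i] > lo[i] ? max_zone_pfn[i] : lo[i];
        }
        lo[ZONE_MOVABLE] = hi[ZONE_MOVABLE] = 0;  /* sized per node, not here */

        for (i = 0; i < MAX_NR_ZONES; i++)
                printf("%-8s [pfn %#lx-%#lx]\n", name[i], lo[i], hi[i]);
        return 0;
}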
5294 5304
5295 static int __init cmdline_parse_core(char *p, unsigned long *core) 5305 static int __init cmdline_parse_core(char *p, unsigned long *core)
5296 { 5306 {
5297 unsigned long long coremem; 5307 unsigned long long coremem;
5298 if (!p) 5308 if (!p)
5299 return -EINVAL; 5309 return -EINVAL;
5300 5310
5301 coremem = memparse(p, &p); 5311 coremem = memparse(p, &p);
5302 *core = coremem >> PAGE_SHIFT; 5312 *core = coremem >> PAGE_SHIFT;
5303 5313
5304 /* Paranoid check that UL is enough for the coremem value */ 5314 /* Paranoid check that UL is enough for the coremem value */
5305 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5315 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5306 5316
5307 return 0; 5317 return 0;
5308 } 5318 }
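
cmdline_parse_core() relies on memparse() for the usual K/M/G suffixes and then converts bytes to pages. A stand-alone userspace approximation; parse_size() is a local stand-in rather than the kernel helper, and PAGE_SHIFT is assumed to be 12:

#include <stdio.h>
#include <stdlib.h>

/* userspace stand-in for memparse(): number with optional K/M/G suffix */
static unsigned long long parse_size(const char *s)
{
        char *end;
        unsigned long long v = strtoull(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10;
        }
        return v;
}

int main(void)
{
        const char *arg = "512M";                       /* e.g. kernelcore=512M */
        unsigned long pages = parse_size(arg) >> 12;    /* PAGE_SHIFT assumed 12 */

        printf("kernelcore=%s -> %lu pages\n", arg, pages);
        return 0;
}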
5309 5319
5310 /* 5320 /*
5311 * kernelcore=size sets the amount of memory for use for allocations that 5321 * kernelcore=size sets the amount of memory for use for allocations that
5312 * cannot be reclaimed or migrated. 5322 * cannot be reclaimed or migrated.
5313 */ 5323 */
5314 static int __init cmdline_parse_kernelcore(char *p) 5324 static int __init cmdline_parse_kernelcore(char *p)
5315 { 5325 {
5316 return cmdline_parse_core(p, &required_kernelcore); 5326 return cmdline_parse_core(p, &required_kernelcore);
5317 } 5327 }
5318 5328
5319 /* 5329 /*
5320 * movablecore=size sets the amount of memory for use for allocations that 5330 * movablecore=size sets the amount of memory for use for allocations that
5321 * can be reclaimed or migrated. 5331 * can be reclaimed or migrated.
5322 */ 5332 */
5323 static int __init cmdline_parse_movablecore(char *p) 5333 static int __init cmdline_parse_movablecore(char *p)
5324 { 5334 {
5325 return cmdline_parse_core(p, &required_movablecore); 5335 return cmdline_parse_core(p, &required_movablecore);
5326 } 5336 }
5327 5337
5328 early_param("kernelcore", cmdline_parse_kernelcore); 5338 early_param("kernelcore", cmdline_parse_kernelcore);
5329 early_param("movablecore", cmdline_parse_movablecore); 5339 early_param("movablecore", cmdline_parse_movablecore);
5330 5340
5331 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5341 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5332 5342
5333 void adjust_managed_page_count(struct page *page, long count) 5343 void adjust_managed_page_count(struct page *page, long count)
5334 { 5344 {
5335 spin_lock(&managed_page_count_lock); 5345 spin_lock(&managed_page_count_lock);
5336 page_zone(page)->managed_pages += count; 5346 page_zone(page)->managed_pages += count;
5337 totalram_pages += count; 5347 totalram_pages += count;
5338 #ifdef CONFIG_HIGHMEM 5348 #ifdef CONFIG_HIGHMEM
5339 if (PageHighMem(page)) 5349 if (PageHighMem(page))
5340 totalhigh_pages += count; 5350 totalhigh_pages += count;
5341 #endif 5351 #endif
5342 spin_unlock(&managed_page_count_lock); 5352 spin_unlock(&managed_page_count_lock);
5343 } 5353 }
5344 EXPORT_SYMBOL(adjust_managed_page_count); 5354 EXPORT_SYMBOL(adjust_managed_page_count);
5345 5355
5346 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5356 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5347 { 5357 {
5348 void *pos; 5358 void *pos;
5349 unsigned long pages = 0; 5359 unsigned long pages = 0;
5350 5360
5351 start = (void *)PAGE_ALIGN((unsigned long)start); 5361 start = (void *)PAGE_ALIGN((unsigned long)start);
5352 end = (void *)((unsigned long)end & PAGE_MASK); 5362 end = (void *)((unsigned long)end & PAGE_MASK);
5353 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5363 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5354 if ((unsigned int)poison <= 0xFF) 5364 if ((unsigned int)poison <= 0xFF)
5355 memset(pos, poison, PAGE_SIZE); 5365 memset(pos, poison, PAGE_SIZE);
5356 free_reserved_page(virt_to_page(pos)); 5366 free_reserved_page(virt_to_page(pos));
5357 } 5367 }
5358 5368
5359 if (pages && s) 5369 if (pages && s)
5360 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5370 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5361 s, pages << (PAGE_SHIFT - 10), start, end); 5371 s, pages << (PAGE_SHIFT - 10), start, end);
5362 5372
5363 return pages; 5373 return pages;
5364 } 5374 }
5365 EXPORT_SYMBOL(free_reserved_area); 5375 EXPORT_SYMBOL(free_reserved_area);
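free_reserved_area() only hands back whole pages: the start is rounded up with PAGE_ALIGN() and the end rounded down with PAGE_MASK, so partial pages at either edge stay reserved. A small sketch of just that rounding, assuming 4K pages and an invented, unaligned region:

#include <stdio.h>

#define PAGE_SIZE 4096UL /* assumption: 4K pages */
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
    /* Hypothetical reserved region that is not page aligned. */
    unsigned long start = 0x100000 + 100;   /* 1 MiB + 100 bytes */
    unsigned long end   = 0x180000 - 40;    /* 1.5 MiB - 40 bytes */

    unsigned long first = PAGE_ALIGN(start);  /* round start up   */
    unsigned long last  = end & PAGE_MASK;    /* round end down   */
    unsigned long pages = (last - first) / PAGE_SIZE;

    printf("freeing %lu whole pages: [%#lx, %#lx)\n", pages, first, last);
    return 0;
}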
5366 5376
5367 #ifdef CONFIG_HIGHMEM 5377 #ifdef CONFIG_HIGHMEM
5368 void free_highmem_page(struct page *page) 5378 void free_highmem_page(struct page *page)
5369 { 5379 {
5370 __free_reserved_page(page); 5380 __free_reserved_page(page);
5371 totalram_pages++; 5381 totalram_pages++;
5372 page_zone(page)->managed_pages++; 5382 page_zone(page)->managed_pages++;
5373 totalhigh_pages++; 5383 totalhigh_pages++;
5374 } 5384 }
5375 #endif 5385 #endif
5376 5386
5377 5387
5378 void __init mem_init_print_info(const char *str) 5388 void __init mem_init_print_info(const char *str)
5379 { 5389 {
5380 unsigned long physpages, codesize, datasize, rosize, bss_size; 5390 unsigned long physpages, codesize, datasize, rosize, bss_size;
5381 unsigned long init_code_size, init_data_size; 5391 unsigned long init_code_size, init_data_size;
5382 5392
5383 physpages = get_num_physpages(); 5393 physpages = get_num_physpages();
5384 codesize = _etext - _stext; 5394 codesize = _etext - _stext;
5385 datasize = _edata - _sdata; 5395 datasize = _edata - _sdata;
5386 rosize = __end_rodata - __start_rodata; 5396 rosize = __end_rodata - __start_rodata;
5387 bss_size = __bss_stop - __bss_start; 5397 bss_size = __bss_stop - __bss_start;
5388 init_data_size = __init_end - __init_begin; 5398 init_data_size = __init_end - __init_begin;
5389 init_code_size = _einittext - _sinittext; 5399 init_code_size = _einittext - _sinittext;
5390 5400
5391 /* 5401 /*
5392 * Detect special cases and adjust section sizes accordingly: 5402 * Detect special cases and adjust section sizes accordingly:
5393 * 1) .init.* may be embedded into .data sections 5403 * 1) .init.* may be embedded into .data sections
5394 * 2) .init.text.* may be out of [__init_begin, __init_end], 5404 * 2) .init.text.* may be out of [__init_begin, __init_end],
5395 * please refer to arch/tile/kernel/vmlinux.lds.S. 5405 * please refer to arch/tile/kernel/vmlinux.lds.S.
5396 * 3) .rodata.* may be embedded into .text or .data sections. 5406 * 3) .rodata.* may be embedded into .text or .data sections.
5397 */ 5407 */
5398 #define adj_init_size(start, end, size, pos, adj) \ 5408 #define adj_init_size(start, end, size, pos, adj) \
5399 do { \ 5409 do { \
5400 if (start <= pos && pos < end && size > adj) \ 5410 if (start <= pos && pos < end && size > adj) \
5401 size -= adj; \ 5411 size -= adj; \
5402 } while (0) 5412 } while (0)
5403 5413
5404 adj_init_size(__init_begin, __init_end, init_data_size, 5414 adj_init_size(__init_begin, __init_end, init_data_size,
5405 _sinittext, init_code_size); 5415 _sinittext, init_code_size);
5406 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5416 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5407 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5417 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5408 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5418 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5409 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5419 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5410 5420
5411 #undef adj_init_size 5421 #undef adj_init_size
5412 5422
5413 printk("Memory: %luK/%luK available " 5423 printk("Memory: %luK/%luK available "
5414 "(%luK kernel code, %luK rwdata, %luK rodata, " 5424 "(%luK kernel code, %luK rwdata, %luK rodata, "
5415 "%luK init, %luK bss, %luK reserved" 5425 "%luK init, %luK bss, %luK reserved"
5416 #ifdef CONFIG_HIGHMEM 5426 #ifdef CONFIG_HIGHMEM
5417 ", %luK highmem" 5427 ", %luK highmem"
5418 #endif 5428 #endif
5419 "%s%s)\n", 5429 "%s%s)\n",
5420 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5430 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5421 codesize >> 10, datasize >> 10, rosize >> 10, 5431 codesize >> 10, datasize >> 10, rosize >> 10,
5422 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5432 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5423 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5433 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5424 #ifdef CONFIG_HIGHMEM 5434 #ifdef CONFIG_HIGHMEM
5425 totalhigh_pages << (PAGE_SHIFT-10), 5435 totalhigh_pages << (PAGE_SHIFT-10),
5426 #endif 5436 #endif
5427 str ? ", " : "", str ? str : ""); 5437 str ? ", " : "", str ? str : "");
5428 } 5438 }
5429 5439
5430 /** 5440 /**
5431 * set_dma_reserve - set the specified number of pages reserved in the first zone 5441 * set_dma_reserve - set the specified number of pages reserved in the first zone
5432 * @new_dma_reserve: The number of pages to mark reserved 5442 * @new_dma_reserve: The number of pages to mark reserved
5433 * 5443 *
5434 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5444 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5435 * In the DMA zone, a significant percentage may be consumed by kernel image 5445 * In the DMA zone, a significant percentage may be consumed by kernel image
5436 * and other unfreeable allocations which can skew the watermarks badly. This 5446 * and other unfreeable allocations which can skew the watermarks badly. This
5437 * function may optionally be used to account for unfreeable pages in the 5447 * function may optionally be used to account for unfreeable pages in the
5438 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5448 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5439 * smaller per-cpu batchsize. 5449 * smaller per-cpu batchsize.
5440 */ 5450 */
5441 void __init set_dma_reserve(unsigned long new_dma_reserve) 5451 void __init set_dma_reserve(unsigned long new_dma_reserve)
5442 { 5452 {
5443 dma_reserve = new_dma_reserve; 5453 dma_reserve = new_dma_reserve;
5444 } 5454 }
5445 5455
5446 void __init free_area_init(unsigned long *zones_size) 5456 void __init free_area_init(unsigned long *zones_size)
5447 { 5457 {
5448 free_area_init_node(0, zones_size, 5458 free_area_init_node(0, zones_size,
5449 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5459 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5450 } 5460 }
5451 5461
5452 static int page_alloc_cpu_notify(struct notifier_block *self, 5462 static int page_alloc_cpu_notify(struct notifier_block *self,
5453 unsigned long action, void *hcpu) 5463 unsigned long action, void *hcpu)
5454 { 5464 {
5455 int cpu = (unsigned long)hcpu; 5465 int cpu = (unsigned long)hcpu;
5456 5466
5457 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5467 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5458 lru_add_drain_cpu(cpu); 5468 lru_add_drain_cpu(cpu);
5459 drain_pages(cpu); 5469 drain_pages(cpu);
5460 5470
5461 /* 5471 /*
5462 * Spill the event counters of the dead processor 5472 * Spill the event counters of the dead processor
5463 * into the current processor's event counters. 5473
5464 * This artificially elevates the count of the current 5474 * This artificially elevates the count of the current
5465 * processor. 5475 * processor.
5466 */ 5476 */
5467 vm_events_fold_cpu(cpu); 5477 vm_events_fold_cpu(cpu);
5468 5478
5469 /* 5479 /*
5470 * Zero the differential counters of the dead processor 5480 * Zero the differential counters of the dead processor
5471 * so that the vm statistics are consistent. 5481 * so that the vm statistics are consistent.
5472 * 5482 *
5473 * This is only okay since the processor is dead and cannot 5483 * This is only okay since the processor is dead and cannot
5474 * race with what we are doing. 5484 * race with what we are doing.
5475 */ 5485 */
5476 cpu_vm_stats_fold(cpu); 5486 cpu_vm_stats_fold(cpu);
5477 } 5487 }
5478 return NOTIFY_OK; 5488 return NOTIFY_OK;
5479 } 5489 }
5480 5490
5481 void __init page_alloc_init(void) 5491 void __init page_alloc_init(void)
5482 { 5492 {
5483 hotcpu_notifier(page_alloc_cpu_notify, 0); 5493 hotcpu_notifier(page_alloc_cpu_notify, 0);
5484 } 5494 }
5485 5495
5486 /* 5496 /*
5487 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 5497
5488 * or min_free_kbytes changes. 5498 * or min_free_kbytes changes.
5489 */ 5499 */
5490 static void calculate_totalreserve_pages(void) 5500 static void calculate_totalreserve_pages(void)
5491 { 5501 {
5492 struct pglist_data *pgdat; 5502 struct pglist_data *pgdat;
5493 unsigned long reserve_pages = 0; 5503 unsigned long reserve_pages = 0;
5494 enum zone_type i, j; 5504 enum zone_type i, j;
5495 5505
5496 for_each_online_pgdat(pgdat) { 5506 for_each_online_pgdat(pgdat) {
5497 for (i = 0; i < MAX_NR_ZONES; i++) { 5507 for (i = 0; i < MAX_NR_ZONES; i++) {
5498 struct zone *zone = pgdat->node_zones + i; 5508 struct zone *zone = pgdat->node_zones + i;
5499 unsigned long max = 0; 5509 unsigned long max = 0;
5500 5510
5501 /* Find valid and maximum lowmem_reserve in the zone */ 5511 /* Find valid and maximum lowmem_reserve in the zone */
5502 for (j = i; j < MAX_NR_ZONES; j++) { 5512 for (j = i; j < MAX_NR_ZONES; j++) {
5503 if (zone->lowmem_reserve[j] > max) 5513 if (zone->lowmem_reserve[j] > max)
5504 max = zone->lowmem_reserve[j]; 5514 max = zone->lowmem_reserve[j];
5505 } 5515 }
5506 5516
5507 /* we treat the high watermark as reserved pages. */ 5517 /* we treat the high watermark as reserved pages. */
5508 max += high_wmark_pages(zone); 5518 max += high_wmark_pages(zone);
5509 5519
5510 if (max > zone->managed_pages) 5520 if (max > zone->managed_pages)
5511 max = zone->managed_pages; 5521 max = zone->managed_pages;
5512 reserve_pages += max; 5522 reserve_pages += max;
5513 /* 5523 /*
5514 * Lowmem reserves are not available to 5524 * Lowmem reserves are not available to
5515 * GFP_HIGHUSER page cache allocations and 5525 * GFP_HIGHUSER page cache allocations and
5516 * kswapd tries to balance zones to their high 5526 * kswapd tries to balance zones to their high
5517 * watermark. As a result, neither should be 5527 * watermark. As a result, neither should be
5518 * regarded as dirtyable memory, to prevent a 5528 * regarded as dirtyable memory, to prevent a
5519 * situation where reclaim has to clean pages 5529 * situation where reclaim has to clean pages
5520 * in order to balance the zones. 5530 * in order to balance the zones.
5521 */ 5531 */
5522 zone->dirty_balance_reserve = max; 5532 zone->dirty_balance_reserve = max;
5523 } 5533 }
5524 } 5534 }
5525 dirty_balance_reserve = reserve_pages; 5535 dirty_balance_reserve = reserve_pages;
5526 totalreserve_pages = reserve_pages; 5536 totalreserve_pages = reserve_pages;
5527 } 5537 }
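Each zone's contribution to totalreserve_pages is the largest lowmem_reserve[] entry at or above its own index, plus its high watermark, capped at the zone's managed pages. A toy reimplementation of that arithmetic with made-up numbers for a two-zone (DMA/NORMAL) node:

#include <stdio.h>

#define MAX_NR_ZONES 2

struct toy_zone {
    const char *name;
    unsigned long managed_pages;
    unsigned long high_wmark;
    unsigned long lowmem_reserve[MAX_NR_ZONES];
};

int main(void)
{
    /* Hypothetical values, roughly a 16MB DMA zone and a 2GB NORMAL zone. */
    struct toy_zone zones[MAX_NR_ZONES] = {
        { "DMA",    4096,    128, { 0, 1984 } },
        { "NORMAL", 507904, 12288, { 0, 0 } },
    };
    unsigned long reserve_pages = 0;

    for (int i = 0; i < MAX_NR_ZONES; i++) {
        unsigned long max = 0;

        /* Largest lowmem_reserve at or above this zone's index. */
        for (int j = i; j < MAX_NR_ZONES; j++)
            if (zones[i].lowmem_reserve[j] > max)
                max = zones[i].lowmem_reserve[j];

        max += zones[i].high_wmark;           /* high watermark counts too  */
        if (max > zones[i].managed_pages)     /* never more than the zone has */
            max = zones[i].managed_pages;

        printf("%s reserves %lu pages\n", zones[i].name, max);
        reserve_pages += max;
    }
    printf("totalreserve_pages = %lu\n", reserve_pages);
    return 0;
}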
5528 5538
5529 /* 5539 /*
5530 * setup_per_zone_lowmem_reserve - called whenever 5540 * setup_per_zone_lowmem_reserve - called whenever
5531 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 5541
5532 * has a correct pages reserved value, so an adequate number of 5542 * has a correct pages reserved value, so an adequate number of
5533 * pages are left in the zone after a successful __alloc_pages(). 5543 * pages are left in the zone after a successful __alloc_pages().
5534 */ 5544 */
5535 static void setup_per_zone_lowmem_reserve(void) 5545 static void setup_per_zone_lowmem_reserve(void)
5536 { 5546 {
5537 struct pglist_data *pgdat; 5547 struct pglist_data *pgdat;
5538 enum zone_type j, idx; 5548 enum zone_type j, idx;
5539 5549
5540 for_each_online_pgdat(pgdat) { 5550 for_each_online_pgdat(pgdat) {
5541 for (j = 0; j < MAX_NR_ZONES; j++) { 5551 for (j = 0; j < MAX_NR_ZONES; j++) {
5542 struct zone *zone = pgdat->node_zones + j; 5552 struct zone *zone = pgdat->node_zones + j;
5543 unsigned long managed_pages = zone->managed_pages; 5553 unsigned long managed_pages = zone->managed_pages;
5544 5554
5545 zone->lowmem_reserve[j] = 0; 5555 zone->lowmem_reserve[j] = 0;
5546 5556
5547 idx = j; 5557 idx = j;
5548 while (idx) { 5558 while (idx) {
5549 struct zone *lower_zone; 5559 struct zone *lower_zone;
5550 5560
5551 idx--; 5561 idx--;
5552 5562
5553 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5563 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5554 sysctl_lowmem_reserve_ratio[idx] = 1; 5564 sysctl_lowmem_reserve_ratio[idx] = 1;
5555 5565
5556 lower_zone = pgdat->node_zones + idx; 5566 lower_zone = pgdat->node_zones + idx;
5557 lower_zone->lowmem_reserve[j] = managed_pages / 5567 lower_zone->lowmem_reserve[j] = managed_pages /
5558 sysctl_lowmem_reserve_ratio[idx]; 5568 sysctl_lowmem_reserve_ratio[idx];
5559 managed_pages += lower_zone->managed_pages; 5569 managed_pages += lower_zone->managed_pages;
5560 } 5570 }
5561 } 5571 }
5562 } 5572 }
5563 5573
5564 /* update totalreserve_pages */ 5574 /* update totalreserve_pages */
5565 calculate_totalreserve_pages(); 5575 calculate_totalreserve_pages();
5566 } 5576 }
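setup_per_zone_lowmem_reserve() walks downward from every zone and gives each lower zone a reserve equal to the managed pages accumulated so far divided by that lower zone's sysctl_lowmem_reserve_ratio entry. A standalone sketch of just that loop; the zone sizes are invented and the ratios are merely chosen to resemble typical defaults:

#include <stdio.h>

#define MAX_NR_ZONES 3

int main(void)
{
    const char *names[MAX_NR_ZONES] = { "DMA", "NORMAL", "HIGHMEM" };
    /* Hypothetical managed page counts per zone. */
    unsigned long managed[MAX_NR_ZONES] = { 4000, 200000, 800000 };
    /* Smaller ratio => larger reserve kept in the lower zone. */
    int ratio[MAX_NR_ZONES] = { 256, 32, 32 };
    unsigned long reserve[MAX_NR_ZONES][MAX_NR_ZONES] = { { 0 } };

    for (int j = 0; j < MAX_NR_ZONES; j++) {
        unsigned long pages = managed[j];
        int idx = j;

        while (idx) {
            idx--;
            /* Lower zone idx keeps pages/ratio[idx] out of reach of
             * allocations that could have used zone j instead. */
            reserve[idx][j] = pages / ratio[idx];
            pages += managed[idx];
        }
    }

    for (int idx = 0; idx < MAX_NR_ZONES; idx++)
        for (int j = 0; j < MAX_NR_ZONES; j++)
            if (reserve[idx][j])
                printf("%s->lowmem_reserve[%s] = %lu pages\n",
                       names[idx], names[j], reserve[idx][j]);
    return 0;
}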
5567 5577
5568 static void __setup_per_zone_wmarks(void) 5578 static void __setup_per_zone_wmarks(void)
5569 { 5579 {
5570 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5580 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5571 unsigned long lowmem_pages = 0; 5581 unsigned long lowmem_pages = 0;
5572 struct zone *zone; 5582 struct zone *zone;
5573 unsigned long flags; 5583 unsigned long flags;
5574 5584
5575 /* Calculate total number of !ZONE_HIGHMEM pages */ 5585 /* Calculate total number of !ZONE_HIGHMEM pages */
5576 for_each_zone(zone) { 5586 for_each_zone(zone) {
5577 if (!is_highmem(zone)) 5587 if (!is_highmem(zone))
5578 lowmem_pages += zone->managed_pages; 5588 lowmem_pages += zone->managed_pages;
5579 } 5589 }
5580 5590
5581 for_each_zone(zone) { 5591 for_each_zone(zone) {
5582 u64 tmp; 5592 u64 tmp;
5583 5593
5584 spin_lock_irqsave(&zone->lock, flags); 5594 spin_lock_irqsave(&zone->lock, flags);
5585 tmp = (u64)pages_min * zone->managed_pages; 5595 tmp = (u64)pages_min * zone->managed_pages;
5586 do_div(tmp, lowmem_pages); 5596 do_div(tmp, lowmem_pages);
5587 if (is_highmem(zone)) { 5597 if (is_highmem(zone)) {
5588 /* 5598 /*
5589 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5599 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5590 * need highmem pages, so cap pages_min to a small 5600 * need highmem pages, so cap pages_min to a small
5591 * value here. 5601 * value here.
5592 * 5602 *
5593 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5603 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5594 * deltas control async page reclaim, and so should 5604
5595 * not be capped for highmem. 5605 * not be capped for highmem.
5596 */ 5606 */
5597 unsigned long min_pages; 5607 unsigned long min_pages;
5598 5608
5599 min_pages = zone->managed_pages / 1024; 5609 min_pages = zone->managed_pages / 1024;
5600 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5610 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5601 zone->watermark[WMARK_MIN] = min_pages; 5611 zone->watermark[WMARK_MIN] = min_pages;
5602 } else { 5612 } else {
5603 /* 5613 /*
5604 * If it's a lowmem zone, reserve a number of pages 5614 * If it's a lowmem zone, reserve a number of pages
5605 * proportionate to the zone's size. 5615 * proportionate to the zone's size.
5606 */ 5616 */
5607 zone->watermark[WMARK_MIN] = tmp; 5617 zone->watermark[WMARK_MIN] = tmp;
5608 } 5618 }
5609 5619
5610 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5620 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5611 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5621 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5612 5622
5613 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5623 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5614 high_wmark_pages(zone) - 5624 high_wmark_pages(zone) -
5615 low_wmark_pages(zone) - 5625 low_wmark_pages(zone) -
5616 zone_page_state(zone, NR_ALLOC_BATCH)); 5626 zone_page_state(zone, NR_ALLOC_BATCH));
5617 5627
5618 setup_zone_migrate_reserve(zone); 5628 setup_zone_migrate_reserve(zone);
5619 spin_unlock_irqrestore(&zone->lock, flags); 5629 spin_unlock_irqrestore(&zone->lock, flags);
5620 } 5630 }
5621 5631
5622 /* update totalreserve_pages */ 5632 /* update totalreserve_pages */
5623 calculate_totalreserve_pages(); 5633 calculate_totalreserve_pages();
5624 } 5634 }
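For a lowmem zone the watermarks follow directly from min_free_kbytes: the zone receives a share of pages_min proportional to its managed pages, and WMARK_LOW/WMARK_HIGH sit 25% and 50% above WMARK_MIN. A quick worked example, assuming 4K pages, a single 2GB lowmem zone and min_free_kbytes = 4096:

#include <stdio.h>

int main(void)
{
    /* Assumptions: 4K pages, one lowmem zone holding all of lowmem. */
    unsigned long min_free_kbytes = 4096;
    unsigned long pages_min = min_free_kbytes >> (12 - 10); /* kbytes -> pages */
    unsigned long managed_pages = 524288;                   /* 2GB / 4K        */
    unsigned long lowmem_pages = managed_pages;              /* only one zone   */

    unsigned long long tmp = (unsigned long long)pages_min * managed_pages;
    tmp /= lowmem_pages;                                     /* zone's share    */

    unsigned long wmark_min  = tmp;
    unsigned long wmark_low  = wmark_min + (tmp >> 2);       /* +25% */
    unsigned long wmark_high = wmark_min + (tmp >> 1);       /* +50% */

    printf("min=%lu low=%lu high=%lu pages\n",
           wmark_min, wmark_low, wmark_high);                /* 1024 1280 1536 */
    return 0;
}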
5625 5635
5626 /** 5636 /**
5627 * setup_per_zone_wmarks - called when min_free_kbytes changes 5637 * setup_per_zone_wmarks - called when min_free_kbytes changes
5628 * or when memory is hot-{added|removed} 5638 * or when memory is hot-{added|removed}
5629 * 5639 *
5630 * Ensures that the watermark[min,low,high] values for each zone are set 5640 * Ensures that the watermark[min,low,high] values for each zone are set
5631 * correctly with respect to min_free_kbytes. 5641 * correctly with respect to min_free_kbytes.
5632 */ 5642 */
5633 void setup_per_zone_wmarks(void) 5643 void setup_per_zone_wmarks(void)
5634 { 5644 {
5635 mutex_lock(&zonelists_mutex); 5645 mutex_lock(&zonelists_mutex);
5636 __setup_per_zone_wmarks(); 5646 __setup_per_zone_wmarks();
5637 mutex_unlock(&zonelists_mutex); 5647 mutex_unlock(&zonelists_mutex);
5638 } 5648 }
5639 5649
5640 /* 5650 /*
5641 * The inactive anon list should be small enough that the VM never has to 5651 * The inactive anon list should be small enough that the VM never has to
5642 * do too much work, but large enough that each inactive page has a chance 5652 * do too much work, but large enough that each inactive page has a chance
5643 * to be referenced again before it is swapped out. 5653 * to be referenced again before it is swapped out.
5644 * 5654 *
5645 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5655 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5646 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5656 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5647 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5657 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5648 * the anonymous pages are kept on the inactive list. 5658 * the anonymous pages are kept on the inactive list.
5649 * 5659 *
5650 * total target max 5660 * total target max
5651 * memory ratio inactive anon 5661 * memory ratio inactive anon
5652 * ------------------------------------- 5662 * -------------------------------------
5653 * 10MB 1 5MB 5663 * 10MB 1 5MB
5654 * 100MB 1 50MB 5664 * 100MB 1 50MB
5655 * 1GB 3 250MB 5665 * 1GB 3 250MB
5656 * 10GB 10 0.9GB 5666 * 10GB 10 0.9GB
5657 * 100GB 31 3GB 5667 * 100GB 31 3GB
5658 * 1TB 101 10GB 5668 * 1TB 101 10GB
5659 * 10TB 320 32GB 5669 * 10TB 320 32GB
5660 */ 5670 */
5661 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5671 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5662 { 5672 {
5663 unsigned int gb, ratio; 5673 unsigned int gb, ratio;
5664 5674
5665 /* Zone size in gigabytes */ 5675 /* Zone size in gigabytes */
5666 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5676 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5667 if (gb) 5677 if (gb)
5668 ratio = int_sqrt(10 * gb); 5678 ratio = int_sqrt(10 * gb);
5669 else 5679 else
5670 ratio = 1; 5680 ratio = 1;
5671 5681
5672 zone->inactive_ratio = ratio; 5682 zone->inactive_ratio = ratio;
5673 } 5683 }
5674 5684
5675 static void __meminit setup_per_zone_inactive_ratio(void) 5685 static void __meminit setup_per_zone_inactive_ratio(void)
5676 { 5686 {
5677 struct zone *zone; 5687 struct zone *zone;
5678 5688
5679 for_each_zone(zone) 5689 for_each_zone(zone)
5680 calculate_zone_inactive_ratio(zone); 5690 calculate_zone_inactive_ratio(zone);
5681 } 5691 }
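The inactive ratio is simply int_sqrt(10 * zone_size_in_GB) with a floor of 1, which is where the 1/3/10/31/101/320 column in the table above comes from. The sketch below reproduces those rows; isqrt() is a naive stand-in for the kernel's int_sqrt():

#include <stdio.h>

/* Naive integer square root; stand-in for the kernel's int_sqrt(). */
static unsigned int isqrt(unsigned long x)
{
    unsigned long r = 0;

    while ((r + 1) * (r + 1) <= x)
        r++;
    return r;
}

int main(void)
{
    /* Zone sizes in GB matching the table in the comment above. */
    unsigned long gbs[] = { 0, 0, 1, 10, 100, 1024, 10240 };
    const char *labels[] = { "10MB", "100MB", "1GB", "10GB",
                             "100GB", "1TB", "10TB" };

    for (int i = 0; i < 7; i++) {
        unsigned long gb = gbs[i];
        unsigned int ratio = gb ? isqrt(10UL * gb) : 1;

        printf("%6s -> inactive_ratio %u\n", labels[i], ratio);
    }
    return 0;
}

The output (1, 1, 3, 10, 31, 101, 320) matches the target-ratio column above.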
5682 5692
5683 /* 5693 /*
5684 * Initialise min_free_kbytes. 5694 * Initialise min_free_kbytes.
5685 * 5695 *
5686 * For small machines we want it small (128k min). For large machines 5696 * For small machines we want it small (128k min). For large machines
5687 * we want it large (64MB max). But it is not linear, because network 5697 * we want it large (64MB max). But it is not linear, because network
5688 * bandwidth does not increase linearly with machine size. We use 5698 * bandwidth does not increase linearly with machine size. We use
5689 * 5699 *
5690 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5700 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5691 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5701 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5692 * 5702 *
5693 * which yields 5703 * which yields
5694 * 5704 *
5695 * 16MB: 512k 5705 * 16MB: 512k
5696 * 32MB: 724k 5706 * 32MB: 724k
5697 * 64MB: 1024k 5707 * 64MB: 1024k
5698 * 128MB: 1448k 5708 * 128MB: 1448k
5699 * 256MB: 2048k 5709 * 256MB: 2048k
5700 * 512MB: 2896k 5710 * 512MB: 2896k
5701 * 1024MB: 4096k 5711 * 1024MB: 4096k
5702 * 2048MB: 5792k 5712 * 2048MB: 5792k
5703 * 4096MB: 8192k 5713 * 4096MB: 8192k
5704 * 8192MB: 11584k 5714 * 8192MB: 11584k
5705 * 16384MB: 16384k 5715 * 16384MB: 16384k
5706 */ 5716 */
5707 int __meminit init_per_zone_wmark_min(void) 5717 int __meminit init_per_zone_wmark_min(void)
5708 { 5718 {
5709 unsigned long lowmem_kbytes; 5719 unsigned long lowmem_kbytes;
5710 int new_min_free_kbytes; 5720 int new_min_free_kbytes;
5711 5721
5712 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5722 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5713 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5723 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5714 5724
5715 if (new_min_free_kbytes > user_min_free_kbytes) { 5725 if (new_min_free_kbytes > user_min_free_kbytes) {
5716 min_free_kbytes = new_min_free_kbytes; 5726 min_free_kbytes = new_min_free_kbytes;
5717 if (min_free_kbytes < 128) 5727 if (min_free_kbytes < 128)
5718 min_free_kbytes = 128; 5728 min_free_kbytes = 128;
5719 if (min_free_kbytes > 65536) 5729 if (min_free_kbytes > 65536)
5720 min_free_kbytes = 65536; 5730 min_free_kbytes = 65536;
5721 } else { 5731 } else {
5722 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5732 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5723 new_min_free_kbytes, user_min_free_kbytes); 5733 new_min_free_kbytes, user_min_free_kbytes);
5724 } 5734 }
5725 setup_per_zone_wmarks(); 5735 setup_per_zone_wmarks();
5726 refresh_zone_stat_thresholds(); 5736 refresh_zone_stat_thresholds();
5727 setup_per_zone_lowmem_reserve(); 5737 setup_per_zone_lowmem_reserve();
5728 setup_per_zone_inactive_ratio(); 5738 setup_per_zone_inactive_ratio();
5729 return 0; 5739 return 0;
5730 } 5740 }
5731 module_init(init_per_zone_wmark_min) 5741 module_init(init_per_zone_wmark_min)
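The boot-time min_free_kbytes is just int_sqrt(lowmem_kbytes * 16) clamped to the 128..65536 range, which generates the 16MB -> 512k ... 16384MB -> 16384k table in the comment above. A sketch reproducing a few of those rows, again with a naive integer square root standing in for int_sqrt():

#include <stdio.h>

/* Naive integer square root; stand-in for the kernel's int_sqrt(). */
static unsigned long isqrt(unsigned long x)
{
    unsigned long r = 0;

    while ((r + 1) * (r + 1) <= x)
        r++;
    return r;
}

int main(void)
{
    /* Low memory sizes in MB matching a few rows of the table above. */
    unsigned long lowmem_mb[] = { 16, 128, 1024, 4096, 16384 };

    for (int i = 0; i < 5; i++) {
        unsigned long lowmem_kbytes = lowmem_mb[i] << 10;
        long min_free_kbytes = isqrt(lowmem_kbytes * 16);

        if (min_free_kbytes < 128)
            min_free_kbytes = 128;
        if (min_free_kbytes > 65536)
            min_free_kbytes = 65536;

        printf("%6luMB lowmem -> min_free_kbytes = %ldk\n",
               lowmem_mb[i], min_free_kbytes);
    }
    return 0;
}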
5732 5742
5733 /* 5743 /*
5734 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5744 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5735 * that we can call two helper functions whenever min_free_kbytes 5745 * that we can call two helper functions whenever min_free_kbytes
5736 * changes. 5746 * changes.
5737 */ 5747 */
5738 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5748 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5739 void __user *buffer, size_t *length, loff_t *ppos) 5749 void __user *buffer, size_t *length, loff_t *ppos)
5740 { 5750 {
5741 int rc; 5751 int rc;
5742 5752
5743 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5753 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5744 if (rc) 5754 if (rc)
5745 return rc; 5755 return rc;
5746 5756
5747 if (write) { 5757 if (write) {
5748 user_min_free_kbytes = min_free_kbytes; 5758 user_min_free_kbytes = min_free_kbytes;
5749 setup_per_zone_wmarks(); 5759 setup_per_zone_wmarks();
5750 } 5760 }
5751 return 0; 5761 return 0;
5752 } 5762 }
5753 5763
5754 #ifdef CONFIG_NUMA 5764 #ifdef CONFIG_NUMA
5755 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5765 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5756 void __user *buffer, size_t *length, loff_t *ppos) 5766 void __user *buffer, size_t *length, loff_t *ppos)
5757 { 5767 {
5758 struct zone *zone; 5768 struct zone *zone;
5759 int rc; 5769 int rc;
5760 5770
5761 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5771 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5762 if (rc) 5772 if (rc)
5763 return rc; 5773 return rc;
5764 5774
5765 for_each_zone(zone) 5775 for_each_zone(zone)
5766 zone->min_unmapped_pages = (zone->managed_pages * 5776 zone->min_unmapped_pages = (zone->managed_pages *
5767 sysctl_min_unmapped_ratio) / 100; 5777 sysctl_min_unmapped_ratio) / 100;
5768 return 0; 5778 return 0;
5769 } 5779 }
5770 5780
5771 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5781 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5772 void __user *buffer, size_t *length, loff_t *ppos) 5782 void __user *buffer, size_t *length, loff_t *ppos)
5773 { 5783 {
5774 struct zone *zone; 5784 struct zone *zone;
5775 int rc; 5785 int rc;
5776 5786
5777 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5787 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5778 if (rc) 5788 if (rc)
5779 return rc; 5789 return rc;
5780 5790
5781 for_each_zone(zone) 5791 for_each_zone(zone)
5782 zone->min_slab_pages = (zone->managed_pages * 5792 zone->min_slab_pages = (zone->managed_pages *
5783 sysctl_min_slab_ratio) / 100; 5793 sysctl_min_slab_ratio) / 100;
5784 return 0; 5794 return 0;
5785 } 5795 }
5786 #endif 5796 #endif
5787 5797
5788 /* 5798 /*
5789 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5799 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5790 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5800 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5791 * whenever sysctl_lowmem_reserve_ratio changes. 5801 * whenever sysctl_lowmem_reserve_ratio changes.
5792 * 5802 *
5793 * The reserve ratio obviously has absolutely no relation with the 5803 * The reserve ratio obviously has absolutely no relation with the
5794 * minimum watermarks. The lowmem reserve ratio can only make sense 5804 * minimum watermarks. The lowmem reserve ratio can only make sense
5795 * as a function of the boot time zone sizes. 5805
5796 */ 5806 */
5797 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5807 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5798 void __user *buffer, size_t *length, loff_t *ppos) 5808 void __user *buffer, size_t *length, loff_t *ppos)
5799 { 5809 {
5800 proc_dointvec_minmax(table, write, buffer, length, ppos); 5810 proc_dointvec_minmax(table, write, buffer, length, ppos);
5801 setup_per_zone_lowmem_reserve(); 5811 setup_per_zone_lowmem_reserve();
5802 return 0; 5812 return 0;
5803 } 5813 }
5804 5814
5805 /* 5815 /*
5806 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5816 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5807 * cpu. It is the fraction of total pages in each zone that a hot per cpu 5817 * cpu. It is the fraction of total pages in each zone that a hot per cpu
5808 * pagelist can have before it gets flushed back to buddy allocator. 5818 * pagelist can have before it gets flushed back to buddy allocator.
5809 */ 5819 */
5810 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5820 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5811 void __user *buffer, size_t *length, loff_t *ppos) 5821 void __user *buffer, size_t *length, loff_t *ppos)
5812 { 5822 {
5813 struct zone *zone; 5823 struct zone *zone;
5814 int old_percpu_pagelist_fraction; 5824 int old_percpu_pagelist_fraction;
5815 int ret; 5825 int ret;
5816 5826
5817 mutex_lock(&pcp_batch_high_lock); 5827 mutex_lock(&pcp_batch_high_lock);
5818 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 5828 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5819 5829
5820 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5830 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5821 if (!write || ret < 0) 5831 if (!write || ret < 0)
5822 goto out; 5832 goto out;
5823 5833
5824 /* Sanity checking to avoid pcp imbalance */ 5834 /* Sanity checking to avoid pcp imbalance */
5825 if (percpu_pagelist_fraction && 5835 if (percpu_pagelist_fraction &&
5826 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 5836 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5827 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 5837 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5828 ret = -EINVAL; 5838 ret = -EINVAL;
5829 goto out; 5839 goto out;
5830 } 5840 }
5831 5841
5832 /* No change? */ 5842 /* No change? */
5833 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 5843 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5834 goto out; 5844 goto out;
5835 5845
5836 for_each_populated_zone(zone) { 5846 for_each_populated_zone(zone) {
5837 unsigned int cpu; 5847 unsigned int cpu;
5838 5848
5839 for_each_possible_cpu(cpu) 5849 for_each_possible_cpu(cpu)
5840 pageset_set_high_and_batch(zone, 5850 pageset_set_high_and_batch(zone,
5841 per_cpu_ptr(zone->pageset, cpu)); 5851 per_cpu_ptr(zone->pageset, cpu));
5842 } 5852 }
5843 out: 5853 out:
5844 mutex_unlock(&pcp_batch_high_lock); 5854 mutex_unlock(&pcp_batch_high_lock);
5845 return ret; 5855 return ret;
5846 } 5856 }
5847 5857
5848 int hashdist = HASHDIST_DEFAULT; 5858 int hashdist = HASHDIST_DEFAULT;
5849 5859
5850 #ifdef CONFIG_NUMA 5860 #ifdef CONFIG_NUMA
5851 static int __init set_hashdist(char *str) 5861 static int __init set_hashdist(char *str)
5852 { 5862 {
5853 if (!str) 5863 if (!str)
5854 return 0; 5864 return 0;
5855 hashdist = simple_strtoul(str, &str, 0); 5865 hashdist = simple_strtoul(str, &str, 0);
5856 return 1; 5866 return 1;
5857 } 5867 }
5858 __setup("hashdist=", set_hashdist); 5868 __setup("hashdist=", set_hashdist);
5859 #endif 5869 #endif
5860 5870
5861 /* 5871 /*
5862 * allocate a large system hash table from bootmem 5872 * allocate a large system hash table from bootmem
5863 * - it is assumed that the hash table must contain an exact power-of-2 5873 * - it is assumed that the hash table must contain an exact power-of-2
5864 * quantity of entries 5874 * quantity of entries
5865 * - limit is the number of hash buckets, not the total allocation size 5875 * - limit is the number of hash buckets, not the total allocation size
5866 */ 5876 */
5867 void *__init alloc_large_system_hash(const char *tablename, 5877 void *__init alloc_large_system_hash(const char *tablename,
5868 unsigned long bucketsize, 5878 unsigned long bucketsize,
5869 unsigned long numentries, 5879 unsigned long numentries,
5870 int scale, 5880 int scale,
5871 int flags, 5881 int flags,
5872 unsigned int *_hash_shift, 5882 unsigned int *_hash_shift,
5873 unsigned int *_hash_mask, 5883 unsigned int *_hash_mask,
5874 unsigned long low_limit, 5884 unsigned long low_limit,
5875 unsigned long high_limit) 5885 unsigned long high_limit)
5876 { 5886 {
5877 unsigned long long max = high_limit; 5887 unsigned long long max = high_limit;
5878 unsigned long log2qty, size; 5888 unsigned long log2qty, size;
5879 void *table = NULL; 5889 void *table = NULL;
5880 5890
5881 /* allow the kernel cmdline to have a say */ 5891 /* allow the kernel cmdline to have a say */
5882 if (!numentries) { 5892 if (!numentries) {
5883 /* round applicable memory size up to nearest megabyte */ 5893 /* round applicable memory size up to nearest megabyte */
5884 numentries = nr_kernel_pages; 5894 numentries = nr_kernel_pages;
5885 5895
5886 /* It isn't necessary when PAGE_SIZE >= 1MB */ 5896 /* It isn't necessary when PAGE_SIZE >= 1MB */
5887 if (PAGE_SHIFT < 20) 5897 if (PAGE_SHIFT < 20)
5888 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5898 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5889 5899
5890 /* limit to 1 bucket per 2^scale bytes of low memory */ 5900 /* limit to 1 bucket per 2^scale bytes of low memory */
5891 if (scale > PAGE_SHIFT) 5901 if (scale > PAGE_SHIFT)
5892 numentries >>= (scale - PAGE_SHIFT); 5902 numentries >>= (scale - PAGE_SHIFT);
5893 else 5903 else
5894 numentries <<= (PAGE_SHIFT - scale); 5904 numentries <<= (PAGE_SHIFT - scale);
5895 5905
5896 /* Make sure we've got at least a 0-order allocation.. */ 5906 /* Make sure we've got at least a 0-order allocation.. */
5897 if (unlikely(flags & HASH_SMALL)) { 5907 if (unlikely(flags & HASH_SMALL)) {
5898 /* Makes no sense without HASH_EARLY */ 5908 /* Makes no sense without HASH_EARLY */
5899 WARN_ON(!(flags & HASH_EARLY)); 5909 WARN_ON(!(flags & HASH_EARLY));
5900 if (!(numentries >> *_hash_shift)) { 5910 if (!(numentries >> *_hash_shift)) {
5901 numentries = 1UL << *_hash_shift; 5911 numentries = 1UL << *_hash_shift;
5902 BUG_ON(!numentries); 5912 BUG_ON(!numentries);
5903 } 5913 }
5904 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5914 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5905 numentries = PAGE_SIZE / bucketsize; 5915 numentries = PAGE_SIZE / bucketsize;
5906 } 5916 }
5907 numentries = roundup_pow_of_two(numentries); 5917 numentries = roundup_pow_of_two(numentries);
5908 5918
5909 /* limit allocation size to 1/16 total memory by default */ 5919 /* limit allocation size to 1/16 total memory by default */
5910 if (max == 0) { 5920 if (max == 0) {
5911 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5921 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5912 do_div(max, bucketsize); 5922 do_div(max, bucketsize);
5913 } 5923 }
5914 max = min(max, 0x80000000ULL); 5924 max = min(max, 0x80000000ULL);
5915 5925
5916 if (numentries < low_limit) 5926 if (numentries < low_limit)
5917 numentries = low_limit; 5927 numentries = low_limit;
5918 if (numentries > max) 5928 if (numentries > max)
5919 numentries = max; 5929 numentries = max;
5920 5930
5921 log2qty = ilog2(numentries); 5931 log2qty = ilog2(numentries);
5922 5932
5923 do { 5933 do {
5924 size = bucketsize << log2qty; 5934 size = bucketsize << log2qty;
5925 if (flags & HASH_EARLY) 5935 if (flags & HASH_EARLY)
5926 table = alloc_bootmem_nopanic(size); 5936 table = alloc_bootmem_nopanic(size);
5927 else if (hashdist) 5937 else if (hashdist)
5928 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5938 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5929 else { 5939 else {
5930 /* 5940 /*
5931 * If bucketsize is not a power-of-two, we may free 5941 * If bucketsize is not a power-of-two, we may free
5932 * some pages at the end of hash table which 5942 * some pages at the end of hash table which
5933 * alloc_pages_exact() automatically does 5943 * alloc_pages_exact() automatically does
5934 */ 5944 */
5935 if (get_order(size) < MAX_ORDER) { 5945 if (get_order(size) < MAX_ORDER) {
5936 table = alloc_pages_exact(size, GFP_ATOMIC); 5946 table = alloc_pages_exact(size, GFP_ATOMIC);
5937 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5947 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5938 } 5948 }
5939 } 5949 }
5940 } while (!table && size > PAGE_SIZE && --log2qty); 5950 } while (!table && size > PAGE_SIZE && --log2qty);
5941 5951
5942 if (!table) 5952 if (!table)
5943 panic("Failed to allocate %s hash table\n", tablename); 5953 panic("Failed to allocate %s hash table\n", tablename);
5944 5954
5945 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5955 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5946 tablename, 5956 tablename,
5947 (1UL << log2qty), 5957 (1UL << log2qty),
5948 ilog2(size) - PAGE_SHIFT, 5958 ilog2(size) - PAGE_SHIFT,
5949 size); 5959 size);
5950 5960
5951 if (_hash_shift) 5961 if (_hash_shift)
5952 *_hash_shift = log2qty; 5962 *_hash_shift = log2qty;
5953 if (_hash_mask) 5963 if (_hash_mask)
5954 *_hash_mask = (1 << log2qty) - 1; 5964 *_hash_mask = (1 << log2qty) - 1;
5955 5965
5956 return table; 5966 return table;
5957 } 5967 }
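The core of alloc_large_system_hash() is its sizing arithmetic: derive a bucket count from the number of kernel pages and the scale shift, round it up to a power of two, and cap the table at 1/16 of memory (the shrink-on-failure loop is skipped here). The sketch below runs those steps for a hypothetical 4GB machine with 4K pages, a 64-byte bucket and scale = 14; all of these values are invented for illustration:

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long x)
{
    unsigned long r = 1;

    while (r < x)
        r <<= 1;
    return r;
}

int main(void)
{
    /* Assumed machine: 4GB of RAM, 4K pages. */
    int page_shift = 12;
    unsigned long nr_kernel_pages = 1UL << 20;        /* 4GB / 4K          */
    unsigned long bucketsize = 64;                    /* hypothetical      */
    int scale = 14;                                   /* 1 bucket per 16K  */

    /* One bucket per 2^scale bytes of low memory. */
    unsigned long numentries = nr_kernel_pages;
    if (scale > page_shift)
        numentries >>= (scale - page_shift);
    else
        numentries <<= (page_shift - scale);

    numentries = roundup_pow_of_two(numentries);

    /* Cap the table at 1/16 of total memory. */
    unsigned long long max = ((unsigned long long)nr_kernel_pages << page_shift) >> 4;
    max /= bucketsize;
    if (numentries > max)
        numentries = max;

    printf("entries = %lu (%lu KiB of buckets)\n",
           numentries, numentries * bucketsize >> 10);
    return 0;
}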
5958 5968
5959 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5969 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5960 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5970 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5961 unsigned long pfn) 5971 unsigned long pfn)
5962 { 5972 {
5963 #ifdef CONFIG_SPARSEMEM 5973 #ifdef CONFIG_SPARSEMEM
5964 return __pfn_to_section(pfn)->pageblock_flags; 5974 return __pfn_to_section(pfn)->pageblock_flags;
5965 #else 5975 #else
5966 return zone->pageblock_flags; 5976 return zone->pageblock_flags;
5967 #endif /* CONFIG_SPARSEMEM */ 5977 #endif /* CONFIG_SPARSEMEM */
5968 } 5978 }
5969 5979
5970 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5980 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5971 { 5981 {
5972 #ifdef CONFIG_SPARSEMEM 5982 #ifdef CONFIG_SPARSEMEM
5973 pfn &= (PAGES_PER_SECTION-1); 5983 pfn &= (PAGES_PER_SECTION-1);
5974 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5984 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5975 #else 5985 #else
5976 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 5986 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5977 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5987 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5978 #endif /* CONFIG_SPARSEMEM */ 5988 #endif /* CONFIG_SPARSEMEM */
5979 } 5989 }
5980 5990
5981 /** 5991 /**
5982 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 5992 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
5983 * @page: The page within the block of interest 5993 * @page: The page within the block of interest
5984 * @start_bitidx: The first bit of interest to retrieve 5994 * @start_bitidx: The first bit of interest to retrieve
5985 * @end_bitidx: The last bit of interest 5995 * @end_bitidx: The last bit of interest
5986 * returns pageblock_bits flags 5996 * returns pageblock_bits flags
5987 */ 5997 */
5988 unsigned long get_pageblock_flags_mask(struct page *page, 5998 unsigned long get_pageblock_flags_mask(struct page *page,
5989 unsigned long end_bitidx, 5999 unsigned long end_bitidx,
5990 unsigned long mask) 6000 unsigned long mask)
5991 { 6001 {
5992 struct zone *zone; 6002 struct zone *zone;
5993 unsigned long *bitmap; 6003 unsigned long *bitmap;
5994 unsigned long pfn, bitidx, word_bitidx; 6004 unsigned long pfn, bitidx, word_bitidx;
5995 unsigned long word; 6005 unsigned long word;
5996 6006
5997 zone = page_zone(page); 6007 zone = page_zone(page);
5998 pfn = page_to_pfn(page); 6008 pfn = page_to_pfn(page);
5999 bitmap = get_pageblock_bitmap(zone, pfn); 6009 bitmap = get_pageblock_bitmap(zone, pfn);
6000 bitidx = pfn_to_bitidx(zone, pfn); 6010 bitidx = pfn_to_bitidx(zone, pfn);
6001 word_bitidx = bitidx / BITS_PER_LONG; 6011 word_bitidx = bitidx / BITS_PER_LONG;
6002 bitidx &= (BITS_PER_LONG-1); 6012 bitidx &= (BITS_PER_LONG-1);
6003 6013
6004 word = bitmap[word_bitidx]; 6014 word = bitmap[word_bitidx];
6005 bitidx += end_bitidx; 6015 bitidx += end_bitidx;
6006 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 6016 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6007 } 6017 }
6008 6018
6009 /** 6019 /**
6010 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 6020 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6011 * @page: The page within the block of interest 6021 * @page: The page within the block of interest
6012 * @start_bitidx: The first bit of interest 6022 * @start_bitidx: The first bit of interest
6013 * @end_bitidx: The last bit of interest 6023 * @end_bitidx: The last bit of interest
6014 * @flags: The flags to set 6024 * @flags: The flags to set
6015 */ 6025 */
6016 void set_pageblock_flags_mask(struct page *page, unsigned long flags, 6026 void set_pageblock_flags_mask(struct page *page, unsigned long flags,
6017 unsigned long end_bitidx, 6027 unsigned long end_bitidx,
6018 unsigned long mask) 6028 unsigned long mask)
6019 { 6029 {
6020 struct zone *zone; 6030 struct zone *zone;
6021 unsigned long *bitmap; 6031 unsigned long *bitmap;
6022 unsigned long pfn, bitidx, word_bitidx; 6032 unsigned long pfn, bitidx, word_bitidx;
6023 unsigned long old_word, word; 6033 unsigned long old_word, word;
6024 6034
6025 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6035 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6026 6036
6027 zone = page_zone(page); 6037 zone = page_zone(page);
6028 pfn = page_to_pfn(page); 6038 pfn = page_to_pfn(page);
6029 bitmap = get_pageblock_bitmap(zone, pfn); 6039 bitmap = get_pageblock_bitmap(zone, pfn);
6030 bitidx = pfn_to_bitidx(zone, pfn); 6040 bitidx = pfn_to_bitidx(zone, pfn);
6031 word_bitidx = bitidx / BITS_PER_LONG; 6041 word_bitidx = bitidx / BITS_PER_LONG;
6032 bitidx &= (BITS_PER_LONG-1); 6042 bitidx &= (BITS_PER_LONG-1);
6033 6043
6034 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6044 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
6035 6045
6036 bitidx += end_bitidx; 6046 bitidx += end_bitidx;
6037 mask <<= (BITS_PER_LONG - bitidx - 1); 6047 mask <<= (BITS_PER_LONG - bitidx - 1);
6038 flags <<= (BITS_PER_LONG - bitidx - 1); 6048 flags <<= (BITS_PER_LONG - bitidx - 1);
6039 6049
6040 word = ACCESS_ONCE(bitmap[word_bitidx]); 6050 word = ACCESS_ONCE(bitmap[word_bitidx]);
6041 for (;;) { 6051 for (;;) {
6042 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6052 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6043 if (word == old_word) 6053 if (word == old_word)
6044 break; 6054 break;
6045 word = old_word; 6055 word = old_word;
6046 } 6056 }
6047 } 6057 }
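Both pageblock flag helpers above boil down to shifting a small field in and out of one unsigned long, with the field addressed from the most significant end; the set side additionally retries with cmpxchg(). A non-atomic userspace sketch of the same shift-and-mask arithmetic, with invented bit positions:

#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

/* Extract a small flags field at offset 'bitidx' counted from the most
 * significant end; end_bitidx and mask describe the field, e.g.
 * end_bitidx = 2, mask = 0x7 for a 3-bit field. */
static unsigned long get_flags(unsigned long word, unsigned long bitidx,
                               unsigned long end_bitidx, unsigned long mask)
{
    bitidx += end_bitidx;
    return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}

static unsigned long set_flags(unsigned long word, unsigned long flags,
                               unsigned long bitidx,
                               unsigned long end_bitidx, unsigned long mask)
{
    bitidx += end_bitidx;
    mask <<= (BITS_PER_LONG - bitidx - 1);
    flags <<= (BITS_PER_LONG - bitidx - 1);
    /* The kernel does this update with a cmpxchg() loop; a plain
     * read-modify-write is enough for this single-threaded sketch. */
    return (word & ~mask) | flags;
}

int main(void)
{
    unsigned long word = 0;

    /* Pretend one pageblock's 3-bit field starts 12 bits into the word. */
    unsigned long bitidx = 12, end_bitidx = 2, mask = 0x7;

    word = set_flags(word, 5, bitidx, end_bitidx, mask);   /* store 5 */
    printf("stored %lu, read back %lu\n", 5UL,
           get_flags(word, bitidx, end_bitidx, mask));     /* prints 5 */
    return 0;
}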
6048 6058
6049 /* 6059 /*
6050 * This function checks whether pageblock includes unmovable pages or not. 6060 * This function checks whether pageblock includes unmovable pages or not.
6051 * If @count is not zero, it is okay to include fewer than @count unmovable pages 6061
6052 * 6062 *
6053 * PageLRU check without isolation or lru_lock could race so that 6063 * PageLRU check without isolation or lru_lock could race so that
6054 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 6064 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
6055 * expect this function to be exact. 6065
6056 */ 6066 */
6057 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 6067 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6058 bool skip_hwpoisoned_pages) 6068 bool skip_hwpoisoned_pages)
6059 { 6069 {
6060 unsigned long pfn, iter, found; 6070 unsigned long pfn, iter, found;
6061 int mt; 6071 int mt;
6062 6072
6063 /* 6073 /*
6064 * To avoid noisy data, lru_add_drain_all() should be called 6074
6065 * If ZONE_MOVABLE, the zone never contains unmovable pages 6075 * If ZONE_MOVABLE, the zone never contains unmovable pages
6066 */ 6076 */
6067 if (zone_idx(zone) == ZONE_MOVABLE) 6077 if (zone_idx(zone) == ZONE_MOVABLE)
6068 return false; 6078 return false;
6069 mt = get_pageblock_migratetype(page); 6079 mt = get_pageblock_migratetype(page);
6070 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 6080 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
6071 return false; 6081 return false;
6072 6082
6073 pfn = page_to_pfn(page); 6083 pfn = page_to_pfn(page);
6074 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 6084 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6075 unsigned long check = pfn + iter; 6085 unsigned long check = pfn + iter;
6076 6086
6077 if (!pfn_valid_within(check)) 6087 if (!pfn_valid_within(check))
6078 continue; 6088 continue;
6079 6089
6080 page = pfn_to_page(check); 6090 page = pfn_to_page(check);
6081 6091
6082 /* 6092 /*
6083 * Hugepages are not in LRU lists, but they're movable. 6093 * Hugepages are not in LRU lists, but they're movable.
6084 * We need not scan over tail pages because we don't 6094
6085 * handle each tail page individually in migration. 6095 * handle each tail page individually in migration.
6086 */ 6096 */
6087 if (PageHuge(page)) { 6097 if (PageHuge(page)) {
6088 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6098 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6089 continue; 6099 continue;
6090 } 6100 }
6091 6101
6092 /* 6102 /*
6093 * We can't use page_count without pinning the page 6103
6094 * because another CPU can free compound page. 6104 * because another CPU can free compound page.
6095 * This check already skips compound tails of THP 6105 * This check already skips compound tails of THP
6096 * because their page->_count is zero at all time. 6106 * because their page->_count is zero at all time.
6097 */ 6107 */
6098 if (!atomic_read(&page->_count)) { 6108 if (!atomic_read(&page->_count)) {
6099 if (PageBuddy(page)) 6109 if (PageBuddy(page))
6100 iter += (1 << page_order(page)) - 1; 6110 iter += (1 << page_order(page)) - 1;
6101 continue; 6111 continue;
6102 } 6112 }
6103 6113
6104 /* 6114 /*
6105 * The HWPoisoned page may be not in buddy system, and 6115 * The HWPoisoned page may be not in buddy system, and
6106 * page_count() is not 0. 6116 * page_count() is not 0.
6107 */ 6117 */
6108 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6118 if (skip_hwpoisoned_pages && PageHWPoison(page))
6109 continue; 6119 continue;
6110 6120
6111 if (!PageLRU(page)) 6121 if (!PageLRU(page))
6112 found++; 6122 found++;
6113 /* 6123 /*
6114 * If there are RECLAIMABLE pages, we need to check it. 6124 * If there are RECLAIMABLE pages, we need to check it.
6115 * But now, memory offline itself doesn't call shrink_slab() 6125 * But now, memory offline itself doesn't call shrink_slab()
6116 * and this still needs to be fixed. 6126
6117 */ 6127 */
6118 /* 6128 /*
6119 * If the page is not RAM, page_count() should be 0. 6129
6120 * We need no further checks. This is a _used_, non-movable page. 6130
6121 * 6131 *
6122 * The problematic thing here is PG_reserved pages. PG_reserved 6132 * The problematic thing here is PG_reserved pages. PG_reserved
6123 * is set to both of a memory hole page and a _used_ kernel 6133 * is set to both of a memory hole page and a _used_ kernel
6124 * page at boot. 6134 * page at boot.
6125 */ 6135 */
6126 if (found > count) 6136 if (found > count)
6127 return true; 6137 return true;
6128 } 6138 }
6129 return false; 6139 return false;
6130 } 6140 }
6131 6141
6132 bool is_pageblock_removable_nolock(struct page *page) 6142 bool is_pageblock_removable_nolock(struct page *page)
6133 { 6143 {
6134 struct zone *zone; 6144 struct zone *zone;
6135 unsigned long pfn; 6145 unsigned long pfn;
6136 6146
6137 /* 6147 /*
6138 * We have to be careful here because we are iterating over memory 6148 * We have to be careful here because we are iterating over memory
6139 * sections which are not zone aware so we might end up outside of 6149 * sections which are not zone aware so we might end up outside of
6140 * the zone but still within the section. 6150 * the zone but still within the section.
6141 * We have to take care about the node as well. If the node is offline 6151 * We have to take care about the node as well. If the node is offline
6142 * its NODE_DATA will be NULL - see page_zone. 6152 * its NODE_DATA will be NULL - see page_zone.
6143 */ 6153 */
6144 if (!node_online(page_to_nid(page))) 6154 if (!node_online(page_to_nid(page)))
6145 return false; 6155 return false;
6146 6156
6147 zone = page_zone(page); 6157 zone = page_zone(page);
6148 pfn = page_to_pfn(page); 6158 pfn = page_to_pfn(page);
6149 if (!zone_spans_pfn(zone, pfn)) 6159 if (!zone_spans_pfn(zone, pfn))
6150 return false; 6160 return false;
6151 6161
6152 return !has_unmovable_pages(zone, page, 0, true); 6162 return !has_unmovable_pages(zone, page, 0, true);
6153 } 6163 }
6154 6164
6155 #ifdef CONFIG_CMA 6165 #ifdef CONFIG_CMA
6156 6166
6157 static unsigned long pfn_max_align_down(unsigned long pfn) 6167 static unsigned long pfn_max_align_down(unsigned long pfn)
6158 { 6168 {
6159 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6169 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6160 pageblock_nr_pages) - 1); 6170 pageblock_nr_pages) - 1);
6161 } 6171 }
6162 6172
6163 static unsigned long pfn_max_align_up(unsigned long pfn) 6173 static unsigned long pfn_max_align_up(unsigned long pfn)
6164 { 6174 {
6165 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6175 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6166 pageblock_nr_pages)); 6176 pageblock_nr_pages));
6167 } 6177 }
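These helpers widen a PFN range to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages, i.e. a plain round-down/round-up to a power-of-two boundary. A sketch with assumed values (1024-page MAX_ORDER blocks, 512-page pageblocks) and an arbitrary PFN range:

#include <stdio.h>

int main(void)
{
    /* Assumptions: MAX_ORDER_NR_PAGES = 1024, pageblock_nr_pages = 512. */
    unsigned long max_order_nr_pages = 1024;
    unsigned long pageblock_nr_pages = 512;
    unsigned long align = max_order_nr_pages > pageblock_nr_pages ?
                          max_order_nr_pages : pageblock_nr_pages;

    unsigned long start = 100000, end = 100700;               /* arbitrary PFNs */
    unsigned long down = start & ~(align - 1);                /* round down     */
    unsigned long up   = (end + align - 1) & ~(align - 1);    /* round up       */

    printf("isolating [%lu, %lu) to cover [%lu, %lu)\n", down, up, start, end);
    return 0;
}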
6168 6178
6169 /* [start, end) must belong to a single zone. */ 6179 /* [start, end) must belong to a single zone. */
6170 static int __alloc_contig_migrate_range(struct compact_control *cc, 6180 static int __alloc_contig_migrate_range(struct compact_control *cc,
6171 unsigned long start, unsigned long end) 6181 unsigned long start, unsigned long end)
6172 { 6182 {
6173 /* This function is based on compact_zone() from compaction.c. */ 6183 /* This function is based on compact_zone() from compaction.c. */
6174 unsigned long nr_reclaimed; 6184 unsigned long nr_reclaimed;
6175 unsigned long pfn = start; 6185 unsigned long pfn = start;
6176 unsigned int tries = 0; 6186 unsigned int tries = 0;
6177 int ret = 0; 6187 int ret = 0;
6178 6188
6179 migrate_prep(); 6189 migrate_prep();
6180 6190
6181 while (pfn < end || !list_empty(&cc->migratepages)) { 6191 while (pfn < end || !list_empty(&cc->migratepages)) {
6182 if (fatal_signal_pending(current)) { 6192 if (fatal_signal_pending(current)) {
6183 ret = -EINTR; 6193 ret = -EINTR;
6184 break; 6194 break;
6185 } 6195 }
6186 6196
6187 if (list_empty(&cc->migratepages)) { 6197 if (list_empty(&cc->migratepages)) {
6188 cc->nr_migratepages = 0; 6198 cc->nr_migratepages = 0;
6189 pfn = isolate_migratepages_range(cc->zone, cc, 6199 pfn = isolate_migratepages_range(cc->zone, cc,
6190 pfn, end, true); 6200 pfn, end, true);
6191 if (!pfn) { 6201 if (!pfn) {
6192 ret = -EINTR; 6202 ret = -EINTR;
6193 break; 6203 break;
6194 } 6204 }
6195 tries = 0; 6205 tries = 0;
6196 } else if (++tries == 5) { 6206 } else if (++tries == 5) {
6197 ret = ret < 0 ? ret : -EBUSY; 6207 ret = ret < 0 ? ret : -EBUSY;
6198 break; 6208 break;
6199 } 6209 }
6200 6210
6201 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6211 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6202 &cc->migratepages); 6212 &cc->migratepages);
6203 cc->nr_migratepages -= nr_reclaimed; 6213 cc->nr_migratepages -= nr_reclaimed;
6204 6214
6205 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6215 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6206 NULL, 0, cc->mode, MR_CMA); 6216 NULL, 0, cc->mode, MR_CMA);
6207 } 6217 }
6208 if (ret < 0) { 6218 if (ret < 0) {
6209 putback_movable_pages(&cc->migratepages); 6219 putback_movable_pages(&cc->migratepages);
6210 return ret; 6220 return ret;
6211 } 6221 }
6212 return 0; 6222 return 0;
6213 } 6223 }
6214 6224
6215 /** 6225 /**
6216 * alloc_contig_range() -- tries to allocate given range of pages 6226 * alloc_contig_range() -- tries to allocate given range of pages
6217 * @start: start PFN to allocate 6227 * @start: start PFN to allocate
6218 * @end: one-past-the-last PFN to allocate 6228 * @end: one-past-the-last PFN to allocate
6219 * @migratetype: migratetype of the underlying pageblocks (either 6229
6220 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6230 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6221 * in range must have the same migratetype and it must 6231 * in range must have the same migratetype and it must
6222 * be either of the two. 6232 * be either of the two.
6223 * 6233 *
6224 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6234 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6225 * aligned, however it's the caller's responsibility to guarantee that 6235 * aligned, however it's the caller's responsibility to guarantee that
6226 * we are the only thread that changes migrate type of pageblocks the 6236 * we are the only thread that changes migrate type of pageblocks the
6227 * pages fall in. 6237 * pages fall in.
6228 * 6238 *
6229 * The PFN range must belong to a single zone. 6239 * The PFN range must belong to a single zone.
6230 * 6240 *
6231 * Returns zero on success or negative error code. On success all 6241 * Returns zero on success or negative error code. On success all
6232 * pages which PFN is in [start, end) are allocated for the caller and 6242 * pages which PFN is in [start, end) are allocated for the caller and
6233 * need to be freed with free_contig_range(). 6243 * need to be freed with free_contig_range().
6234 */ 6244 */
6235 int alloc_contig_range(unsigned long start, unsigned long end, 6245 int alloc_contig_range(unsigned long start, unsigned long end,
6236 unsigned migratetype) 6246 unsigned migratetype)
6237 { 6247 {
6238 unsigned long outer_start, outer_end; 6248 unsigned long outer_start, outer_end;
6239 int ret = 0, order; 6249 int ret = 0, order;
6240 6250
6241 struct compact_control cc = { 6251 struct compact_control cc = {
6242 .nr_migratepages = 0, 6252 .nr_migratepages = 0,
6243 .order = -1, 6253 .order = -1,
6244 .zone = page_zone(pfn_to_page(start)), 6254 .zone = page_zone(pfn_to_page(start)),
6245 .mode = MIGRATE_SYNC, 6255 .mode = MIGRATE_SYNC,
6246 .ignore_skip_hint = true, 6256 .ignore_skip_hint = true,
6247 }; 6257 };
6248 INIT_LIST_HEAD(&cc.migratepages); 6258 INIT_LIST_HEAD(&cc.migratepages);
6249 6259
	/*
	 * What we do here is mark all pageblocks in the range as
	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
	 * have different sizes, and due to the way the page allocator
	 * works, we align the range to the bigger of the two so that
	 * the page allocator won't try to merge buddies from
	 * different pageblocks and change MIGRATE_ISOLATE to some
	 * other migration type.
	 *
	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
	 * migrate the pages from the unaligned range (ie. the pages
	 * we are interested in).  This puts all the pages in the
	 * range back into the page allocator as MIGRATE_ISOLATE.
	 *
	 * When this is done, we take the pages in the range from the
	 * page allocator, removing them from the buddy system.  This
	 * way the page allocator will never consider using them.
	 *
	 * This lets us mark the pageblocks back as
	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
	 * aligned range but not in the unaligned, original range are
	 * put back into the page allocator so that buddy can use them.
	 */

	ret = start_isolate_page_range(pfn_max_align_down(start),
				       pfn_max_align_up(end), migratetype,
				       false);
	if (ret)
		return ret;

	ret = __alloc_contig_migrate_range(&cc, start, end);
	if (ret)
		goto done;

	/*
	 * Pages from [start, end) are within MAX_ORDER_NR_PAGES
	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
	 * more, all pages in [start, end) are free in the page allocator.
	 * What we are going to do is to allocate all pages from
	 * [start, end) (that is, remove them from the page allocator).
	 *
	 * The only problem is that pages at the beginning and at the
	 * end of the interesting range may not be aligned with pages
	 * that the page allocator holds, ie. they can be part of higher
	 * order pages.  Because of this, we reserve the bigger range and
	 * once this is done free the pages we are not interested in.
	 *
	 * We don't have to hold zone->lock here because the pages are
	 * isolated thus they won't get removed from buddy.
	 */

	lru_add_drain_all();
	drain_all_pages();

	order = 0;
	outer_start = start;
	while (!PageBuddy(pfn_to_page(outer_start))) {
		if (++order >= MAX_ORDER) {
			ret = -EBUSY;
			goto done;
		}
		outer_start &= ~0UL << order;
	}

	/* Make sure the range is really isolated. */
	if (test_pages_isolated(outer_start, end, false)) {
		pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
			outer_start, end);
		ret = -EBUSY;
		goto done;
	}

	/* Grab isolated pages from freelists. */
	outer_end = isolate_freepages_range(&cc, outer_start, end);
	if (!outer_end) {
		ret = -EBUSY;
		goto done;
	}

	/* Free head and tail (if any) */
	if (start != outer_start)
		free_contig_range(outer_start, start - outer_start);
	if (end != outer_end)
		free_contig_range(end, outer_end - end);

done:
	undo_isolate_page_range(pfn_max_align_down(start),
				pfn_max_align_up(end), migratetype);
	return ret;
}
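
/*
 * Illustrative sketch (not part of this file or commit): how a CMA-style
 * caller might drive the interface documented above.  The PFN range, page
 * count and the helper name example_grab_contig() are hypothetical; real
 * users such as the CMA allocator layer their own bookkeeping on top.
 * Guarded by #if 0 because it is an example only.
 */
#if 0
#include <linux/gfp.h>
#include <linux/mmzone.h>

static int example_grab_contig(unsigned long base_pfn, unsigned long nr_pages)
{
	int ret;

	/* [base_pfn, base_pfn + nr_pages) must lie within a single zone. */
	ret = alloc_contig_range(base_pfn, base_pfn + nr_pages, MIGRATE_CMA);
	if (ret)
		return ret;	/* commonly -EBUSY when isolation or migration fails */

	/* ... use the pages, starting at pfn_to_page(base_pfn) ... */

	/* Everything handed out above must go back through free_contig_range(). */
	free_contig_range(base_pfn, nr_pages);
	return 0;
}
#endif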

void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
	unsigned int count = 0;

	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		count += page_count(page) != 1;
		__free_page(page);
	}
	WARN(count != 0, "%d pages are still in use!\n", count);
}
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * The zone indicated has a new number of managed_pages; batch sizes and percpu
 * page high values need to be recalculated.
 */
void __meminit zone_pcp_update(struct zone *zone)
{
	unsigned cpu;
	mutex_lock(&pcp_batch_high_lock);
	for_each_possible_cpu(cpu)
		pageset_set_high_and_batch(zone,
				per_cpu_ptr(zone->pageset, cpu));
	mutex_unlock(&pcp_batch_high_lock);
}
#endif

void zone_pcp_reset(struct zone *zone)
{
	unsigned long flags;
	int cpu;
	struct per_cpu_pageset *pset;

	/* avoid races with drain_pages() */
	local_irq_save(flags);
	if (zone->pageset != &boot_pageset) {
		for_each_online_cpu(cpu) {
			pset = per_cpu_ptr(zone->pageset, cpu);
			drain_zonestat(zone, pset);
		}
		free_percpu(zone->pageset);
		zone->pageset = &boot_pageset;
	}
	local_irq_restore(flags);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * All pages in the range must be isolated before calling this.
 */
void
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	struct page *page;
	struct zone *zone;
	int order, i;
	unsigned long pfn;
	unsigned long flags;
	/* find the first valid pfn */
	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		if (pfn_valid(pfn))
			break;
	if (pfn == end_pfn)
		return;
	zone = page_zone(pfn_to_page(pfn));
	spin_lock_irqsave(&zone->lock, flags);
	pfn = start_pfn;
	while (pfn < end_pfn) {
		if (!pfn_valid(pfn)) {
			pfn++;
			continue;
		}
		page = pfn_to_page(pfn);
		/*
		 * A HWPoisoned page may not be in the buddy system,
		 * and its page_count() is not 0.
		 */
		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
			pfn++;
			SetPageReserved(page);
			continue;
		}

		BUG_ON(page_count(page));
		BUG_ON(!PageBuddy(page));
		order = page_order(page);
#ifdef CONFIG_DEBUG_VM
		printk(KERN_INFO "remove from free list %lx %d %lx\n",
		       pfn, 1 << order, end_pfn);
#endif
		list_del(&page->lru);
		rmv_page_order(page);
		zone->free_area[order].nr_free--;
		for (i = 0; i < (1 << order); i++)
			SetPageReserved((page+i));
		pfn += (1 << order);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
#endif

#ifdef CONFIG_MEMORY_FAILURE
bool is_free_buddy_page(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long flags;
	int order;

	spin_lock_irqsave(&zone->lock, flags);
	for (order = 0; order < MAX_ORDER; order++) {
		struct page *page_head = page - (pfn & ((1 << order) - 1));

		if (PageBuddy(page_head) && page_order(page_head) >= order)
			break;
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	return order < MAX_ORDER;
}
#endif
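
/*
 * Standalone illustration (editor's sketch, not kernel code) of the
 * head-page arithmetic used in is_free_buddy_page() above: subtracting
 * pfn's offset within a 2^order-aligned block yields the first pfn of
 * that block.  The pfn value below is made up.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long pfn = 0x12345;	/* hypothetical page frame number */
	int order;

	for (order = 0; order < 4; order++) {
		unsigned long head_pfn = pfn - (pfn & ((1UL << order) - 1));

		printf("order %d: block starts at pfn %#lx\n", order, head_pfn);
	}
	/* prints 0x12345, 0x12344, 0x12344, 0x12340 */
	return 0;
}
#endif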

static const struct trace_print_flags pageflag_names[] = {
	{1UL << PG_locked, "locked" },
	{1UL << PG_error, "error" },
	{1UL << PG_referenced, "referenced" },
	{1UL << PG_uptodate, "uptodate" },
	{1UL << PG_dirty, "dirty" },
	{1UL << PG_lru, "lru" },
	{1UL << PG_active, "active" },
	{1UL << PG_slab, "slab" },
	{1UL << PG_owner_priv_1, "owner_priv_1" },
	{1UL << PG_arch_1, "arch_1" },
	{1UL << PG_reserved, "reserved" },
	{1UL << PG_private, "private" },
	{1UL << PG_private_2, "private_2" },
	{1UL << PG_writeback, "writeback" },
#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{1UL << PG_head, "head" },
	{1UL << PG_tail, "tail" },
#else
	{1UL << PG_compound, "compound" },
#endif
	{1UL << PG_swapcache, "swapcache" },
	{1UL << PG_mappedtodisk, "mappedtodisk" },
	{1UL << PG_reclaim, "reclaim" },
	{1UL << PG_swapbacked, "swapbacked" },
	{1UL << PG_unevictable, "unevictable" },
#ifdef CONFIG_MMU
	{1UL << PG_mlocked, "mlocked" },
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
	{1UL << PG_uncached, "uncached" },
#endif
#ifdef CONFIG_MEMORY_FAILURE
	{1UL << PG_hwpoison, "hwpoison" },
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	{1UL << PG_compound_lock, "compound_lock" },
#endif
};

static void dump_page_flags(unsigned long flags)
{
	const char *delim = "";
	unsigned long mask;
	int i;

	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);

	printk(KERN_ALERT "page flags: %#lx(", flags);

	/* remove zone id */
	flags &= (1UL << NR_PAGEFLAGS) - 1;

	for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {

		mask = pageflag_names[i].mask;
		if ((flags & mask) != mask)
			continue;

		flags &= ~mask;
		printk("%s%s", delim, pageflag_names[i].name);
		delim = "|";
	}

	/* check for left over flags */
	if (flags)
		printk("%s%#lx", delim, flags);

	printk(")\n");
}
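
/*
 * Standalone illustration (editor's sketch, not kernel code) of the
 * table-driven decode idiom dump_page_flags() uses: match each known
 * mask, clear it, then print any leftover bits in hex.  The flag names
 * and the sample value below are invented.
 */
#if 0
#include <stdio.h>

struct flag_name {
	unsigned long mask;
	const char *name;
};

static const struct flag_name names[] = {
	{ 1UL << 0, "locked" },
	{ 1UL << 1, "dirty" },
	{ 1UL << 2, "lru" },
};

int main(void)
{
	unsigned long flags = (1UL << 0) | (1UL << 2) | (1UL << 7);
	const char *delim = "";
	unsigned int i;

	printf("flags: %#lx(", flags);
	for (i = 0; i < sizeof(names) / sizeof(names[0]) && flags; i++) {
		if ((flags & names[i].mask) != names[i].mask)
			continue;
		flags &= ~names[i].mask;
		printf("%s%s", delim, names[i].name);
		delim = "|";
	}
	if (flags)	/* bits with no name in the table */
		printf("%s%#lx", delim, flags);
	printf(")\n");	/* -> flags: 0x85(locked|lru|0x80) */
	return 0;
}
#endif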

void dump_page(struct page *page)
{
	printk(KERN_ALERT
	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
	       page, atomic_read(&page->_count), page_mapcount(page),
	       page->mapping, page->index);
	dump_page_flags(page->flags);
	mem_cgroup_print_bad_page(page);