Commit 35515de99f95c55f6c416fc7cc2b16832b9f58ee

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent d6217476e1

mm: page_alloc: take the ALLOC_NO_WATERMARK check out of the fast path

commit 5dab29113ca56335c78be3f98bf5ddf2ef8eb6a6 upstream.

ALLOC_NO_WATERMARK is set in a few cases.  Always by kswapd, always for
__GFP_MEMALLOC, sometimes for swap-over-nfs, tasks etc.  Each of these
cases are relatively rare events but the ALLOC_NO_WATERMARK check is an
unlikely branch in the fast path.  This patch moves the check out of the
fast path and after it has been determined that the watermarks have not
been met.  This helps the common fast path at the cost of making the slow
path slower and hitting kswapd with a performance cost.  It's a reasonable
tradeoff.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 1 changed file with 5 additions and 3 deletions Inline Diff

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 #define MIN_PERCPU_PAGELIST_FRACTION (8) 72 #define MIN_PERCPU_PAGELIST_FRACTION (8)
73 73
74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
75 DEFINE_PER_CPU(int, numa_node); 75 DEFINE_PER_CPU(int, numa_node);
76 EXPORT_PER_CPU_SYMBOL(numa_node); 76 EXPORT_PER_CPU_SYMBOL(numa_node);
77 #endif 77 #endif
78 78
79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
80 /* 80 /*
81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
84 * defined in <linux/topology.h>. 84 * defined in <linux/topology.h>.
85 */ 85 */
86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 87 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
88 #endif 88 #endif
89 89
90 /* 90 /*
91 * Array of node states. 91 * Array of node states.
92 */ 92 */
93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
94 [N_POSSIBLE] = NODE_MASK_ALL, 94 [N_POSSIBLE] = NODE_MASK_ALL,
95 [N_ONLINE] = { { [0] = 1UL } }, 95 [N_ONLINE] = { { [0] = 1UL } },
96 #ifndef CONFIG_NUMA 96 #ifndef CONFIG_NUMA
97 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 97 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
98 #ifdef CONFIG_HIGHMEM 98 #ifdef CONFIG_HIGHMEM
99 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 99 [N_HIGH_MEMORY] = { { [0] = 1UL } },
100 #endif 100 #endif
101 #ifdef CONFIG_MOVABLE_NODE 101 #ifdef CONFIG_MOVABLE_NODE
102 [N_MEMORY] = { { [0] = 1UL } }, 102 [N_MEMORY] = { { [0] = 1UL } },
103 #endif 103 #endif
104 [N_CPU] = { { [0] = 1UL } }, 104 [N_CPU] = { { [0] = 1UL } },
105 #endif /* NUMA */ 105 #endif /* NUMA */
106 }; 106 };
107 EXPORT_SYMBOL(node_states); 107 EXPORT_SYMBOL(node_states);
108 108
109 /* Protect totalram_pages and zone->managed_pages */ 109 /* Protect totalram_pages and zone->managed_pages */
110 static DEFINE_SPINLOCK(managed_page_count_lock); 110 static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112 unsigned long totalram_pages __read_mostly; 112 unsigned long totalram_pages __read_mostly;
113 unsigned long totalreserve_pages __read_mostly; 113 unsigned long totalreserve_pages __read_mostly;
114 /* 114 /*
115 * When calculating the number of globally allowed dirty pages, there 115 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 116 * is a certain number of per-zone reserves that should not be
117 * considered dirtyable memory. This is the sum of those reserves 117 * considered dirtyable memory. This is the sum of those reserves
118 * over all existing zones that contribute dirtyable memory. 118 * over all existing zones that contribute dirtyable memory.
119 */ 119 */
120 unsigned long dirty_balance_reserve __read_mostly; 120 unsigned long dirty_balance_reserve __read_mostly;
121 121
122 int percpu_pagelist_fraction; 122 int percpu_pagelist_fraction;
123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
124 124
125 #ifdef CONFIG_PM_SLEEP 125 #ifdef CONFIG_PM_SLEEP
126 /* 126 /*
127 * The following functions are used by the suspend/hibernate code to temporarily 127 * The following functions are used by the suspend/hibernate code to temporarily
128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
129 * while devices are suspended. To avoid races with the suspend/hibernate code, 129 * while devices are suspended. To avoid races with the suspend/hibernate code,
130 * they should always be called with pm_mutex held (gfp_allowed_mask also should 130 * they should always be called with pm_mutex held (gfp_allowed_mask also should
131 * only be modified with pm_mutex held, unless the suspend/hibernate code is 131 * only be modified with pm_mutex held, unless the suspend/hibernate code is
132 * guaranteed not to run in parallel with that modification). 132 * guaranteed not to run in parallel with that modification).
133 */ 133 */
134 134
135 static gfp_t saved_gfp_mask; 135 static gfp_t saved_gfp_mask;
136 136
137 void pm_restore_gfp_mask(void) 137 void pm_restore_gfp_mask(void)
138 { 138 {
139 WARN_ON(!mutex_is_locked(&pm_mutex)); 139 WARN_ON(!mutex_is_locked(&pm_mutex));
140 if (saved_gfp_mask) { 140 if (saved_gfp_mask) {
141 gfp_allowed_mask = saved_gfp_mask; 141 gfp_allowed_mask = saved_gfp_mask;
142 saved_gfp_mask = 0; 142 saved_gfp_mask = 0;
143 } 143 }
144 } 144 }
145 145
146 void pm_restrict_gfp_mask(void) 146 void pm_restrict_gfp_mask(void)
147 { 147 {
148 WARN_ON(!mutex_is_locked(&pm_mutex)); 148 WARN_ON(!mutex_is_locked(&pm_mutex));
149 WARN_ON(saved_gfp_mask); 149 WARN_ON(saved_gfp_mask);
150 saved_gfp_mask = gfp_allowed_mask; 150 saved_gfp_mask = gfp_allowed_mask;
151 gfp_allowed_mask &= ~GFP_IOFS; 151 gfp_allowed_mask &= ~GFP_IOFS;
152 } 152 }
153 153
154 bool pm_suspended_storage(void) 154 bool pm_suspended_storage(void)
155 { 155 {
156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
157 return false; 157 return false;
158 return true; 158 return true;
159 } 159 }
160 #endif /* CONFIG_PM_SLEEP */ 160 #endif /* CONFIG_PM_SLEEP */
161 161
162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
163 int pageblock_order __read_mostly; 163 int pageblock_order __read_mostly;
164 #endif 164 #endif
165 165
166 static void __free_pages_ok(struct page *page, unsigned int order); 166 static void __free_pages_ok(struct page *page, unsigned int order);
167 167
168 /* 168 /*
169 * results with 256, 32 in the lowmem_reserve sysctl: 169 * results with 256, 32 in the lowmem_reserve sysctl:
170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
171 * 1G machine -> (16M dma, 784M normal, 224M high) 171 * 1G machine -> (16M dma, 784M normal, 224M high)
172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
175 * 175 *
176 * TBD: should special case ZONE_DMA32 machines here - in those we normally 176 * TBD: should special case ZONE_DMA32 machines here - in those we normally
177 * don't need any ZONE_NORMAL reservation 177 * don't need any ZONE_NORMAL reservation
178 */ 178 */
179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
180 #ifdef CONFIG_ZONE_DMA 180 #ifdef CONFIG_ZONE_DMA
181 256, 181 256,
182 #endif 182 #endif
183 #ifdef CONFIG_ZONE_DMA32 183 #ifdef CONFIG_ZONE_DMA32
184 256, 184 256,
185 #endif 185 #endif
186 #ifdef CONFIG_HIGHMEM 186 #ifdef CONFIG_HIGHMEM
187 32, 187 32,
188 #endif 188 #endif
189 32, 189 32,
190 }; 190 };
191 191
192 EXPORT_SYMBOL(totalram_pages); 192 EXPORT_SYMBOL(totalram_pages);
193 193
194 static char * const zone_names[MAX_NR_ZONES] = { 194 static char * const zone_names[MAX_NR_ZONES] = {
195 #ifdef CONFIG_ZONE_DMA 195 #ifdef CONFIG_ZONE_DMA
196 "DMA", 196 "DMA",
197 #endif 197 #endif
198 #ifdef CONFIG_ZONE_DMA32 198 #ifdef CONFIG_ZONE_DMA32
199 "DMA32", 199 "DMA32",
200 #endif 200 #endif
201 "Normal", 201 "Normal",
202 #ifdef CONFIG_HIGHMEM 202 #ifdef CONFIG_HIGHMEM
203 "HighMem", 203 "HighMem",
204 #endif 204 #endif
205 "Movable", 205 "Movable",
206 }; 206 };
207 207
208 int min_free_kbytes = 1024; 208 int min_free_kbytes = 1024;
209 int user_min_free_kbytes; 209 int user_min_free_kbytes;
210 210
211 static unsigned long __meminitdata nr_kernel_pages; 211 static unsigned long __meminitdata nr_kernel_pages;
212 static unsigned long __meminitdata nr_all_pages; 212 static unsigned long __meminitdata nr_all_pages;
213 static unsigned long __meminitdata dma_reserve; 213 static unsigned long __meminitdata dma_reserve;
214 214
215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
218 static unsigned long __initdata required_kernelcore; 218 static unsigned long __initdata required_kernelcore;
219 static unsigned long __initdata required_movablecore; 219 static unsigned long __initdata required_movablecore;
220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
221 221
222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
223 int movable_zone; 223 int movable_zone;
224 EXPORT_SYMBOL(movable_zone); 224 EXPORT_SYMBOL(movable_zone);
225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
226 226
227 #if MAX_NUMNODES > 1 227 #if MAX_NUMNODES > 1
228 int nr_node_ids __read_mostly = MAX_NUMNODES; 228 int nr_node_ids __read_mostly = MAX_NUMNODES;
229 int nr_online_nodes __read_mostly = 1; 229 int nr_online_nodes __read_mostly = 1;
230 EXPORT_SYMBOL(nr_node_ids); 230 EXPORT_SYMBOL(nr_node_ids);
231 EXPORT_SYMBOL(nr_online_nodes); 231 EXPORT_SYMBOL(nr_online_nodes);
232 #endif 232 #endif
233 233
234 int page_group_by_mobility_disabled __read_mostly; 234 int page_group_by_mobility_disabled __read_mostly;
235 235
236 void set_pageblock_migratetype(struct page *page, int migratetype) 236 void set_pageblock_migratetype(struct page *page, int migratetype)
237 { 237 {
238 238
239 if (unlikely(page_group_by_mobility_disabled)) 239 if (unlikely(page_group_by_mobility_disabled))
240 migratetype = MIGRATE_UNMOVABLE; 240 migratetype = MIGRATE_UNMOVABLE;
241 241
242 set_pageblock_flags_group(page, (unsigned long)migratetype, 242 set_pageblock_flags_group(page, (unsigned long)migratetype,
243 PB_migrate, PB_migrate_end); 243 PB_migrate, PB_migrate_end);
244 } 244 }
245 245
246 bool oom_killer_disabled __read_mostly; 246 bool oom_killer_disabled __read_mostly;
247 247
248 #ifdef CONFIG_DEBUG_VM 248 #ifdef CONFIG_DEBUG_VM
249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
250 { 250 {
251 int ret = 0; 251 int ret = 0;
252 unsigned seq; 252 unsigned seq;
253 unsigned long pfn = page_to_pfn(page); 253 unsigned long pfn = page_to_pfn(page);
254 unsigned long sp, start_pfn; 254 unsigned long sp, start_pfn;
255 255
256 do { 256 do {
257 seq = zone_span_seqbegin(zone); 257 seq = zone_span_seqbegin(zone);
258 start_pfn = zone->zone_start_pfn; 258 start_pfn = zone->zone_start_pfn;
259 sp = zone->spanned_pages; 259 sp = zone->spanned_pages;
260 if (!zone_spans_pfn(zone, pfn)) 260 if (!zone_spans_pfn(zone, pfn))
261 ret = 1; 261 ret = 1;
262 } while (zone_span_seqretry(zone, seq)); 262 } while (zone_span_seqretry(zone, seq));
263 263
264 if (ret) 264 if (ret)
265 pr_err("page %lu outside zone [ %lu - %lu ]\n", 265 pr_err("page %lu outside zone [ %lu - %lu ]\n",
266 pfn, start_pfn, start_pfn + sp); 266 pfn, start_pfn, start_pfn + sp);
267 267
268 return ret; 268 return ret;
269 } 269 }
270 270
271 static int page_is_consistent(struct zone *zone, struct page *page) 271 static int page_is_consistent(struct zone *zone, struct page *page)
272 { 272 {
273 if (!pfn_valid_within(page_to_pfn(page))) 273 if (!pfn_valid_within(page_to_pfn(page)))
274 return 0; 274 return 0;
275 if (zone != page_zone(page)) 275 if (zone != page_zone(page))
276 return 0; 276 return 0;
277 277
278 return 1; 278 return 1;
279 } 279 }
280 /* 280 /*
281 * Temporary debugging check for pages not lying within a given zone. 281 * Temporary debugging check for pages not lying within a given zone.
282 */ 282 */
283 static int bad_range(struct zone *zone, struct page *page) 283 static int bad_range(struct zone *zone, struct page *page)
284 { 284 {
285 if (page_outside_zone_boundaries(zone, page)) 285 if (page_outside_zone_boundaries(zone, page))
286 return 1; 286 return 1;
287 if (!page_is_consistent(zone, page)) 287 if (!page_is_consistent(zone, page))
288 return 1; 288 return 1;
289 289
290 return 0; 290 return 0;
291 } 291 }
292 #else 292 #else
293 static inline int bad_range(struct zone *zone, struct page *page) 293 static inline int bad_range(struct zone *zone, struct page *page)
294 { 294 {
295 return 0; 295 return 0;
296 } 296 }
297 #endif 297 #endif
298 298
299 static void bad_page(struct page *page) 299 static void bad_page(struct page *page)
300 { 300 {
301 static unsigned long resume; 301 static unsigned long resume;
302 static unsigned long nr_shown; 302 static unsigned long nr_shown;
303 static unsigned long nr_unshown; 303 static unsigned long nr_unshown;
304 304
305 /* Don't complain about poisoned pages */ 305 /* Don't complain about poisoned pages */
306 if (PageHWPoison(page)) { 306 if (PageHWPoison(page)) {
307 page_mapcount_reset(page); /* remove PageBuddy */ 307 page_mapcount_reset(page); /* remove PageBuddy */
308 return; 308 return;
309 } 309 }
310 310
311 /* 311 /*
312 * Allow a burst of 60 reports, then keep quiet for that minute; 312 * Allow a burst of 60 reports, then keep quiet for that minute;
313 * or allow a steady drip of one report per second. 313 * or allow a steady drip of one report per second.
314 */ 314 */
315 if (nr_shown == 60) { 315 if (nr_shown == 60) {
316 if (time_before(jiffies, resume)) { 316 if (time_before(jiffies, resume)) {
317 nr_unshown++; 317 nr_unshown++;
318 goto out; 318 goto out;
319 } 319 }
320 if (nr_unshown) { 320 if (nr_unshown) {
321 printk(KERN_ALERT 321 printk(KERN_ALERT
322 "BUG: Bad page state: %lu messages suppressed\n", 322 "BUG: Bad page state: %lu messages suppressed\n",
323 nr_unshown); 323 nr_unshown);
324 nr_unshown = 0; 324 nr_unshown = 0;
325 } 325 }
326 nr_shown = 0; 326 nr_shown = 0;
327 } 327 }
328 if (nr_shown++ == 0) 328 if (nr_shown++ == 0)
329 resume = jiffies + 60 * HZ; 329 resume = jiffies + 60 * HZ;
330 330
331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
332 current->comm, page_to_pfn(page)); 332 current->comm, page_to_pfn(page));
333 dump_page(page); 333 dump_page(page);
334 334
335 print_modules(); 335 print_modules();
336 dump_stack(); 336 dump_stack();
337 out: 337 out:
338 /* Leave bad fields for debug, except PageBuddy could make trouble */ 338 /* Leave bad fields for debug, except PageBuddy could make trouble */
339 page_mapcount_reset(page); /* remove PageBuddy */ 339 page_mapcount_reset(page); /* remove PageBuddy */
340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
341 } 341 }
342 342
343 /* 343 /*
344 * Higher-order pages are called "compound pages". They are structured thusly: 344 * Higher-order pages are called "compound pages". They are structured thusly:
345 * 345 *
346 * The first PAGE_SIZE page is called the "head page". 346 * The first PAGE_SIZE page is called the "head page".
347 * 347 *
348 * The remaining PAGE_SIZE pages are called "tail pages". 348 * The remaining PAGE_SIZE pages are called "tail pages".
349 * 349 *
350 * All pages have PG_compound set. All tail pages have their ->first_page 350 * All pages have PG_compound set. All tail pages have their ->first_page
351 * pointing at the head page. 351 * pointing at the head page.
352 * 352 *
353 * The first tail page's ->lru.next holds the address of the compound page's 353 * The first tail page's ->lru.next holds the address of the compound page's
354 * put_page() function. Its ->lru.prev holds the order of allocation. 354 * put_page() function. Its ->lru.prev holds the order of allocation.
355 * This usage means that zero-order pages may not be compound. 355 * This usage means that zero-order pages may not be compound.
356 */ 356 */
357 357
358 static void free_compound_page(struct page *page) 358 static void free_compound_page(struct page *page)
359 { 359 {
360 __free_pages_ok(page, compound_order(page)); 360 __free_pages_ok(page, compound_order(page));
361 } 361 }
362 362
363 void prep_compound_page(struct page *page, unsigned long order) 363 void prep_compound_page(struct page *page, unsigned long order)
364 { 364 {
365 int i; 365 int i;
366 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
367 367
368 set_compound_page_dtor(page, free_compound_page); 368 set_compound_page_dtor(page, free_compound_page);
369 set_compound_order(page, order); 369 set_compound_order(page, order);
370 __SetPageHead(page); 370 __SetPageHead(page);
371 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
372 struct page *p = page + i; 372 struct page *p = page + i;
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 /* Make sure p->first_page is always valid for PageTail() */ 375 /* Make sure p->first_page is always valid for PageTail() */
376 smp_wmb(); 376 smp_wmb();
377 __SetPageTail(p); 377 __SetPageTail(p);
378 } 378 }
379 } 379 }
380 380
381 /* update __split_huge_page_refcount if you change this function */ 381 /* update __split_huge_page_refcount if you change this function */
382 static int destroy_compound_page(struct page *page, unsigned long order) 382 static int destroy_compound_page(struct page *page, unsigned long order)
383 { 383 {
384 int i; 384 int i;
385 int nr_pages = 1 << order; 385 int nr_pages = 1 << order;
386 int bad = 0; 386 int bad = 0;
387 387
388 if (unlikely(compound_order(page) != order)) { 388 if (unlikely(compound_order(page) != order)) {
389 bad_page(page); 389 bad_page(page);
390 bad++; 390 bad++;
391 } 391 }
392 392
393 __ClearPageHead(page); 393 __ClearPageHead(page);
394 394
395 for (i = 1; i < nr_pages; i++) { 395 for (i = 1; i < nr_pages; i++) {
396 struct page *p = page + i; 396 struct page *p = page + i;
397 397
398 if (unlikely(!PageTail(p) || (p->first_page != page))) { 398 if (unlikely(!PageTail(p) || (p->first_page != page))) {
399 bad_page(page); 399 bad_page(page);
400 bad++; 400 bad++;
401 } 401 }
402 __ClearPageTail(p); 402 __ClearPageTail(p);
403 } 403 }
404 404
405 return bad; 405 return bad;
406 } 406 }
407 407
408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
409 { 409 {
410 int i; 410 int i;
411 411
412 /* 412 /*
413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
414 * and __GFP_HIGHMEM from hard or soft interrupt context. 414 * and __GFP_HIGHMEM from hard or soft interrupt context.
415 */ 415 */
416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
417 for (i = 0; i < (1 << order); i++) 417 for (i = 0; i < (1 << order); i++)
418 clear_highpage(page + i); 418 clear_highpage(page + i);
419 } 419 }
420 420
421 #ifdef CONFIG_DEBUG_PAGEALLOC 421 #ifdef CONFIG_DEBUG_PAGEALLOC
422 unsigned int _debug_guardpage_minorder; 422 unsigned int _debug_guardpage_minorder;
423 423
424 static int __init debug_guardpage_minorder_setup(char *buf) 424 static int __init debug_guardpage_minorder_setup(char *buf)
425 { 425 {
426 unsigned long res; 426 unsigned long res;
427 427
428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
430 return 0; 430 return 0;
431 } 431 }
432 _debug_guardpage_minorder = res; 432 _debug_guardpage_minorder = res;
433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
434 return 0; 434 return 0;
435 } 435 }
436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
437 437
438 static inline void set_page_guard_flag(struct page *page) 438 static inline void set_page_guard_flag(struct page *page)
439 { 439 {
440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
441 } 441 }
442 442
443 static inline void clear_page_guard_flag(struct page *page) 443 static inline void clear_page_guard_flag(struct page *page)
444 { 444 {
445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
446 } 446 }
447 #else 447 #else
448 static inline void set_page_guard_flag(struct page *page) { } 448 static inline void set_page_guard_flag(struct page *page) { }
449 static inline void clear_page_guard_flag(struct page *page) { } 449 static inline void clear_page_guard_flag(struct page *page) { }
450 #endif 450 #endif
451 451
452 static inline void set_page_order(struct page *page, int order) 452 static inline void set_page_order(struct page *page, int order)
453 { 453 {
454 set_page_private(page, order); 454 set_page_private(page, order);
455 __SetPageBuddy(page); 455 __SetPageBuddy(page);
456 } 456 }
457 457
458 static inline void rmv_page_order(struct page *page) 458 static inline void rmv_page_order(struct page *page)
459 { 459 {
460 __ClearPageBuddy(page); 460 __ClearPageBuddy(page);
461 set_page_private(page, 0); 461 set_page_private(page, 0);
462 } 462 }
463 463
464 /* 464 /*
465 * Locate the struct page for both the matching buddy in our 465 * Locate the struct page for both the matching buddy in our
466 * pair (buddy1) and the combined O(n+1) page they form (page). 466 * pair (buddy1) and the combined O(n+1) page they form (page).
467 * 467 *
468 * 1) Any buddy B1 will have an order O twin B2 which satisfies 468 * 1) Any buddy B1 will have an order O twin B2 which satisfies
469 * the following equation: 469 * the following equation:
470 * B2 = B1 ^ (1 << O) 470 * B2 = B1 ^ (1 << O)
471 * For example, if the starting buddy (buddy2) is #8 its order 471 * For example, if the starting buddy (buddy2) is #8 its order
472 * 1 buddy is #10: 472 * 1 buddy is #10:
473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
474 * 474 *
475 * 2) Any buddy B will have an order O+1 parent P which 475 * 2) Any buddy B will have an order O+1 parent P which
476 * satisfies the following equation: 476 * satisfies the following equation:
477 * P = B & ~(1 << O) 477 * P = B & ~(1 << O)
478 * 478 *
479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
480 */ 480 */
481 static inline unsigned long 481 static inline unsigned long
482 __find_buddy_index(unsigned long page_idx, unsigned int order) 482 __find_buddy_index(unsigned long page_idx, unsigned int order)
483 { 483 {
484 return page_idx ^ (1 << order); 484 return page_idx ^ (1 << order);
485 } 485 }
486 486
487 /* 487 /*
488 * This function checks whether a page is free && is the buddy 488 * This function checks whether a page is free && is the buddy
489 * we can do coalesce a page and its buddy if 489 * we can do coalesce a page and its buddy if
490 * (a) the buddy is not in a hole && 490 * (a) the buddy is not in a hole &&
491 * (b) the buddy is in the buddy system && 491 * (b) the buddy is in the buddy system &&
492 * (c) a page and its buddy have the same order && 492 * (c) a page and its buddy have the same order &&
493 * (d) a page and its buddy are in the same zone. 493 * (d) a page and its buddy are in the same zone.
494 * 494 *
495 * For recording whether a page is in the buddy system, we set ->_mapcount 495 * For recording whether a page is in the buddy system, we set ->_mapcount
496 * PAGE_BUDDY_MAPCOUNT_VALUE. 496 * PAGE_BUDDY_MAPCOUNT_VALUE.
497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
498 * serialized by zone->lock. 498 * serialized by zone->lock.
499 * 499 *
500 * For recording page's order, we use page_private(page). 500 * For recording page's order, we use page_private(page).
501 */ 501 */
502 static inline int page_is_buddy(struct page *page, struct page *buddy, 502 static inline int page_is_buddy(struct page *page, struct page *buddy,
503 int order) 503 int order)
504 { 504 {
505 if (!pfn_valid_within(page_to_pfn(buddy))) 505 if (!pfn_valid_within(page_to_pfn(buddy)))
506 return 0; 506 return 0;
507 507
508 if (page_is_guard(buddy) && page_order(buddy) == order) { 508 if (page_is_guard(buddy) && page_order(buddy) == order) {
509 VM_BUG_ON(page_count(buddy) != 0); 509 VM_BUG_ON(page_count(buddy) != 0);
510 510
511 if (page_zone_id(page) != page_zone_id(buddy)) 511 if (page_zone_id(page) != page_zone_id(buddy))
512 return 0; 512 return 0;
513 513
514 return 1; 514 return 1;
515 } 515 }
516 516
517 if (PageBuddy(buddy) && page_order(buddy) == order) { 517 if (PageBuddy(buddy) && page_order(buddy) == order) {
518 VM_BUG_ON(page_count(buddy) != 0); 518 VM_BUG_ON(page_count(buddy) != 0);
519 519
520 /* 520 /*
521 * zone check is done late to avoid uselessly 521 * zone check is done late to avoid uselessly
522 * calculating zone/node ids for pages that could 522 * calculating zone/node ids for pages that could
523 * never merge. 523 * never merge.
524 */ 524 */
525 if (page_zone_id(page) != page_zone_id(buddy)) 525 if (page_zone_id(page) != page_zone_id(buddy))
526 return 0; 526 return 0;
527 527
528 return 1; 528 return 1;
529 } 529 }
530 return 0; 530 return 0;
531 } 531 }
532 532
533 /* 533 /*
534 * Freeing function for a buddy system allocator. 534 * Freeing function for a buddy system allocator.
535 * 535 *
536 * The concept of a buddy system is to maintain direct-mapped table 536 * The concept of a buddy system is to maintain direct-mapped table
537 * (containing bit values) for memory blocks of various "orders". 537 * (containing bit values) for memory blocks of various "orders".
538 * The bottom level table contains the map for the smallest allocatable 538 * The bottom level table contains the map for the smallest allocatable
539 * units of memory (here, pages), and each level above it describes 539 * units of memory (here, pages), and each level above it describes
540 * pairs of units from the levels below, hence, "buddies". 540 * pairs of units from the levels below, hence, "buddies".
541 * At a high level, all that happens here is marking the table entry 541 * At a high level, all that happens here is marking the table entry
542 * at the bottom level available, and propagating the changes upward 542 * at the bottom level available, and propagating the changes upward
543 * as necessary, plus some accounting needed to play nicely with other 543 * as necessary, plus some accounting needed to play nicely with other
544 * parts of the VM system. 544 * parts of the VM system.
545 * At each level, we keep a list of pages, which are heads of continuous 545 * At each level, we keep a list of pages, which are heads of continuous
546 * free pages of length of (1 << order) and marked with _mapcount 546 * free pages of length of (1 << order) and marked with _mapcount
547 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 547 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
548 * field. 548 * field.
549 * So when we are allocating or freeing one, we can derive the state of the 549 * So when we are allocating or freeing one, we can derive the state of the
550 * other. That is, if we allocate a small block, and both were 550 * other. That is, if we allocate a small block, and both were
551 * free, the remainder of the region must be split into blocks. 551 * free, the remainder of the region must be split into blocks.
552 * If a block is freed, and its buddy is also free, then this 552 * If a block is freed, and its buddy is also free, then this
553 * triggers coalescing into a block of larger size. 553 * triggers coalescing into a block of larger size.
554 * 554 *
555 * -- nyc 555 * -- nyc
556 */ 556 */
557 557
558 static inline void __free_one_page(struct page *page, 558 static inline void __free_one_page(struct page *page,
559 struct zone *zone, unsigned int order, 559 struct zone *zone, unsigned int order,
560 int migratetype) 560 int migratetype)
561 { 561 {
562 unsigned long page_idx; 562 unsigned long page_idx;
563 unsigned long combined_idx; 563 unsigned long combined_idx;
564 unsigned long uninitialized_var(buddy_idx); 564 unsigned long uninitialized_var(buddy_idx);
565 struct page *buddy; 565 struct page *buddy;
566 566
567 VM_BUG_ON(!zone_is_initialized(zone)); 567 VM_BUG_ON(!zone_is_initialized(zone));
568 568
569 if (unlikely(PageCompound(page))) 569 if (unlikely(PageCompound(page)))
570 if (unlikely(destroy_compound_page(page, order))) 570 if (unlikely(destroy_compound_page(page, order)))
571 return; 571 return;
572 572
573 VM_BUG_ON(migratetype == -1); 573 VM_BUG_ON(migratetype == -1);
574 574
575 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 575 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
576 576
577 VM_BUG_ON(page_idx & ((1 << order) - 1)); 577 VM_BUG_ON(page_idx & ((1 << order) - 1));
578 VM_BUG_ON(bad_range(zone, page)); 578 VM_BUG_ON(bad_range(zone, page));
579 579
580 while (order < MAX_ORDER-1) { 580 while (order < MAX_ORDER-1) {
581 buddy_idx = __find_buddy_index(page_idx, order); 581 buddy_idx = __find_buddy_index(page_idx, order);
582 buddy = page + (buddy_idx - page_idx); 582 buddy = page + (buddy_idx - page_idx);
583 if (!page_is_buddy(page, buddy, order)) 583 if (!page_is_buddy(page, buddy, order))
584 break; 584 break;
585 /* 585 /*
586 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 586 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
587 * merge with it and move up one order. 587 * merge with it and move up one order.
588 */ 588 */
589 if (page_is_guard(buddy)) { 589 if (page_is_guard(buddy)) {
590 clear_page_guard_flag(buddy); 590 clear_page_guard_flag(buddy);
591 set_page_private(page, 0); 591 set_page_private(page, 0);
592 __mod_zone_freepage_state(zone, 1 << order, 592 __mod_zone_freepage_state(zone, 1 << order,
593 migratetype); 593 migratetype);
594 } else { 594 } else {
595 list_del(&buddy->lru); 595 list_del(&buddy->lru);
596 zone->free_area[order].nr_free--; 596 zone->free_area[order].nr_free--;
597 rmv_page_order(buddy); 597 rmv_page_order(buddy);
598 } 598 }
599 combined_idx = buddy_idx & page_idx; 599 combined_idx = buddy_idx & page_idx;
600 page = page + (combined_idx - page_idx); 600 page = page + (combined_idx - page_idx);
601 page_idx = combined_idx; 601 page_idx = combined_idx;
602 order++; 602 order++;
603 } 603 }
604 set_page_order(page, order); 604 set_page_order(page, order);
605 605
606 /* 606 /*
607 * If this is not the largest possible page, check if the buddy 607 * If this is not the largest possible page, check if the buddy
608 * of the next-highest order is free. If it is, it's possible 608 * of the next-highest order is free. If it is, it's possible
609 * that pages are being freed that will coalesce soon. In case, 609 * that pages are being freed that will coalesce soon. In case,
610 * that is happening, add the free page to the tail of the list 610 * that is happening, add the free page to the tail of the list
611 * so it's less likely to be used soon and more likely to be merged 611 * so it's less likely to be used soon and more likely to be merged
612 * as a higher order page 612 * as a higher order page
613 */ 613 */
614 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 614 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
615 struct page *higher_page, *higher_buddy; 615 struct page *higher_page, *higher_buddy;
616 combined_idx = buddy_idx & page_idx; 616 combined_idx = buddy_idx & page_idx;
617 higher_page = page + (combined_idx - page_idx); 617 higher_page = page + (combined_idx - page_idx);
618 buddy_idx = __find_buddy_index(combined_idx, order + 1); 618 buddy_idx = __find_buddy_index(combined_idx, order + 1);
619 higher_buddy = higher_page + (buddy_idx - combined_idx); 619 higher_buddy = higher_page + (buddy_idx - combined_idx);
620 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 620 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
621 list_add_tail(&page->lru, 621 list_add_tail(&page->lru,
622 &zone->free_area[order].free_list[migratetype]); 622 &zone->free_area[order].free_list[migratetype]);
623 goto out; 623 goto out;
624 } 624 }
625 } 625 }
626 626
627 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 627 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
628 out: 628 out:
629 zone->free_area[order].nr_free++; 629 zone->free_area[order].nr_free++;
630 } 630 }
631 631
632 static inline int free_pages_check(struct page *page) 632 static inline int free_pages_check(struct page *page)
633 { 633 {
634 if (unlikely(page_mapcount(page) | 634 if (unlikely(page_mapcount(page) |
635 (page->mapping != NULL) | 635 (page->mapping != NULL) |
636 (atomic_read(&page->_count) != 0) | 636 (atomic_read(&page->_count) != 0) |
637 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 637 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
638 (mem_cgroup_bad_page_check(page)))) { 638 (mem_cgroup_bad_page_check(page)))) {
639 bad_page(page); 639 bad_page(page);
640 return 1; 640 return 1;
641 } 641 }
642 page_nid_reset_last(page); 642 page_nid_reset_last(page);
643 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 643 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
644 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 644 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
645 return 0; 645 return 0;
646 } 646 }
647 647
648 /* 648 /*
649 * Frees a number of pages from the PCP lists 649 * Frees a number of pages from the PCP lists
650 * Assumes all pages on list are in same zone, and of same order. 650 * Assumes all pages on list are in same zone, and of same order.
651 * count is the number of pages to free. 651 * count is the number of pages to free.
652 * 652 *
653 * If the zone was previously in an "all pages pinned" state then look to 653 * If the zone was previously in an "all pages pinned" state then look to
654 * see if this freeing clears that state. 654 * see if this freeing clears that state.
655 * 655 *
656 * And clear the zone's pages_scanned counter, to hold off the "all pages are 656 * And clear the zone's pages_scanned counter, to hold off the "all pages are
657 * pinned" detection logic. 657 * pinned" detection logic.
658 */ 658 */
659 static void free_pcppages_bulk(struct zone *zone, int count, 659 static void free_pcppages_bulk(struct zone *zone, int count,
660 struct per_cpu_pages *pcp) 660 struct per_cpu_pages *pcp)
661 { 661 {
662 int migratetype = 0; 662 int migratetype = 0;
663 int batch_free = 0; 663 int batch_free = 0;
664 int to_free = count; 664 int to_free = count;
665 665
666 spin_lock(&zone->lock); 666 spin_lock(&zone->lock);
667 zone->pages_scanned = 0; 667 zone->pages_scanned = 0;
668 668
669 while (to_free) { 669 while (to_free) {
670 struct page *page; 670 struct page *page;
671 struct list_head *list; 671 struct list_head *list;
672 672
673 /* 673 /*
674 * Remove pages from lists in a round-robin fashion. A 674 * Remove pages from lists in a round-robin fashion. A
675 * batch_free count is maintained that is incremented when an 675 * batch_free count is maintained that is incremented when an
676 * empty list is encountered. This is so more pages are freed 676 * empty list is encountered. This is so more pages are freed
677 * off fuller lists instead of spinning excessively around empty 677 * off fuller lists instead of spinning excessively around empty
678 * lists 678 * lists
679 */ 679 */
680 do { 680 do {
681 batch_free++; 681 batch_free++;
682 if (++migratetype == MIGRATE_PCPTYPES) 682 if (++migratetype == MIGRATE_PCPTYPES)
683 migratetype = 0; 683 migratetype = 0;
684 list = &pcp->lists[migratetype]; 684 list = &pcp->lists[migratetype];
685 } while (list_empty(list)); 685 } while (list_empty(list));
686 686
687 /* This is the only non-empty list. Free them all. */ 687 /* This is the only non-empty list. Free them all. */
688 if (batch_free == MIGRATE_PCPTYPES) 688 if (batch_free == MIGRATE_PCPTYPES)
689 batch_free = to_free; 689 batch_free = to_free;
690 690
691 do { 691 do {
692 int mt; /* migratetype of the to-be-freed page */ 692 int mt; /* migratetype of the to-be-freed page */
693 693
694 page = list_entry(list->prev, struct page, lru); 694 page = list_entry(list->prev, struct page, lru);
695 /* must delete as __free_one_page list manipulates */ 695 /* must delete as __free_one_page list manipulates */
696 list_del(&page->lru); 696 list_del(&page->lru);
697 mt = get_freepage_migratetype(page); 697 mt = get_freepage_migratetype(page);
698 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 698 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
699 __free_one_page(page, zone, 0, mt); 699 __free_one_page(page, zone, 0, mt);
700 trace_mm_page_pcpu_drain(page, 0, mt); 700 trace_mm_page_pcpu_drain(page, 0, mt);
701 if (likely(!is_migrate_isolate_page(page))) { 701 if (likely(!is_migrate_isolate_page(page))) {
702 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 702 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
703 if (is_migrate_cma(mt)) 703 if (is_migrate_cma(mt))
704 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 704 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
705 } 705 }
706 } while (--to_free && --batch_free && !list_empty(list)); 706 } while (--to_free && --batch_free && !list_empty(list));
707 } 707 }
708 spin_unlock(&zone->lock); 708 spin_unlock(&zone->lock);
709 } 709 }
710 710
711 static void free_one_page(struct zone *zone, struct page *page, int order, 711 static void free_one_page(struct zone *zone, struct page *page, int order,
712 int migratetype) 712 int migratetype)
713 { 713 {
714 spin_lock(&zone->lock); 714 spin_lock(&zone->lock);
715 zone->pages_scanned = 0; 715 zone->pages_scanned = 0;
716 716
717 __free_one_page(page, zone, order, migratetype); 717 __free_one_page(page, zone, order, migratetype);
718 if (unlikely(!is_migrate_isolate(migratetype))) 718 if (unlikely(!is_migrate_isolate(migratetype)))
719 __mod_zone_freepage_state(zone, 1 << order, migratetype); 719 __mod_zone_freepage_state(zone, 1 << order, migratetype);
720 spin_unlock(&zone->lock); 720 spin_unlock(&zone->lock);
721 } 721 }
722 722
723 static bool free_pages_prepare(struct page *page, unsigned int order) 723 static bool free_pages_prepare(struct page *page, unsigned int order)
724 { 724 {
725 int i; 725 int i;
726 int bad = 0; 726 int bad = 0;
727 727
728 trace_mm_page_free(page, order); 728 trace_mm_page_free(page, order);
729 kmemcheck_free_shadow(page, order); 729 kmemcheck_free_shadow(page, order);
730 730
731 if (PageAnon(page)) 731 if (PageAnon(page))
732 page->mapping = NULL; 732 page->mapping = NULL;
733 for (i = 0; i < (1 << order); i++) 733 for (i = 0; i < (1 << order); i++)
734 bad += free_pages_check(page + i); 734 bad += free_pages_check(page + i);
735 if (bad) 735 if (bad)
736 return false; 736 return false;
737 737
738 if (!PageHighMem(page)) { 738 if (!PageHighMem(page)) {
739 debug_check_no_locks_freed(page_address(page), 739 debug_check_no_locks_freed(page_address(page),
740 PAGE_SIZE << order); 740 PAGE_SIZE << order);
741 debug_check_no_obj_freed(page_address(page), 741 debug_check_no_obj_freed(page_address(page),
742 PAGE_SIZE << order); 742 PAGE_SIZE << order);
743 } 743 }
744 arch_free_page(page, order); 744 arch_free_page(page, order);
745 kernel_map_pages(page, 1 << order, 0); 745 kernel_map_pages(page, 1 << order, 0);
746 746
747 return true; 747 return true;
748 } 748 }
749 749
750 static void __free_pages_ok(struct page *page, unsigned int order) 750 static void __free_pages_ok(struct page *page, unsigned int order)
751 { 751 {
752 unsigned long flags; 752 unsigned long flags;
753 int migratetype; 753 int migratetype;
754 754
755 if (!free_pages_prepare(page, order)) 755 if (!free_pages_prepare(page, order))
756 return; 756 return;
757 757
758 local_irq_save(flags); 758 local_irq_save(flags);
759 __count_vm_events(PGFREE, 1 << order); 759 __count_vm_events(PGFREE, 1 << order);
760 migratetype = get_pageblock_migratetype(page); 760 migratetype = get_pageblock_migratetype(page);
761 set_freepage_migratetype(page, migratetype); 761 set_freepage_migratetype(page, migratetype);
762 free_one_page(page_zone(page), page, order, migratetype); 762 free_one_page(page_zone(page), page, order, migratetype);
763 local_irq_restore(flags); 763 local_irq_restore(flags);
764 } 764 }
765 765
766 void __init __free_pages_bootmem(struct page *page, unsigned int order) 766 void __init __free_pages_bootmem(struct page *page, unsigned int order)
767 { 767 {
768 unsigned int nr_pages = 1 << order; 768 unsigned int nr_pages = 1 << order;
769 struct page *p = page; 769 struct page *p = page;
770 unsigned int loop; 770 unsigned int loop;
771 771
772 prefetchw(p); 772 prefetchw(p);
773 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 773 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
774 prefetchw(p + 1); 774 prefetchw(p + 1);
775 __ClearPageReserved(p); 775 __ClearPageReserved(p);
776 set_page_count(p, 0); 776 set_page_count(p, 0);
777 } 777 }
778 __ClearPageReserved(p); 778 __ClearPageReserved(p);
779 set_page_count(p, 0); 779 set_page_count(p, 0);
780 780
781 page_zone(page)->managed_pages += nr_pages; 781 page_zone(page)->managed_pages += nr_pages;
782 set_page_refcounted(page); 782 set_page_refcounted(page);
783 __free_pages(page, order); 783 __free_pages(page, order);
784 } 784 }
785 785
786 #ifdef CONFIG_CMA 786 #ifdef CONFIG_CMA
787 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 787 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
788 void __init init_cma_reserved_pageblock(struct page *page) 788 void __init init_cma_reserved_pageblock(struct page *page)
789 { 789 {
790 unsigned i = pageblock_nr_pages; 790 unsigned i = pageblock_nr_pages;
791 struct page *p = page; 791 struct page *p = page;
792 792
793 do { 793 do {
794 __ClearPageReserved(p); 794 __ClearPageReserved(p);
795 set_page_count(p, 0); 795 set_page_count(p, 0);
796 } while (++p, --i); 796 } while (++p, --i);
797 797
798 set_pageblock_migratetype(page, MIGRATE_CMA); 798 set_pageblock_migratetype(page, MIGRATE_CMA);
799 799
800 if (pageblock_order >= MAX_ORDER) { 800 if (pageblock_order >= MAX_ORDER) {
801 i = pageblock_nr_pages; 801 i = pageblock_nr_pages;
802 p = page; 802 p = page;
803 do { 803 do {
804 set_page_refcounted(p); 804 set_page_refcounted(p);
805 __free_pages(p, MAX_ORDER - 1); 805 __free_pages(p, MAX_ORDER - 1);
806 p += MAX_ORDER_NR_PAGES; 806 p += MAX_ORDER_NR_PAGES;
807 } while (i -= MAX_ORDER_NR_PAGES); 807 } while (i -= MAX_ORDER_NR_PAGES);
808 } else { 808 } else {
809 set_page_refcounted(page); 809 set_page_refcounted(page);
810 __free_pages(page, pageblock_order); 810 __free_pages(page, pageblock_order);
811 } 811 }
812 812
813 adjust_managed_page_count(page, pageblock_nr_pages); 813 adjust_managed_page_count(page, pageblock_nr_pages);
814 } 814 }
815 #endif 815 #endif
816 816
817 /* 817 /*
818 * The order of subdivision here is critical for the IO subsystem. 818 * The order of subdivision here is critical for the IO subsystem.
819 * Please do not alter this order without good reasons and regression 819 * Please do not alter this order without good reasons and regression
820 * testing. Specifically, as large blocks of memory are subdivided, 820 * testing. Specifically, as large blocks of memory are subdivided,
821 * the order in which smaller blocks are delivered depends on the order 821 * the order in which smaller blocks are delivered depends on the order
822 * they're subdivided in this function. This is the primary factor 822 * they're subdivided in this function. This is the primary factor
823 * influencing the order in which pages are delivered to the IO 823 * influencing the order in which pages are delivered to the IO
824 * subsystem according to empirical testing, and this is also justified 824 * subsystem according to empirical testing, and this is also justified
825 * by considering the behavior of a buddy system containing a single 825 * by considering the behavior of a buddy system containing a single
826 * large block of memory acted on by a series of small allocations. 826 * large block of memory acted on by a series of small allocations.
827 * This behavior is a critical factor in sglist merging's success. 827 * This behavior is a critical factor in sglist merging's success.
828 * 828 *
829 * -- nyc 829 * -- nyc
830 */ 830 */
831 static inline void expand(struct zone *zone, struct page *page, 831 static inline void expand(struct zone *zone, struct page *page,
832 int low, int high, struct free_area *area, 832 int low, int high, struct free_area *area,
833 int migratetype) 833 int migratetype)
834 { 834 {
835 unsigned long size = 1 << high; 835 unsigned long size = 1 << high;
836 836
837 while (high > low) { 837 while (high > low) {
838 area--; 838 area--;
839 high--; 839 high--;
840 size >>= 1; 840 size >>= 1;
841 VM_BUG_ON(bad_range(zone, &page[size])); 841 VM_BUG_ON(bad_range(zone, &page[size]));
842 842
843 #ifdef CONFIG_DEBUG_PAGEALLOC 843 #ifdef CONFIG_DEBUG_PAGEALLOC
844 if (high < debug_guardpage_minorder()) { 844 if (high < debug_guardpage_minorder()) {
845 /* 845 /*
846 * Mark as guard pages (or page), that will allow to 846 * Mark as guard pages (or page), that will allow to
847 * merge back to allocator when buddy will be freed. 847 * merge back to allocator when buddy will be freed.
848 * Corresponding page table entries will not be touched, 848 * Corresponding page table entries will not be touched,
849 * pages will stay not present in virtual address space 849 * pages will stay not present in virtual address space
850 */ 850 */
851 INIT_LIST_HEAD(&page[size].lru); 851 INIT_LIST_HEAD(&page[size].lru);
852 set_page_guard_flag(&page[size]); 852 set_page_guard_flag(&page[size]);
853 set_page_private(&page[size], high); 853 set_page_private(&page[size], high);
854 /* Guard pages are not available for any usage */ 854 /* Guard pages are not available for any usage */
855 __mod_zone_freepage_state(zone, -(1 << high), 855 __mod_zone_freepage_state(zone, -(1 << high),
856 migratetype); 856 migratetype);
857 continue; 857 continue;
858 } 858 }
859 #endif 859 #endif
860 list_add(&page[size].lru, &area->free_list[migratetype]); 860 list_add(&page[size].lru, &area->free_list[migratetype]);
861 area->nr_free++; 861 area->nr_free++;
862 set_page_order(&page[size], high); 862 set_page_order(&page[size], high);
863 } 863 }
864 } 864 }
865 865
866 /* 866 /*
867 * This page is about to be returned from the page allocator 867 * This page is about to be returned from the page allocator
868 */ 868 */
869 static inline int check_new_page(struct page *page) 869 static inline int check_new_page(struct page *page)
870 { 870 {
871 if (unlikely(page_mapcount(page) | 871 if (unlikely(page_mapcount(page) |
872 (page->mapping != NULL) | 872 (page->mapping != NULL) |
873 (atomic_read(&page->_count) != 0) | 873 (atomic_read(&page->_count) != 0) |
874 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 874 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
875 (mem_cgroup_bad_page_check(page)))) { 875 (mem_cgroup_bad_page_check(page)))) {
876 bad_page(page); 876 bad_page(page);
877 return 1; 877 return 1;
878 } 878 }
879 return 0; 879 return 0;
880 } 880 }
881 881
882 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 882 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
883 { 883 {
884 int i; 884 int i;
885 885
886 for (i = 0; i < (1 << order); i++) { 886 for (i = 0; i < (1 << order); i++) {
887 struct page *p = page + i; 887 struct page *p = page + i;
888 if (unlikely(check_new_page(p))) 888 if (unlikely(check_new_page(p)))
889 return 1; 889 return 1;
890 } 890 }
891 891
892 set_page_private(page, 0); 892 set_page_private(page, 0);
893 set_page_refcounted(page); 893 set_page_refcounted(page);
894 894
895 arch_alloc_page(page, order); 895 arch_alloc_page(page, order);
896 kernel_map_pages(page, 1 << order, 1); 896 kernel_map_pages(page, 1 << order, 1);
897 897
898 if (gfp_flags & __GFP_ZERO) 898 if (gfp_flags & __GFP_ZERO)
899 prep_zero_page(page, order, gfp_flags); 899 prep_zero_page(page, order, gfp_flags);
900 900
901 if (order && (gfp_flags & __GFP_COMP)) 901 if (order && (gfp_flags & __GFP_COMP))
902 prep_compound_page(page, order); 902 prep_compound_page(page, order);
903 903
904 return 0; 904 return 0;
905 } 905 }
906 906
907 /* 907 /*
908 * Go through the free lists for the given migratetype and remove 908 * Go through the free lists for the given migratetype and remove
909 * the smallest available page from the freelists 909 * the smallest available page from the freelists
910 */ 910 */
911 static inline 911 static inline
912 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 912 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
913 int migratetype) 913 int migratetype)
914 { 914 {
915 unsigned int current_order; 915 unsigned int current_order;
916 struct free_area *area; 916 struct free_area *area;
917 struct page *page; 917 struct page *page;
918 918
919 /* Find a page of the appropriate size in the preferred list */ 919 /* Find a page of the appropriate size in the preferred list */
920 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 920 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
921 area = &(zone->free_area[current_order]); 921 area = &(zone->free_area[current_order]);
922 if (list_empty(&area->free_list[migratetype])) 922 if (list_empty(&area->free_list[migratetype]))
923 continue; 923 continue;
924 924
925 page = list_entry(area->free_list[migratetype].next, 925 page = list_entry(area->free_list[migratetype].next,
926 struct page, lru); 926 struct page, lru);
927 list_del(&page->lru); 927 list_del(&page->lru);
928 rmv_page_order(page); 928 rmv_page_order(page);
929 area->nr_free--; 929 area->nr_free--;
930 expand(zone, page, order, current_order, area, migratetype); 930 expand(zone, page, order, current_order, area, migratetype);
931 set_freepage_migratetype(page, migratetype); 931 set_freepage_migratetype(page, migratetype);
932 return page; 932 return page;
933 } 933 }
934 934
935 return NULL; 935 return NULL;
936 } 936 }
937 937
938 938
939 /* 939 /*
940 * This array describes which free lists are fallen back to, in order, 940 * This array describes which free lists are fallen back to, in order,
941 * when the free lists for the desired migratetype are depleted 941 * when the free lists for the desired migratetype are depleted
942 */ 942 */
943 static int fallbacks[MIGRATE_TYPES][4] = { 943 static int fallbacks[MIGRATE_TYPES][4] = {
944 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 944 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
945 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 945 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
946 #ifdef CONFIG_CMA 946 #ifdef CONFIG_CMA
947 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 947 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
948 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 948 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
949 #else 949 #else
950 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 950 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
951 #endif 951 #endif
952 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 952 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
953 #ifdef CONFIG_MEMORY_ISOLATION 953 #ifdef CONFIG_MEMORY_ISOLATION
954 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 954 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
955 #endif 955 #endif
956 }; 956 };
957 957
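/*
 * Illustrative user-space sketch (not kernel code): walking a fallback table
 * like fallbacks[] above - try each alternative migratetype in row order
 * until one has free pages. The enum, table and availability counts are made
 * up for the example; MIGRATE_RESERVE is handled separately by the caller.
 */
#include <stdio.h>

enum { UNMOVABLE, RECLAIMABLE, MOVABLE, RESERVE, NR_TYPES };

static const int fallback_order[NR_TYPES][3] = {
        [UNMOVABLE]   = { RECLAIMABLE, MOVABLE, RESERVE },
        [RECLAIMABLE] = { UNMOVABLE, MOVABLE, RESERVE },
        [MOVABLE]     = { RECLAIMABLE, UNMOVABLE, RESERVE },
        [RESERVE]     = { RESERVE, RESERVE, RESERVE },
};

int main(void)
{
        int free_count[NR_TYPES] = { 0, 0, 7, 1 }; /* only MOVABLE and RESERVE have pages */
        int want = UNMOVABLE;

        for (int i = 0; i < 3; i++) {
                int alt = fallback_order[want][i];
                if (alt == RESERVE)
                        break;  /* reserve pages are handled later if necessary */
                if (free_count[alt] > 0) {
                        printf("fall back from type %d to type %d\n", want, alt);
                        return 0;
                }
        }
        printf("no fallback found, caller retries with the reserve type\n");
        return 0;
}
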
958 /* 958 /*
959 * Move the free pages in a range to the free lists of the requested type. 959 * Move the free pages in a range to the free lists of the requested type.
960 * Note that start_page and end_page are not aligned on a pageblock 960 * Note that start_page and end_page are not aligned on a pageblock
961 * boundary. If alignment is required, use move_freepages_block() 961 * boundary. If alignment is required, use move_freepages_block()
962 */ 962 */
963 int move_freepages(struct zone *zone, 963 int move_freepages(struct zone *zone,
964 struct page *start_page, struct page *end_page, 964 struct page *start_page, struct page *end_page,
965 int migratetype) 965 int migratetype)
966 { 966 {
967 struct page *page; 967 struct page *page;
968 unsigned long order; 968 unsigned long order;
969 int pages_moved = 0; 969 int pages_moved = 0;
970 970
971 #ifndef CONFIG_HOLES_IN_ZONE 971 #ifndef CONFIG_HOLES_IN_ZONE
972 /* 972 /*
973 * page_zone is not safe to call in this context when 973 * page_zone is not safe to call in this context when
974 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 974 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
975 * anyway as we check zone boundaries in move_freepages_block(). 975 * anyway as we check zone boundaries in move_freepages_block().
976 * Remove at a later date when no bug reports exist related to 976 * Remove at a later date when no bug reports exist related to
977 * grouping pages by mobility 977 * grouping pages by mobility
978 */ 978 */
979 BUG_ON(page_zone(start_page) != page_zone(end_page)); 979 BUG_ON(page_zone(start_page) != page_zone(end_page));
980 #endif 980 #endif
981 981
982 for (page = start_page; page <= end_page;) { 982 for (page = start_page; page <= end_page;) {
983 /* Make sure we are not inadvertently changing nodes */ 983 /* Make sure we are not inadvertently changing nodes */
984 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 984 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
985 985
986 if (!pfn_valid_within(page_to_pfn(page))) { 986 if (!pfn_valid_within(page_to_pfn(page))) {
987 page++; 987 page++;
988 continue; 988 continue;
989 } 989 }
990 990
991 if (!PageBuddy(page)) { 991 if (!PageBuddy(page)) {
992 page++; 992 page++;
993 continue; 993 continue;
994 } 994 }
995 995
996 order = page_order(page); 996 order = page_order(page);
997 list_move(&page->lru, 997 list_move(&page->lru,
998 &zone->free_area[order].free_list[migratetype]); 998 &zone->free_area[order].free_list[migratetype]);
999 set_freepage_migratetype(page, migratetype); 999 set_freepage_migratetype(page, migratetype);
1000 page += 1 << order; 1000 page += 1 << order;
1001 pages_moved += 1 << order; 1001 pages_moved += 1 << order;
1002 } 1002 }
1003 1003
1004 return pages_moved; 1004 return pages_moved;
1005 } 1005 }
1006 1006
1007 int move_freepages_block(struct zone *zone, struct page *page, 1007 int move_freepages_block(struct zone *zone, struct page *page,
1008 int migratetype) 1008 int migratetype)
1009 { 1009 {
1010 unsigned long start_pfn, end_pfn; 1010 unsigned long start_pfn, end_pfn;
1011 struct page *start_page, *end_page; 1011 struct page *start_page, *end_page;
1012 1012
1013 start_pfn = page_to_pfn(page); 1013 start_pfn = page_to_pfn(page);
1014 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1014 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1015 start_page = pfn_to_page(start_pfn); 1015 start_page = pfn_to_page(start_pfn);
1016 end_page = start_page + pageblock_nr_pages - 1; 1016 end_page = start_page + pageblock_nr_pages - 1;
1017 end_pfn = start_pfn + pageblock_nr_pages - 1; 1017 end_pfn = start_pfn + pageblock_nr_pages - 1;
1018 1018
1019 /* Do not cross zone boundaries */ 1019 /* Do not cross zone boundaries */
1020 if (!zone_spans_pfn(zone, start_pfn)) 1020 if (!zone_spans_pfn(zone, start_pfn))
1021 start_page = page; 1021 start_page = page;
1022 if (!zone_spans_pfn(zone, end_pfn)) 1022 if (!zone_spans_pfn(zone, end_pfn))
1023 return 0; 1023 return 0;
1024 1024
1025 return move_freepages(zone, start_page, end_page, migratetype); 1025 return move_freepages(zone, start_page, end_page, migratetype);
1026 } 1026 }
1027 1027
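/*
 * Illustrative sketch (not kernel code): the align-down mask used in
 * move_freepages_block() above. With a power-of-two pageblock size,
 * pfn & ~(size - 1) rounds down to the first pfn of the containing block.
 * The pageblock size and pfn below are arbitrary example values.
 */
#include <stdio.h>

int main(void)
{
        unsigned long pageblock_pages = 512;    /* e.g. 2MB blocks of 4KB pages */
        unsigned long pfn = 1234567;

        unsigned long start = pfn & ~(pageblock_pages - 1);
        unsigned long end   = start + pageblock_pages - 1;

        printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start, end);
        return 0;
}
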
1028 static void change_pageblock_range(struct page *pageblock_page, 1028 static void change_pageblock_range(struct page *pageblock_page,
1029 int start_order, int migratetype) 1029 int start_order, int migratetype)
1030 { 1030 {
1031 int nr_pageblocks = 1 << (start_order - pageblock_order); 1031 int nr_pageblocks = 1 << (start_order - pageblock_order);
1032 1032
1033 while (nr_pageblocks--) { 1033 while (nr_pageblocks--) {
1034 set_pageblock_migratetype(pageblock_page, migratetype); 1034 set_pageblock_migratetype(pageblock_page, migratetype);
1035 pageblock_page += pageblock_nr_pages; 1035 pageblock_page += pageblock_nr_pages;
1036 } 1036 }
1037 } 1037 }
1038 1038
1039 /* 1039 /*
1040 * If breaking a large block of pages, move all free pages to the preferred 1040 * If breaking a large block of pages, move all free pages to the preferred
1041 * allocation list. If falling back for a reclaimable kernel allocation, be 1041 * allocation list. If falling back for a reclaimable kernel allocation, be
1042 * more aggressive about taking ownership of free pages. 1042 * more aggressive about taking ownership of free pages.
1043 * 1043 *
1044 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1044 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1045 * nor move CMA pages to different free lists. We don't want unmovable pages 1045 * nor move CMA pages to different free lists. We don't want unmovable pages
1046 * to be allocated from MIGRATE_CMA areas. 1046 * to be allocated from MIGRATE_CMA areas.
1047 * 1047 *
1048 * Returns the new migratetype of the pageblock (or the same old migratetype 1048 * Returns the new migratetype of the pageblock (or the same old migratetype
1049 * if it was unchanged). 1049 * if it was unchanged).
1050 */ 1050 */
1051 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1051 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1052 int start_type, int fallback_type) 1052 int start_type, int fallback_type)
1053 { 1053 {
1054 int current_order = page_order(page); 1054 int current_order = page_order(page);
1055 1055
1056 /* 1056 /*
1057 * When borrowing from MIGRATE_CMA, we need to release the excess 1057 * When borrowing from MIGRATE_CMA, we need to release the excess
1058 * buddy pages to CMA itself. We also ensure the freepage_migratetype 1058 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1059 * is set to CMA so it is returned to the correct freelist in case 1059 * is set to CMA so it is returned to the correct freelist in case
1060 * the page ends up being not actually allocated from the pcp lists. 1060 * the page ends up being not actually allocated from the pcp lists.
1061 */ 1061 */
1062 if (is_migrate_cma(fallback_type)) 1062 if (is_migrate_cma(fallback_type))
1063 return fallback_type; 1063 return fallback_type;
1064 1064
1065 /* Take ownership for orders >= pageblock_order */ 1065 /* Take ownership for orders >= pageblock_order */
1066 if (current_order >= pageblock_order) { 1066 if (current_order >= pageblock_order) {
1067 change_pageblock_range(page, current_order, start_type); 1067 change_pageblock_range(page, current_order, start_type);
1068 return start_type; 1068 return start_type;
1069 } 1069 }
1070 1070
1071 if (current_order >= pageblock_order / 2 || 1071 if (current_order >= pageblock_order / 2 ||
1072 start_type == MIGRATE_RECLAIMABLE || 1072 start_type == MIGRATE_RECLAIMABLE ||
1073 page_group_by_mobility_disabled) { 1073 page_group_by_mobility_disabled) {
1074 int pages; 1074 int pages;
1075 1075
1076 pages = move_freepages_block(zone, page, start_type); 1076 pages = move_freepages_block(zone, page, start_type);
1077 1077
1078 /* Claim the whole block if over half of it is free */ 1078 /* Claim the whole block if over half of it is free */
1079 if (pages >= (1 << (pageblock_order-1)) || 1079 if (pages >= (1 << (pageblock_order-1)) ||
1080 page_group_by_mobility_disabled) { 1080 page_group_by_mobility_disabled) {
1081 1081
1082 set_pageblock_migratetype(page, start_type); 1082 set_pageblock_migratetype(page, start_type);
1083 return start_type; 1083 return start_type;
1084 } 1084 }
1085 1085
1086 } 1086 }
1087 1087
1088 return fallback_type; 1088 return fallback_type;
1089 } 1089 }
1090 1090
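/*
 * Illustrative sketch (not kernel code): the "claim the whole block if over
 * half of it is free" test in try_to_steal_freepages(). With an example
 * pageblock_order of 9 (512 pages per block), the threshold is 1 << 8 = 256
 * moved pages; the numbers below are invented.
 */
#include <stdio.h>

int main(void)
{
        int pageblock_order = 9;        /* example value */
        int pages_moved = 300;          /* pages move_freepages_block() moved */
        int threshold = 1 << (pageblock_order - 1);

        if (pages_moved >= threshold)
                printf("%d >= %d: take ownership of the whole pageblock\n",
                       pages_moved, threshold);
        else
                printf("%d < %d: leave the pageblock's migratetype alone\n",
                       pages_moved, threshold);
        return 0;
}
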
1091 /* Remove an element from the buddy allocator from the fallback list */ 1091 /* Remove an element from the buddy allocator from the fallback list */
1092 static inline struct page * 1092 static inline struct page *
1093 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1093 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1094 { 1094 {
1095 struct free_area *area; 1095 struct free_area *area;
1096 int current_order; 1096 int current_order;
1097 struct page *page; 1097 struct page *page;
1098 int migratetype, new_type, i; 1098 int migratetype, new_type, i;
1099 1099
1100 /* Find the largest possible block of pages in the other list */ 1100 /* Find the largest possible block of pages in the other list */
1101 for (current_order = MAX_ORDER-1; current_order >= order; 1101 for (current_order = MAX_ORDER-1; current_order >= order;
1102 --current_order) { 1102 --current_order) {
1103 for (i = 0;; i++) { 1103 for (i = 0;; i++) {
1104 migratetype = fallbacks[start_migratetype][i]; 1104 migratetype = fallbacks[start_migratetype][i];
1105 1105
1106 /* MIGRATE_RESERVE handled later if necessary */ 1106 /* MIGRATE_RESERVE handled later if necessary */
1107 if (migratetype == MIGRATE_RESERVE) 1107 if (migratetype == MIGRATE_RESERVE)
1108 break; 1108 break;
1109 1109
1110 area = &(zone->free_area[current_order]); 1110 area = &(zone->free_area[current_order]);
1111 if (list_empty(&area->free_list[migratetype])) 1111 if (list_empty(&area->free_list[migratetype]))
1112 continue; 1112 continue;
1113 1113
1114 page = list_entry(area->free_list[migratetype].next, 1114 page = list_entry(area->free_list[migratetype].next,
1115 struct page, lru); 1115 struct page, lru);
1116 area->nr_free--; 1116 area->nr_free--;
1117 1117
1118 new_type = try_to_steal_freepages(zone, page, 1118 new_type = try_to_steal_freepages(zone, page,
1119 start_migratetype, 1119 start_migratetype,
1120 migratetype); 1120 migratetype);
1121 1121
1122 /* Remove the page from the freelists */ 1122 /* Remove the page from the freelists */
1123 list_del(&page->lru); 1123 list_del(&page->lru);
1124 rmv_page_order(page); 1124 rmv_page_order(page);
1125 1125
1126 expand(zone, page, order, current_order, area, 1126 expand(zone, page, order, current_order, area,
1127 new_type); 1127 new_type);
1128 /* The freepage_migratetype may differ from pageblock's 1128 /* The freepage_migratetype may differ from pageblock's
1129 * migratetype depending on the decisions in 1129 * migratetype depending on the decisions in
1130 * try_to_steal_freepages. This is OK as long as it does 1130 * try_to_steal_freepages. This is OK as long as it does
1131 * not differ for MIGRATE_CMA type. 1131 * not differ for MIGRATE_CMA type.
1132 */ 1132 */
1133 set_freepage_migratetype(page, new_type); 1133 set_freepage_migratetype(page, new_type);
1134 1134
1135 trace_mm_page_alloc_extfrag(page, order, current_order, 1135 trace_mm_page_alloc_extfrag(page, order, current_order,
1136 start_migratetype, migratetype, new_type); 1136 start_migratetype, migratetype, new_type);
1137 1137
1138 return page; 1138 return page;
1139 } 1139 }
1140 } 1140 }
1141 1141
1142 return NULL; 1142 return NULL;
1143 } 1143 }
1144 1144
1145 /* 1145 /*
1146 * Do the hard work of removing an element from the buddy allocator. 1146 * Do the hard work of removing an element from the buddy allocator.
1147 * Call me with the zone->lock already held. 1147 * Call me with the zone->lock already held.
1148 */ 1148 */
1149 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1149 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1150 int migratetype) 1150 int migratetype)
1151 { 1151 {
1152 struct page *page; 1152 struct page *page;
1153 1153
1154 retry_reserve: 1154 retry_reserve:
1155 page = __rmqueue_smallest(zone, order, migratetype); 1155 page = __rmqueue_smallest(zone, order, migratetype);
1156 1156
1157 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1157 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1158 page = __rmqueue_fallback(zone, order, migratetype); 1158 page = __rmqueue_fallback(zone, order, migratetype);
1159 1159
1160 /* 1160 /*
1161 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1161 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1162 * is used because __rmqueue_smallest is an inline function 1162 * is used because __rmqueue_smallest is an inline function
1163 * and we want just one call site 1163 * and we want just one call site
1164 */ 1164 */
1165 if (!page) { 1165 if (!page) {
1166 migratetype = MIGRATE_RESERVE; 1166 migratetype = MIGRATE_RESERVE;
1167 goto retry_reserve; 1167 goto retry_reserve;
1168 } 1168 }
1169 } 1169 }
1170 1170
1171 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1171 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1172 return page; 1172 return page;
1173 } 1173 }
1174 1174
1175 /* 1175 /*
1176 * Obtain a specified number of elements from the buddy allocator, all under 1176 * Obtain a specified number of elements from the buddy allocator, all under
1177 * a single hold of the lock, for efficiency. Add them to the supplied list. 1177 * a single hold of the lock, for efficiency. Add them to the supplied list.
1178 * Returns the number of new pages which were placed at *list. 1178 * Returns the number of new pages which were placed at *list.
1179 */ 1179 */
1180 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1180 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1181 unsigned long count, struct list_head *list, 1181 unsigned long count, struct list_head *list,
1182 int migratetype, int cold) 1182 int migratetype, int cold)
1183 { 1183 {
1184 int i; 1184 int i;
1185 1185
1186 spin_lock(&zone->lock); 1186 spin_lock(&zone->lock);
1187 for (i = 0; i < count; ++i) { 1187 for (i = 0; i < count; ++i) {
1188 struct page *page = __rmqueue(zone, order, migratetype); 1188 struct page *page = __rmqueue(zone, order, migratetype);
1189 if (unlikely(page == NULL)) 1189 if (unlikely(page == NULL))
1190 break; 1190 break;
1191 1191
1192 /* 1192 /*
1193 * Split buddy pages returned by expand() are received here 1193 * Split buddy pages returned by expand() are received here
1194 * in physical page order. The page is added to the callers and 1194 * in physical page order. The page is added to the callers and
1195 * list and the list head then moves forward. From the callers 1195 * list and the list head then moves forward. From the callers
1196 * perspective, the linked list is ordered by page number in 1196 * perspective, the linked list is ordered by page number in
1197 * some conditions. This is useful for IO devices that can 1197 * some conditions. This is useful for IO devices that can
1198 * merge IO requests if the physical pages are ordered 1198 * merge IO requests if the physical pages are ordered
1199 * properly. 1199 * properly.
1200 */ 1200 */
1201 if (likely(cold == 0)) 1201 if (likely(cold == 0))
1202 list_add(&page->lru, list); 1202 list_add(&page->lru, list);
1203 else 1203 else
1204 list_add_tail(&page->lru, list); 1204 list_add_tail(&page->lru, list);
1205 list = &page->lru; 1205 list = &page->lru;
1206 if (is_migrate_cma(get_freepage_migratetype(page))) 1206 if (is_migrate_cma(get_freepage_migratetype(page)))
1207 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1207 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1208 -(1 << order)); 1208 -(1 << order));
1209 } 1209 }
1210 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1210 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1211 spin_unlock(&zone->lock); 1211 spin_unlock(&zone->lock);
1212 return i; 1212 return i;
1213 } 1213 }
1214 1214
1215 #ifdef CONFIG_NUMA 1215 #ifdef CONFIG_NUMA
1216 /* 1216 /*
1217 * Called from the vmstat counter updater to drain pagesets of this 1217 * Called from the vmstat counter updater to drain pagesets of this
1218 * currently executing processor on remote nodes after they have 1218 * currently executing processor on remote nodes after they have
1219 * expired. 1219 * expired.
1220 * 1220 *
1221 * Note that this function must be called with the thread pinned to 1221 * Note that this function must be called with the thread pinned to
1222 * a single processor. 1222 * a single processor.
1223 */ 1223 */
1224 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1224 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1225 { 1225 {
1226 unsigned long flags; 1226 unsigned long flags;
1227 int to_drain; 1227 int to_drain;
1228 unsigned long batch; 1228 unsigned long batch;
1229 1229
1230 local_irq_save(flags); 1230 local_irq_save(flags);
1231 batch = ACCESS_ONCE(pcp->batch); 1231 batch = ACCESS_ONCE(pcp->batch);
1232 if (pcp->count >= batch) 1232 if (pcp->count >= batch)
1233 to_drain = batch; 1233 to_drain = batch;
1234 else 1234 else
1235 to_drain = pcp->count; 1235 to_drain = pcp->count;
1236 if (to_drain > 0) { 1236 if (to_drain > 0) {
1237 free_pcppages_bulk(zone, to_drain, pcp); 1237 free_pcppages_bulk(zone, to_drain, pcp);
1238 pcp->count -= to_drain; 1238 pcp->count -= to_drain;
1239 } 1239 }
1240 local_irq_restore(flags); 1240 local_irq_restore(flags);
1241 } 1241 }
1242 #endif 1242 #endif
1243 1243
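/*
 * Illustrative sketch (not kernel code): the drain decision made by
 * drain_zone_pages() above - drain at most one batch per call, or whatever
 * is left when the per-cpu list holds fewer pages than a batch. The batch
 * and count values are invented.
 */
#include <stdio.h>

int main(void)
{
        unsigned long batch = 31;       /* example pcp->batch */
        int count = 12;                 /* example pcp->count */

        int to_drain = (count >= (int)batch) ? (int)batch : count;
        printf("draining %d of %d cached pages\n", to_drain, count);
        return 0;
}
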
1244 /* 1244 /*
1245 * Drain pages of the indicated processor. 1245 * Drain pages of the indicated processor.
1246 * 1246 *
1247 * The processor must either be the current processor and the 1247 * The processor must either be the current processor and the
1248 * thread pinned to the current processor or a processor that 1248 * thread pinned to the current processor or a processor that
1249 * is not online. 1249 * is not online.
1250 */ 1250 */
1251 static void drain_pages(unsigned int cpu) 1251 static void drain_pages(unsigned int cpu)
1252 { 1252 {
1253 unsigned long flags; 1253 unsigned long flags;
1254 struct zone *zone; 1254 struct zone *zone;
1255 1255
1256 for_each_populated_zone(zone) { 1256 for_each_populated_zone(zone) {
1257 struct per_cpu_pageset *pset; 1257 struct per_cpu_pageset *pset;
1258 struct per_cpu_pages *pcp; 1258 struct per_cpu_pages *pcp;
1259 1259
1260 local_irq_save(flags); 1260 local_irq_save(flags);
1261 pset = per_cpu_ptr(zone->pageset, cpu); 1261 pset = per_cpu_ptr(zone->pageset, cpu);
1262 1262
1263 pcp = &pset->pcp; 1263 pcp = &pset->pcp;
1264 if (pcp->count) { 1264 if (pcp->count) {
1265 free_pcppages_bulk(zone, pcp->count, pcp); 1265 free_pcppages_bulk(zone, pcp->count, pcp);
1266 pcp->count = 0; 1266 pcp->count = 0;
1267 } 1267 }
1268 local_irq_restore(flags); 1268 local_irq_restore(flags);
1269 } 1269 }
1270 } 1270 }
1271 1271
1272 /* 1272 /*
1273 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1273 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1274 */ 1274 */
1275 void drain_local_pages(void *arg) 1275 void drain_local_pages(void *arg)
1276 { 1276 {
1277 drain_pages(smp_processor_id()); 1277 drain_pages(smp_processor_id());
1278 } 1278 }
1279 1279
1280 /* 1280 /*
1281 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1281 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1282 * 1282 *
1283 * Note that this code is protected against sending an IPI to an offline 1283 * Note that this code is protected against sending an IPI to an offline
1284 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1284 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1285 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1285 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1286 * nothing keeps CPUs from showing up after we populated the cpumask and 1286 * nothing keeps CPUs from showing up after we populated the cpumask and
1287 * before the call to on_each_cpu_mask(). 1287 * before the call to on_each_cpu_mask().
1288 */ 1288 */
1289 void drain_all_pages(void) 1289 void drain_all_pages(void)
1290 { 1290 {
1291 int cpu; 1291 int cpu;
1292 struct per_cpu_pageset *pcp; 1292 struct per_cpu_pageset *pcp;
1293 struct zone *zone; 1293 struct zone *zone;
1294 1294
1295 /* 1295 /*
1296 * Allocate in the BSS so we won't require allocation in 1296 * Allocate in the BSS so we won't require allocation in
1297 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1297 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1298 */ 1298 */
1299 static cpumask_t cpus_with_pcps; 1299 static cpumask_t cpus_with_pcps;
1300 1300
1301 /* 1301 /*
1302 * We don't care about racing with CPU hotplug event 1302 * We don't care about racing with CPU hotplug event
1303 * as offline notification will cause the notified 1303 * as offline notification will cause the notified
1304 * cpu to drain that CPU pcps and on_each_cpu_mask 1304 * cpu to drain that CPU pcps and on_each_cpu_mask
1305 * disables preemption as part of its processing 1305 * disables preemption as part of its processing
1306 */ 1306 */
1307 for_each_online_cpu(cpu) { 1307 for_each_online_cpu(cpu) {
1308 bool has_pcps = false; 1308 bool has_pcps = false;
1309 for_each_populated_zone(zone) { 1309 for_each_populated_zone(zone) {
1310 pcp = per_cpu_ptr(zone->pageset, cpu); 1310 pcp = per_cpu_ptr(zone->pageset, cpu);
1311 if (pcp->pcp.count) { 1311 if (pcp->pcp.count) {
1312 has_pcps = true; 1312 has_pcps = true;
1313 break; 1313 break;
1314 } 1314 }
1315 } 1315 }
1316 if (has_pcps) 1316 if (has_pcps)
1317 cpumask_set_cpu(cpu, &cpus_with_pcps); 1317 cpumask_set_cpu(cpu, &cpus_with_pcps);
1318 else 1318 else
1319 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1319 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1320 } 1320 }
1321 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1321 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1322 } 1322 }
1323 1323
1324 #ifdef CONFIG_HIBERNATION 1324 #ifdef CONFIG_HIBERNATION
1325 1325
1326 void mark_free_pages(struct zone *zone) 1326 void mark_free_pages(struct zone *zone)
1327 { 1327 {
1328 unsigned long pfn, max_zone_pfn; 1328 unsigned long pfn, max_zone_pfn;
1329 unsigned long flags; 1329 unsigned long flags;
1330 int order, t; 1330 int order, t;
1331 struct list_head *curr; 1331 struct list_head *curr;
1332 1332
1333 if (zone_is_empty(zone)) 1333 if (zone_is_empty(zone))
1334 return; 1334 return;
1335 1335
1336 spin_lock_irqsave(&zone->lock, flags); 1336 spin_lock_irqsave(&zone->lock, flags);
1337 1337
1338 max_zone_pfn = zone_end_pfn(zone); 1338 max_zone_pfn = zone_end_pfn(zone);
1339 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1339 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1340 if (pfn_valid(pfn)) { 1340 if (pfn_valid(pfn)) {
1341 struct page *page = pfn_to_page(pfn); 1341 struct page *page = pfn_to_page(pfn);
1342 1342
1343 if (!swsusp_page_is_forbidden(page)) 1343 if (!swsusp_page_is_forbidden(page))
1344 swsusp_unset_page_free(page); 1344 swsusp_unset_page_free(page);
1345 } 1345 }
1346 1346
1347 for_each_migratetype_order(order, t) { 1347 for_each_migratetype_order(order, t) {
1348 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1348 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1349 unsigned long i; 1349 unsigned long i;
1350 1350
1351 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1351 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1352 for (i = 0; i < (1UL << order); i++) 1352 for (i = 0; i < (1UL << order); i++)
1353 swsusp_set_page_free(pfn_to_page(pfn + i)); 1353 swsusp_set_page_free(pfn_to_page(pfn + i));
1354 } 1354 }
1355 } 1355 }
1356 spin_unlock_irqrestore(&zone->lock, flags); 1356 spin_unlock_irqrestore(&zone->lock, flags);
1357 } 1357 }
1358 #endif /* CONFIG_HIBERNATION */ 1358 #endif /* CONFIG_HIBERNATION */
1359 1359
1360 /* 1360 /*
1361 * Free a 0-order page 1361 * Free a 0-order page
1362 * cold == 1 ? free a cold page : free a hot page 1362 * cold == 1 ? free a cold page : free a hot page
1363 */ 1363 */
1364 void free_hot_cold_page(struct page *page, int cold) 1364 void free_hot_cold_page(struct page *page, int cold)
1365 { 1365 {
1366 struct zone *zone = page_zone(page); 1366 struct zone *zone = page_zone(page);
1367 struct per_cpu_pages *pcp; 1367 struct per_cpu_pages *pcp;
1368 unsigned long flags; 1368 unsigned long flags;
1369 int migratetype; 1369 int migratetype;
1370 1370
1371 if (!free_pages_prepare(page, 0)) 1371 if (!free_pages_prepare(page, 0))
1372 return; 1372 return;
1373 1373
1374 migratetype = get_pageblock_migratetype(page); 1374 migratetype = get_pageblock_migratetype(page);
1375 set_freepage_migratetype(page, migratetype); 1375 set_freepage_migratetype(page, migratetype);
1376 local_irq_save(flags); 1376 local_irq_save(flags);
1377 __count_vm_event(PGFREE); 1377 __count_vm_event(PGFREE);
1378 1378
1379 /* 1379 /*
1380 * We only track unmovable, reclaimable and movable on pcp lists. 1380 * We only track unmovable, reclaimable and movable on pcp lists.
1381 * Free ISOLATE pages back to the allocator because they are being 1381 * Free ISOLATE pages back to the allocator because they are being
1382 * offlined, but treat RESERVE as movable pages so we can get those 1382 * offlined, but treat RESERVE as movable pages so we can get those
1383 * areas back if necessary. Otherwise, we may have to free 1383 * areas back if necessary. Otherwise, we may have to free
1384 * excessively into the page allocator. 1384 * excessively into the page allocator.
1385 */ 1385 */
1386 if (migratetype >= MIGRATE_PCPTYPES) { 1386 if (migratetype >= MIGRATE_PCPTYPES) {
1387 if (unlikely(is_migrate_isolate(migratetype))) { 1387 if (unlikely(is_migrate_isolate(migratetype))) {
1388 free_one_page(zone, page, 0, migratetype); 1388 free_one_page(zone, page, 0, migratetype);
1389 goto out; 1389 goto out;
1390 } 1390 }
1391 migratetype = MIGRATE_MOVABLE; 1391 migratetype = MIGRATE_MOVABLE;
1392 } 1392 }
1393 1393
1394 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1394 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1395 if (cold) 1395 if (cold)
1396 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1396 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1397 else 1397 else
1398 list_add(&page->lru, &pcp->lists[migratetype]); 1398 list_add(&page->lru, &pcp->lists[migratetype]);
1399 pcp->count++; 1399 pcp->count++;
1400 if (pcp->count >= pcp->high) { 1400 if (pcp->count >= pcp->high) {
1401 unsigned long batch = ACCESS_ONCE(pcp->batch); 1401 unsigned long batch = ACCESS_ONCE(pcp->batch);
1402 free_pcppages_bulk(zone, batch, pcp); 1402 free_pcppages_bulk(zone, batch, pcp);
1403 pcp->count -= batch; 1403 pcp->count -= batch;
1404 } 1404 }
1405 1405
1406 out: 1406 out:
1407 local_irq_restore(flags); 1407 local_irq_restore(flags);
1408 } 1408 }
1409 1409
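/*
 * Illustrative sketch (not kernel code): the per-cpu list trimming at the end
 * of free_hot_cold_page(). Once the cached count reaches the high watermark,
 * one batch is handed back to the buddy allocator so a hot working set stays
 * cached. The high/batch values below are arbitrary examples.
 */
#include <stdio.h>

int main(void)
{
        int high = 186, batch = 31;     /* example pcp->high / pcp->batch */
        int count = 185;

        count++;                        /* a page was just freed to the pcp list */
        if (count >= high) {
                count -= batch;         /* bulk-free one batch to the buddy lists */
                printf("trimmed pcp list back to %d pages\n", count);
        }
        return 0;
}
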
1410 /* 1410 /*
1411 * Free a list of 0-order pages 1411 * Free a list of 0-order pages
1412 */ 1412 */
1413 void free_hot_cold_page_list(struct list_head *list, int cold) 1413 void free_hot_cold_page_list(struct list_head *list, int cold)
1414 { 1414 {
1415 struct page *page, *next; 1415 struct page *page, *next;
1416 1416
1417 list_for_each_entry_safe(page, next, list, lru) { 1417 list_for_each_entry_safe(page, next, list, lru) {
1418 trace_mm_page_free_batched(page, cold); 1418 trace_mm_page_free_batched(page, cold);
1419 free_hot_cold_page(page, cold); 1419 free_hot_cold_page(page, cold);
1420 } 1420 }
1421 } 1421 }
1422 1422
1423 /* 1423 /*
1424 * split_page takes a non-compound higher-order page, and splits it into 1424 * split_page takes a non-compound higher-order page, and splits it into
1425 * n (1<<order) sub-pages: page[0..n-1] 1425 * n (1<<order) sub-pages: page[0..n-1]
1426 * Each sub-page must be freed individually. 1426 * Each sub-page must be freed individually.
1427 * 1427 *
1428 * Note: this is probably too low level an operation for use in drivers. 1428 * Note: this is probably too low level an operation for use in drivers.
1429 * Please consult with lkml before using this in your driver. 1429 * Please consult with lkml before using this in your driver.
1430 */ 1430 */
1431 void split_page(struct page *page, unsigned int order) 1431 void split_page(struct page *page, unsigned int order)
1432 { 1432 {
1433 int i; 1433 int i;
1434 1434
1435 VM_BUG_ON(PageCompound(page)); 1435 VM_BUG_ON(PageCompound(page));
1436 VM_BUG_ON(!page_count(page)); 1436 VM_BUG_ON(!page_count(page));
1437 1437
1438 #ifdef CONFIG_KMEMCHECK 1438 #ifdef CONFIG_KMEMCHECK
1439 /* 1439 /*
1440 * Split shadow pages too, because free(page[0]) would 1440 * Split shadow pages too, because free(page[0]) would
1441 * otherwise free the whole shadow. 1441 * otherwise free the whole shadow.
1442 */ 1442 */
1443 if (kmemcheck_page_is_tracked(page)) 1443 if (kmemcheck_page_is_tracked(page))
1444 split_page(virt_to_page(page[0].shadow), order); 1444 split_page(virt_to_page(page[0].shadow), order);
1445 #endif 1445 #endif
1446 1446
1447 for (i = 1; i < (1 << order); i++) 1447 for (i = 1; i < (1 << order); i++)
1448 set_page_refcounted(page + i); 1448 set_page_refcounted(page + i);
1449 } 1449 }
1450 EXPORT_SYMBOL_GPL(split_page); 1450 EXPORT_SYMBOL_GPL(split_page);
1451 1451
1452 static int __isolate_free_page(struct page *page, unsigned int order) 1452 static int __isolate_free_page(struct page *page, unsigned int order)
1453 { 1453 {
1454 unsigned long watermark; 1454 unsigned long watermark;
1455 struct zone *zone; 1455 struct zone *zone;
1456 int mt; 1456 int mt;
1457 1457
1458 BUG_ON(!PageBuddy(page)); 1458 BUG_ON(!PageBuddy(page));
1459 1459
1460 zone = page_zone(page); 1460 zone = page_zone(page);
1461 mt = get_pageblock_migratetype(page); 1461 mt = get_pageblock_migratetype(page);
1462 1462
1463 if (!is_migrate_isolate(mt)) { 1463 if (!is_migrate_isolate(mt)) {
1464 /* Obey watermarks as if the page was being allocated */ 1464 /* Obey watermarks as if the page was being allocated */
1465 watermark = low_wmark_pages(zone) + (1 << order); 1465 watermark = low_wmark_pages(zone) + (1 << order);
1466 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1466 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1467 return 0; 1467 return 0;
1468 1468
1469 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1469 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1470 } 1470 }
1471 1471
1472 /* Remove page from free list */ 1472 /* Remove page from free list */
1473 list_del(&page->lru); 1473 list_del(&page->lru);
1474 zone->free_area[order].nr_free--; 1474 zone->free_area[order].nr_free--;
1475 rmv_page_order(page); 1475 rmv_page_order(page);
1476 1476
1477 /* Set the pageblock migratetype if the isolated page is at least a pageblock */ 1477 /* Set the pageblock migratetype if the isolated page is at least a pageblock */
1478 if (order >= pageblock_order - 1) { 1478 if (order >= pageblock_order - 1) {
1479 struct page *endpage = page + (1 << order) - 1; 1479 struct page *endpage = page + (1 << order) - 1;
1480 for (; page < endpage; page += pageblock_nr_pages) { 1480 for (; page < endpage; page += pageblock_nr_pages) {
1481 int mt = get_pageblock_migratetype(page); 1481 int mt = get_pageblock_migratetype(page);
1482 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1482 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1483 set_pageblock_migratetype(page, 1483 set_pageblock_migratetype(page,
1484 MIGRATE_MOVABLE); 1484 MIGRATE_MOVABLE);
1485 } 1485 }
1486 } 1486 }
1487 1487
1488 return 1UL << order; 1488 return 1UL << order;
1489 } 1489 }
1490 1490
1491 /* 1491 /*
1492 * Similar to split_page except the page is already free. As this is only 1492 * Similar to split_page except the page is already free. As this is only
1493 * being used for migration, the migratetype of the block also changes. 1493 * being used for migration, the migratetype of the block also changes.
1494 * As this is called with interrupts disabled, the caller is responsible 1494 * As this is called with interrupts disabled, the caller is responsible
1495 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1495 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1496 * are enabled. 1496 * are enabled.
1497 * 1497 *
1498 * Note: this is probably too low level an operation for use in drivers. 1498 * Note: this is probably too low level an operation for use in drivers.
1499 * Please consult with lkml before using this in your driver. 1499 * Please consult with lkml before using this in your driver.
1500 */ 1500 */
1501 int split_free_page(struct page *page) 1501 int split_free_page(struct page *page)
1502 { 1502 {
1503 unsigned int order; 1503 unsigned int order;
1504 int nr_pages; 1504 int nr_pages;
1505 1505
1506 order = page_order(page); 1506 order = page_order(page);
1507 1507
1508 nr_pages = __isolate_free_page(page, order); 1508 nr_pages = __isolate_free_page(page, order);
1509 if (!nr_pages) 1509 if (!nr_pages)
1510 return 0; 1510 return 0;
1511 1511
1512 /* Split into individual pages */ 1512 /* Split into individual pages */
1513 set_page_refcounted(page); 1513 set_page_refcounted(page);
1514 split_page(page, order); 1514 split_page(page, order);
1515 return nr_pages; 1515 return nr_pages;
1516 } 1516 }
1517 1517
1518 /* 1518 /*
1519 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1519 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1520 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1520 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1521 * or two. 1521 * or two.
1522 */ 1522 */
1523 static inline 1523 static inline
1524 struct page *buffered_rmqueue(struct zone *preferred_zone, 1524 struct page *buffered_rmqueue(struct zone *preferred_zone,
1525 struct zone *zone, int order, gfp_t gfp_flags, 1525 struct zone *zone, int order, gfp_t gfp_flags,
1526 int migratetype) 1526 int migratetype)
1527 { 1527 {
1528 unsigned long flags; 1528 unsigned long flags;
1529 struct page *page; 1529 struct page *page;
1530 int cold = !!(gfp_flags & __GFP_COLD); 1530 int cold = !!(gfp_flags & __GFP_COLD);
1531 1531
1532 again: 1532 again:
1533 if (likely(order == 0)) { 1533 if (likely(order == 0)) {
1534 struct per_cpu_pages *pcp; 1534 struct per_cpu_pages *pcp;
1535 struct list_head *list; 1535 struct list_head *list;
1536 1536
1537 local_irq_save(flags); 1537 local_irq_save(flags);
1538 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1538 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1539 list = &pcp->lists[migratetype]; 1539 list = &pcp->lists[migratetype];
1540 if (list_empty(list)) { 1540 if (list_empty(list)) {
1541 pcp->count += rmqueue_bulk(zone, 0, 1541 pcp->count += rmqueue_bulk(zone, 0,
1542 pcp->batch, list, 1542 pcp->batch, list,
1543 migratetype, cold); 1543 migratetype, cold);
1544 if (unlikely(list_empty(list))) 1544 if (unlikely(list_empty(list)))
1545 goto failed; 1545 goto failed;
1546 } 1546 }
1547 1547
1548 if (cold) 1548 if (cold)
1549 page = list_entry(list->prev, struct page, lru); 1549 page = list_entry(list->prev, struct page, lru);
1550 else 1550 else
1551 page = list_entry(list->next, struct page, lru); 1551 page = list_entry(list->next, struct page, lru);
1552 1552
1553 list_del(&page->lru); 1553 list_del(&page->lru);
1554 pcp->count--; 1554 pcp->count--;
1555 } else { 1555 } else {
1556 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1556 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1557 /* 1557 /*
1558 * __GFP_NOFAIL is not to be used in new code. 1558 * __GFP_NOFAIL is not to be used in new code.
1559 * 1559 *
1560 * All __GFP_NOFAIL callers should be fixed so that they 1560 * All __GFP_NOFAIL callers should be fixed so that they
1561 * properly detect and handle allocation failures. 1561 * properly detect and handle allocation failures.
1562 * 1562 *
1563 * We most definitely don't want callers attempting to 1563 * We most definitely don't want callers attempting to
1564 * allocate greater than order-1 page units with 1564 * allocate greater than order-1 page units with
1565 * __GFP_NOFAIL. 1565 * __GFP_NOFAIL.
1566 */ 1566 */
1567 WARN_ON_ONCE(order > 1); 1567 WARN_ON_ONCE(order > 1);
1568 } 1568 }
1569 spin_lock_irqsave(&zone->lock, flags); 1569 spin_lock_irqsave(&zone->lock, flags);
1570 page = __rmqueue(zone, order, migratetype); 1570 page = __rmqueue(zone, order, migratetype);
1571 spin_unlock(&zone->lock); 1571 spin_unlock(&zone->lock);
1572 if (!page) 1572 if (!page)
1573 goto failed; 1573 goto failed;
1574 __mod_zone_freepage_state(zone, -(1 << order), 1574 __mod_zone_freepage_state(zone, -(1 << order),
1575 get_freepage_migratetype(page)); 1575 get_freepage_migratetype(page));
1576 } 1576 }
1577 1577
1578 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1578 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1579 1579
1580 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1580 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1581 zone_statistics(preferred_zone, zone, gfp_flags); 1581 zone_statistics(preferred_zone, zone, gfp_flags);
1582 local_irq_restore(flags); 1582 local_irq_restore(flags);
1583 1583
1584 VM_BUG_ON(bad_range(zone, page)); 1584 VM_BUG_ON(bad_range(zone, page));
1585 if (prep_new_page(page, order, gfp_flags)) 1585 if (prep_new_page(page, order, gfp_flags))
1586 goto again; 1586 goto again;
1587 return page; 1587 return page;
1588 1588
1589 failed: 1589 failed:
1590 local_irq_restore(flags); 1590 local_irq_restore(flags);
1591 return NULL; 1591 return NULL;
1592 } 1592 }
1593 1593
1594 #ifdef CONFIG_FAIL_PAGE_ALLOC 1594 #ifdef CONFIG_FAIL_PAGE_ALLOC
1595 1595
1596 static struct { 1596 static struct {
1597 struct fault_attr attr; 1597 struct fault_attr attr;
1598 1598
1599 u32 ignore_gfp_highmem; 1599 u32 ignore_gfp_highmem;
1600 u32 ignore_gfp_wait; 1600 u32 ignore_gfp_wait;
1601 u32 min_order; 1601 u32 min_order;
1602 } fail_page_alloc = { 1602 } fail_page_alloc = {
1603 .attr = FAULT_ATTR_INITIALIZER, 1603 .attr = FAULT_ATTR_INITIALIZER,
1604 .ignore_gfp_wait = 1, 1604 .ignore_gfp_wait = 1,
1605 .ignore_gfp_highmem = 1, 1605 .ignore_gfp_highmem = 1,
1606 .min_order = 1, 1606 .min_order = 1,
1607 }; 1607 };
1608 1608
1609 static int __init setup_fail_page_alloc(char *str) 1609 static int __init setup_fail_page_alloc(char *str)
1610 { 1610 {
1611 return setup_fault_attr(&fail_page_alloc.attr, str); 1611 return setup_fault_attr(&fail_page_alloc.attr, str);
1612 } 1612 }
1613 __setup("fail_page_alloc=", setup_fail_page_alloc); 1613 __setup("fail_page_alloc=", setup_fail_page_alloc);
1614 1614
1615 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1615 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1616 { 1616 {
1617 if (order < fail_page_alloc.min_order) 1617 if (order < fail_page_alloc.min_order)
1618 return false; 1618 return false;
1619 if (gfp_mask & __GFP_NOFAIL) 1619 if (gfp_mask & __GFP_NOFAIL)
1620 return false; 1620 return false;
1621 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1621 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1622 return false; 1622 return false;
1623 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1623 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1624 return false; 1624 return false;
1625 1625
1626 return should_fail(&fail_page_alloc.attr, 1 << order); 1626 return should_fail(&fail_page_alloc.attr, 1 << order);
1627 } 1627 }
1628 1628
1629 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1629 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1630 1630
1631 static int __init fail_page_alloc_debugfs(void) 1631 static int __init fail_page_alloc_debugfs(void)
1632 { 1632 {
1633 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1633 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1634 struct dentry *dir; 1634 struct dentry *dir;
1635 1635
1636 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1636 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1637 &fail_page_alloc.attr); 1637 &fail_page_alloc.attr);
1638 if (IS_ERR(dir)) 1638 if (IS_ERR(dir))
1639 return PTR_ERR(dir); 1639 return PTR_ERR(dir);
1640 1640
1641 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1641 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1642 &fail_page_alloc.ignore_gfp_wait)) 1642 &fail_page_alloc.ignore_gfp_wait))
1643 goto fail; 1643 goto fail;
1644 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1644 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1645 &fail_page_alloc.ignore_gfp_highmem)) 1645 &fail_page_alloc.ignore_gfp_highmem))
1646 goto fail; 1646 goto fail;
1647 if (!debugfs_create_u32("min-order", mode, dir, 1647 if (!debugfs_create_u32("min-order", mode, dir,
1648 &fail_page_alloc.min_order)) 1648 &fail_page_alloc.min_order))
1649 goto fail; 1649 goto fail;
1650 1650
1651 return 0; 1651 return 0;
1652 fail: 1652 fail:
1653 debugfs_remove_recursive(dir); 1653 debugfs_remove_recursive(dir);
1654 1654
1655 return -ENOMEM; 1655 return -ENOMEM;
1656 } 1656 }
1657 1657
1658 late_initcall(fail_page_alloc_debugfs); 1658 late_initcall(fail_page_alloc_debugfs);
1659 1659
1660 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1660 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1661 1661
1662 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1662 #else /* CONFIG_FAIL_PAGE_ALLOC */
1663 1663
1664 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1664 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1665 { 1665 {
1666 return false; 1666 return false;
1667 } 1667 }
1668 1668
1669 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1669 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1670 1670
1671 /* 1671 /*
1672 * Return true if free pages are above 'mark'. This takes into account the order 1672 * Return true if free pages are above 'mark'. This takes into account the order
1673 * of the allocation. 1673 * of the allocation.
1674 */ 1674 */
1675 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1675 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1676 int classzone_idx, int alloc_flags, long free_pages) 1676 int classzone_idx, int alloc_flags, long free_pages)
1677 { 1677 {
1678 /* free_pages may go negative - that's OK */ 1678 /* free_pages may go negative - that's OK */
1679 long min = mark; 1679 long min = mark;
1680 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1680 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1681 int o; 1681 int o;
1682 long free_cma = 0; 1682 long free_cma = 0;
1683 1683
1684 free_pages -= (1 << order) - 1; 1684 free_pages -= (1 << order) - 1;
1685 if (alloc_flags & ALLOC_HIGH) 1685 if (alloc_flags & ALLOC_HIGH)
1686 min -= min / 2; 1686 min -= min / 2;
1687 if (alloc_flags & ALLOC_HARDER) 1687 if (alloc_flags & ALLOC_HARDER)
1688 min -= min / 4; 1688 min -= min / 4;
1689 #ifdef CONFIG_CMA 1689 #ifdef CONFIG_CMA
1690 /* If allocation can't use CMA areas don't use free CMA pages */ 1690 /* If allocation can't use CMA areas don't use free CMA pages */
1691 if (!(alloc_flags & ALLOC_CMA)) 1691 if (!(alloc_flags & ALLOC_CMA))
1692 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1692 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1693 #endif 1693 #endif
1694 1694
1695 if (free_pages - free_cma <= min + lowmem_reserve) 1695 if (free_pages - free_cma <= min + lowmem_reserve)
1696 return false; 1696 return false;
1697 for (o = 0; o < order; o++) { 1697 for (o = 0; o < order; o++) {
1698 /* At the next order, this order's pages become unavailable */ 1698 /* At the next order, this order's pages become unavailable */
1699 free_pages -= z->free_area[o].nr_free << o; 1699 free_pages -= z->free_area[o].nr_free << o;
1700 1700
1701 /* Require fewer higher order pages to be free */ 1701 /* Require fewer higher order pages to be free */
1702 min >>= 1; 1702 min >>= 1;
1703 1703
1704 if (free_pages <= min) 1704 if (free_pages <= min)
1705 return false; 1705 return false;
1706 } 1706 }
1707 return true; 1707 return true;
1708 } 1708 }
1709 1709
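/*
 * Illustrative sketch (not kernel code): the order-by-order walk in
 * __zone_watermark_ok() above, with made-up free_area counts. At each order
 * below the request, that order's free pages are discounted and the required
 * minimum is halved. The lowmem_reserve and CMA adjustments are omitted.
 */
#include <stdio.h>

int main(void)
{
        int order = 3;
        long min = 128;                         /* example watermark */
        long free_pages = 900;
        long nr_free[3] = { 200, 60, 25 };      /* free blocks at orders 0..2 */

        free_pages -= (1 << order) - 1;         /* be pessimistic about the request */
        for (int o = 0; o < order; o++) {
                free_pages -= nr_free[o] << o;  /* order-o pages can't satisfy us */
                min >>= 1;                      /* but demand less slack higher up */
                if (free_pages <= min) {
                        printf("order %d fails at o=%d (free %ld <= min %ld)\n",
                               order, o, free_pages, min);
                        return 0;
                }
        }
        printf("order %d allocation is above the watermark (free %ld)\n",
               order, free_pages);
        return 0;
}
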
1710 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1710 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1711 int classzone_idx, int alloc_flags) 1711 int classzone_idx, int alloc_flags)
1712 { 1712 {
1713 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1713 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1714 zone_page_state(z, NR_FREE_PAGES)); 1714 zone_page_state(z, NR_FREE_PAGES));
1715 } 1715 }
1716 1716
1717 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1717 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1718 int classzone_idx, int alloc_flags) 1718 int classzone_idx, int alloc_flags)
1719 { 1719 {
1720 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1720 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1721 1721
1722 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1722 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1723 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1723 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1724 1724
1725 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1725 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1726 free_pages); 1726 free_pages);
1727 } 1727 }
1728 1728
1729 #ifdef CONFIG_NUMA 1729 #ifdef CONFIG_NUMA
1730 /* 1730 /*
1731 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1731 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1732 * skip over zones that are not allowed by the cpuset, or that have 1732 * skip over zones that are not allowed by the cpuset, or that have
1733 * been recently (in last second) found to be nearly full. See further 1733 * been recently (in last second) found to be nearly full. See further
1734 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1734 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1735 * that have to skip over a lot of full or unallowed zones. 1735 * that have to skip over a lot of full or unallowed zones.
1736 * 1736 *
1737 * If the zonelist cache is present in the passed in zonelist, then 1737 * If the zonelist cache is present in the passed in zonelist, then
1738 * returns a pointer to the allowed node mask (either the current 1738 * returns a pointer to the allowed node mask (either the current
1739 * task's mems_allowed, or node_states[N_MEMORY].) 1739 * task's mems_allowed, or node_states[N_MEMORY].)
1740 * 1740 *
1741 * If the zonelist cache is not available for this zonelist, does 1741 * If the zonelist cache is not available for this zonelist, does
1742 * nothing and returns NULL. 1742 * nothing and returns NULL.
1743 * 1743 *
1744 * If the fullzones BITMAP in the zonelist cache is stale (more than 1744 * If the fullzones BITMAP in the zonelist cache is stale (more than
1745 * a second since last zap'd) then we zap it out (clear its bits.) 1745 * a second since last zap'd) then we zap it out (clear its bits.)
1746 * 1746 *
1747 * We hold off even calling zlc_setup, until after we've checked the 1747 * We hold off even calling zlc_setup, until after we've checked the
1748 * first zone in the zonelist, on the theory that most allocations will 1748 * first zone in the zonelist, on the theory that most allocations will
1749 * be satisfied from that first zone, so best to examine that zone as 1749 * be satisfied from that first zone, so best to examine that zone as
1750 * quickly as we can. 1750 * quickly as we can.
1751 */ 1751 */
1752 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1752 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1753 { 1753 {
1754 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1754 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1755 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1755 nodemask_t *allowednodes; /* zonelist_cache approximation */
1756 1756
1757 zlc = zonelist->zlcache_ptr; 1757 zlc = zonelist->zlcache_ptr;
1758 if (!zlc) 1758 if (!zlc)
1759 return NULL; 1759 return NULL;
1760 1760
1761 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1761 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1762 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1762 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1763 zlc->last_full_zap = jiffies; 1763 zlc->last_full_zap = jiffies;
1764 } 1764 }
1765 1765
1766 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1766 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1767 &cpuset_current_mems_allowed : 1767 &cpuset_current_mems_allowed :
1768 &node_states[N_MEMORY]; 1768 &node_states[N_MEMORY];
1769 return allowednodes; 1769 return allowednodes;
1770 } 1770 }
1771 1771
1772 /* 1772 /*
1773 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1773 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1774 * if it is worth looking at further for free memory: 1774 * if it is worth looking at further for free memory:
1775 * 1) Check that the zone isn't thought to be full (doesn't have its 1775 * 1) Check that the zone isn't thought to be full (doesn't have its
1776 * bit set in the zonelist_cache fullzones BITMAP). 1776 * bit set in the zonelist_cache fullzones BITMAP).
1777 * 2) Check that the zone's node (obtained from the zonelist_cache 1777 * 2) Check that the zone's node (obtained from the zonelist_cache
1778 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1778 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1779 * Return true (non-zero) if zone is worth looking at further, or 1779 * Return true (non-zero) if zone is worth looking at further, or
1780 * else return false (zero) if it is not. 1780 * else return false (zero) if it is not.
1781 * 1781 *
1782 * This check -ignores- the distinction between various watermarks, 1782 * This check -ignores- the distinction between various watermarks,
1783 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1783 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1784 * found to be full for any variation of these watermarks, it will 1784 * found to be full for any variation of these watermarks, it will
1785 * be considered full for up to one second by all requests, unless 1785 * be considered full for up to one second by all requests, unless
1786 * we are so low on memory on all allowed nodes that we are forced 1786 * we are so low on memory on all allowed nodes that we are forced
1787 * into the second scan of the zonelist. 1787 * into the second scan of the zonelist.
1788 * 1788 *
1789 * In the second scan we ignore this zonelist cache and exactly 1789 * In the second scan we ignore this zonelist cache and exactly
1790 * apply the watermarks to all zones, even if it is slower to do so. 1790 * apply the watermarks to all zones, even if it is slower to do so.
1791 * We are low on memory in the second scan, and should leave no stone 1791 * We are low on memory in the second scan, and should leave no stone
1792 * unturned looking for a free page. 1792 * unturned looking for a free page.
1793 */ 1793 */
1794 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1794 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1795 nodemask_t *allowednodes) 1795 nodemask_t *allowednodes)
1796 { 1796 {
1797 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1797 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1798 int i; /* index of *z in zonelist zones */ 1798 int i; /* index of *z in zonelist zones */
1799 int n; /* node that zone *z is on */ 1799 int n; /* node that zone *z is on */
1800 1800
1801 zlc = zonelist->zlcache_ptr; 1801 zlc = zonelist->zlcache_ptr;
1802 if (!zlc) 1802 if (!zlc)
1803 return 1; 1803 return 1;
1804 1804
1805 i = z - zonelist->_zonerefs; 1805 i = z - zonelist->_zonerefs;
1806 n = zlc->z_to_n[i]; 1806 n = zlc->z_to_n[i];
1807 1807
1808 /* This zone is worth trying if it is allowed but not full */ 1808 /* This zone is worth trying if it is allowed but not full */
1809 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1809 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1810 } 1810 }
1811 1811
1812 /* 1812 /*
1813 * Given 'z' scanning a zonelist, set the corresponding bit in 1813 * Given 'z' scanning a zonelist, set the corresponding bit in
1814 * zlc->fullzones, so that subsequent attempts to allocate a page 1814 * zlc->fullzones, so that subsequent attempts to allocate a page
1815 * from that zone don't waste time re-examining it. 1815 * from that zone don't waste time re-examining it.
1816 */ 1816 */
1817 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1817 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1818 { 1818 {
1819 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1819 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1820 int i; /* index of *z in zonelist zones */ 1820 int i; /* index of *z in zonelist zones */
1821 1821
1822 zlc = zonelist->zlcache_ptr; 1822 zlc = zonelist->zlcache_ptr;
1823 if (!zlc) 1823 if (!zlc)
1824 return; 1824 return;
1825 1825
1826 i = z - zonelist->_zonerefs; 1826 i = z - zonelist->_zonerefs;
1827 1827
1828 set_bit(i, zlc->fullzones); 1828 set_bit(i, zlc->fullzones);
1829 } 1829 }
1830 1830
1831 /* 1831 /*
1832 * clear all zones full, called after direct reclaim makes progress so that 1832 * clear all zones full, called after direct reclaim makes progress so that
1833 * a zone that was recently full is not skipped over for up to a second 1833 * a zone that was recently full is not skipped over for up to a second
1834 */ 1834 */
1835 static void zlc_clear_zones_full(struct zonelist *zonelist) 1835 static void zlc_clear_zones_full(struct zonelist *zonelist)
1836 { 1836 {
1837 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1837 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1838 1838
1839 zlc = zonelist->zlcache_ptr; 1839 zlc = zonelist->zlcache_ptr;
1840 if (!zlc) 1840 if (!zlc)
1841 return; 1841 return;
1842 1842
1843 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1843 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1844 } 1844 }
1845 1845
1846 static bool zone_local(struct zone *local_zone, struct zone *zone) 1846 static bool zone_local(struct zone *local_zone, struct zone *zone)
1847 { 1847 {
1848 return local_zone->node == zone->node; 1848 return local_zone->node == zone->node;
1849 } 1849 }
1850 1850
1851 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1851 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1852 { 1852 {
1853 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1853 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1854 } 1854 }
1855 1855
1856 static void __paginginit init_zone_allows_reclaim(int nid) 1856 static void __paginginit init_zone_allows_reclaim(int nid)
1857 { 1857 {
1858 int i; 1858 int i;
1859 1859
1860 for_each_node_state(i, N_MEMORY) 1860 for_each_node_state(i, N_MEMORY)
1861 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1861 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1862 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1862 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1863 else 1863 else
1864 zone_reclaim_mode = 1; 1864 zone_reclaim_mode = 1;
1865 } 1865 }
1866 1866
1867 #else /* CONFIG_NUMA */ 1867 #else /* CONFIG_NUMA */
1868 1868
1869 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1869 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1870 { 1870 {
1871 return NULL; 1871 return NULL;
1872 } 1872 }
1873 1873
1874 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1874 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1875 nodemask_t *allowednodes) 1875 nodemask_t *allowednodes)
1876 { 1876 {
1877 return 1; 1877 return 1;
1878 } 1878 }
1879 1879
1880 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1880 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1881 { 1881 {
1882 } 1882 }
1883 1883
1884 static void zlc_clear_zones_full(struct zonelist *zonelist) 1884 static void zlc_clear_zones_full(struct zonelist *zonelist)
1885 { 1885 {
1886 } 1886 }
1887 1887
1888 static bool zone_local(struct zone *local_zone, struct zone *zone) 1888 static bool zone_local(struct zone *local_zone, struct zone *zone)
1889 { 1889 {
1890 return true; 1890 return true;
1891 } 1891 }
1892 1892
1893 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1893 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1894 { 1894 {
1895 return true; 1895 return true;
1896 } 1896 }
1897 1897
1898 static inline void init_zone_allows_reclaim(int nid) 1898 static inline void init_zone_allows_reclaim(int nid)
1899 { 1899 {
1900 } 1900 }
1901 #endif /* CONFIG_NUMA */ 1901 #endif /* CONFIG_NUMA */
1902 1902
1903 /* 1903 /*
1904 * get_page_from_freelist goes through the zonelist trying to allocate 1904 * get_page_from_freelist goes through the zonelist trying to allocate
1905 * a page. 1905 * a page.
1906 */ 1906 */
1907 static struct page * 1907 static struct page *
1908 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1908 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1909 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1909 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1910 struct zone *preferred_zone, int classzone_idx, int migratetype) 1910 struct zone *preferred_zone, int classzone_idx, int migratetype)
1911 { 1911 {
1912 struct zoneref *z; 1912 struct zoneref *z;
1913 struct page *page = NULL; 1913 struct page *page = NULL;
1914 struct zone *zone; 1914 struct zone *zone;
1915 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1915 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1916 int zlc_active = 0; /* set if using zonelist_cache */ 1916 int zlc_active = 0; /* set if using zonelist_cache */
1917 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1917 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1918 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && 1918 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1919 (gfp_mask & __GFP_WRITE); 1919 (gfp_mask & __GFP_WRITE);
1920 1920
1921 zonelist_scan: 1921 zonelist_scan:
1922 /* 1922 /*
1923 * Scan zonelist, looking for a zone with enough free. 1923 * Scan zonelist, looking for a zone with enough free.
1924 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1924 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1925 */ 1925 */
1926 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1926 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1927 high_zoneidx, nodemask) { 1927 high_zoneidx, nodemask) {
1928 unsigned long mark; 1928 unsigned long mark;
1929 1929
1930 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1930 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1931 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1931 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1932 continue; 1932 continue;
1933 if (cpusets_enabled() && 1933 if (cpusets_enabled() &&
1934 (alloc_flags & ALLOC_CPUSET) && 1934 (alloc_flags & ALLOC_CPUSET) &&
1935 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1935 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1936 continue; 1936 continue;
1937 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1938 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1939 goto try_this_zone;
1940 /* 1937 /*
1941 * Distribute pages in proportion to the individual 1938 * Distribute pages in proportion to the individual
1942 * zone size to ensure fair page aging. The zone a 1939 * zone size to ensure fair page aging. The zone a
1943 * page was allocated in should have no effect on the 1940 * page was allocated in should have no effect on the
1944 * time the page has in memory before being reclaimed. 1941 * time the page has in memory before being reclaimed.
1945 */ 1942 */
1946 if (alloc_flags & ALLOC_FAIR) { 1943 if (alloc_flags & ALLOC_FAIR) {
1947 if (!zone_local(preferred_zone, zone)) 1944 if (!zone_local(preferred_zone, zone))
1948 continue; 1945 continue;
1949 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1946 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1950 continue; 1947 continue;
1951 } 1948 }
1952 /* 1949 /*
1953 * When allocating a page cache page for writing, we 1950 * When allocating a page cache page for writing, we
1954 * want to get it from a zone that is within its dirty 1951 * want to get it from a zone that is within its dirty
1955 * limit, such that no single zone holds more than its 1952 * limit, such that no single zone holds more than its
1956 * proportional share of globally allowed dirty pages. 1953 * proportional share of globally allowed dirty pages.
1957 * The dirty limits take into account the zone's 1954 * The dirty limits take into account the zone's
1958 * lowmem reserves and high watermark so that kswapd 1955 * lowmem reserves and high watermark so that kswapd
1959 * should be able to balance it without having to 1956 * should be able to balance it without having to
1960 * write pages from its LRU list. 1957 * write pages from its LRU list.
1961 * 1958 *
1962 * This may look like it could increase pressure on 1959 * This may look like it could increase pressure on
1963 * lower zones by failing allocations in higher zones 1960 * lower zones by failing allocations in higher zones
1964 * before they are full. But the pages that do spill 1961 * before they are full. But the pages that do spill
1965 * over are limited as the lower zones are protected 1962 * over are limited as the lower zones are protected
1966 * by this very same mechanism. It should not become 1963 * by this very same mechanism. It should not become
1967 * a practical burden to them. 1964 * a practical burden to them.
1968 * 1965 *
1969 * XXX: For now, allow allocations to potentially 1966 * XXX: For now, allow allocations to potentially
1970 * exceed the per-zone dirty limit in the slowpath 1967 * exceed the per-zone dirty limit in the slowpath
1971 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1968 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1972 * which is important when on a NUMA setup the allowed 1969 * which is important when on a NUMA setup the allowed
1973 * zones are together not big enough to reach the 1970 * zones are together not big enough to reach the
1974 * global limit. The proper fix for these situations 1971 * global limit. The proper fix for these situations
1975 * will require awareness of zones in the 1972 * will require awareness of zones in the
1976 * dirty-throttling and the flusher threads. 1973 * dirty-throttling and the flusher threads.
1977 */ 1974 */
1978 if (consider_zone_dirty && !zone_dirty_ok(zone)) 1975 if (consider_zone_dirty && !zone_dirty_ok(zone))
1979 continue; 1976 continue;
1980 1977
1981 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1978 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1982 if (!zone_watermark_ok(zone, order, mark, 1979 if (!zone_watermark_ok(zone, order, mark,
1983 classzone_idx, alloc_flags)) { 1980 classzone_idx, alloc_flags)) {
1984 int ret; 1981 int ret;
1982
1983 /* Checked here to keep the fast path fast */
1984 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1985 if (alloc_flags & ALLOC_NO_WATERMARKS)
1986 goto try_this_zone;
1985 1987
1986 if (IS_ENABLED(CONFIG_NUMA) && 1988 if (IS_ENABLED(CONFIG_NUMA) &&
1987 !did_zlc_setup && nr_online_nodes > 1) { 1989 !did_zlc_setup && nr_online_nodes > 1) {
1988 /* 1990 /*
1989 * we do zlc_setup if there are multiple nodes 1991 * we do zlc_setup if there are multiple nodes
1990 * and before considering the first zone allowed 1992 * and before considering the first zone allowed
1991 * by the cpuset. 1993 * by the cpuset.
1992 */ 1994 */
1993 allowednodes = zlc_setup(zonelist, alloc_flags); 1995 allowednodes = zlc_setup(zonelist, alloc_flags);
1994 zlc_active = 1; 1996 zlc_active = 1;
1995 did_zlc_setup = 1; 1997 did_zlc_setup = 1;
1996 } 1998 }
1997 1999
1998 if (zone_reclaim_mode == 0 || 2000 if (zone_reclaim_mode == 0 ||
1999 !zone_allows_reclaim(preferred_zone, zone)) 2001 !zone_allows_reclaim(preferred_zone, zone))
2000 goto this_zone_full; 2002 goto this_zone_full;
2001 2003
2002 /* 2004 /*
2003 * As we may have just activated ZLC, check if the first 2005 * As we may have just activated ZLC, check if the first
2004 * eligible zone has failed zone_reclaim recently. 2006 * eligible zone has failed zone_reclaim recently.
2005 */ 2007 */
2006 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 2008 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
2007 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 2009 !zlc_zone_worth_trying(zonelist, z, allowednodes))
2008 continue; 2010 continue;
2009 2011
2010 ret = zone_reclaim(zone, gfp_mask, order); 2012 ret = zone_reclaim(zone, gfp_mask, order);
2011 switch (ret) { 2013 switch (ret) {
2012 case ZONE_RECLAIM_NOSCAN: 2014 case ZONE_RECLAIM_NOSCAN:
2013 /* did not scan */ 2015 /* did not scan */
2014 continue; 2016 continue;
2015 case ZONE_RECLAIM_FULL: 2017 case ZONE_RECLAIM_FULL:
2016 /* scanned but unreclaimable */ 2018 /* scanned but unreclaimable */
2017 continue; 2019 continue;
2018 default: 2020 default:
2019 /* did we reclaim enough */ 2021 /* did we reclaim enough */
2020 if (zone_watermark_ok(zone, order, mark, 2022 if (zone_watermark_ok(zone, order, mark,
2021 classzone_idx, alloc_flags)) 2023 classzone_idx, alloc_flags))
2022 goto try_this_zone; 2024 goto try_this_zone;
2023 2025
2024 /* 2026 /*
2025 * Failed to reclaim enough to meet watermark. 2027 * Failed to reclaim enough to meet watermark.
2026 * Only mark the zone full if checking the min 2028 * Only mark the zone full if checking the min
2027 * watermark or if we failed to reclaim just 2029 * watermark or if we failed to reclaim just
2028 * 1<<order pages or else the page allocator 2030 * 1<<order pages or else the page allocator
2029 * fastpath will prematurely mark zones full 2031 * fastpath will prematurely mark zones full
2030 * when the watermark is between the low and 2032 * when the watermark is between the low and
2031 * min watermarks. 2033 * min watermarks.
2032 */ 2034 */
2033 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2035 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2034 ret == ZONE_RECLAIM_SOME) 2036 ret == ZONE_RECLAIM_SOME)
2035 goto this_zone_full; 2037 goto this_zone_full;
2036 2038
2037 continue; 2039 continue;
2038 } 2040 }
2039 } 2041 }
2040 2042
2041 try_this_zone: 2043 try_this_zone:
2042 page = buffered_rmqueue(preferred_zone, zone, order, 2044 page = buffered_rmqueue(preferred_zone, zone, order,
2043 gfp_mask, migratetype); 2045 gfp_mask, migratetype);
2044 if (page) 2046 if (page)
2045 break; 2047 break;
2046 this_zone_full: 2048 this_zone_full:
2047 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2049 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2048 zlc_mark_zone_full(zonelist, z); 2050 zlc_mark_zone_full(zonelist, z);
2049 } 2051 }
2050 2052
2051 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2053 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2052 /* Disable zlc cache for second zonelist scan */ 2054 /* Disable zlc cache for second zonelist scan */
2053 zlc_active = 0; 2055 zlc_active = 0;
2054 goto zonelist_scan; 2056 goto zonelist_scan;
2055 } 2057 }
2056 2058
2057 if (page) 2059 if (page)
2058 /* 2060 /*
2059 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2061 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2060 * necessary to allocate the page. The expectation is 2062 * necessary to allocate the page. The expectation is
2061 * that the caller is taking steps that will free more 2063 * that the caller is taking steps that will free more
2062 * memory. The caller should avoid the page being used 2064 * memory. The caller should avoid the page being used
2063 * for !PFMEMALLOC purposes. 2065 * for !PFMEMALLOC purposes.
2064 */ 2066 */
2065 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2067 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2066 2068
2067 return page; 2069 return page;
2068 } 2070 }
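The hunk above moves the ALLOC_NO_WATERMARKS test inside the branch taken only when zone_watermark_ok() fails. A minimal standalone sketch of that branch ordering, with purely illustrative names rather than kernel APIs, might look like this:

#include <stdbool.h>

#define SKETCH_NO_WATERMARKS (1 << 0)   /* hypothetical "ignore watermarks" flag */

static bool sketch_watermark_ok(unsigned long free_pages, unsigned long mark)
{
        return free_pages > mark;
}

static bool sketch_zone_usable(unsigned long free_pages, unsigned long mark,
                               int alloc_flags)
{
        /* Fast path: the rare flag is never consulted here. */
        if (sketch_watermark_ok(free_pages, mark))
                return true;

        /* Slow path only: callers permitted to dip below the watermark. */
        if (alloc_flags & SKETCH_NO_WATERMARKS)
                return true;

        return false;
}

Keeping the rare-flag test on the failure side means the common successful allocation pays no extra branch for it.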
2069 2071
2070 /* 2072 /*
2071 * Large machines with many possible nodes should not always dump per-node 2073 * Large machines with many possible nodes should not always dump per-node
2072 * meminfo in irq context. 2074 * meminfo in irq context.
2073 */ 2075 */
2074 static inline bool should_suppress_show_mem(void) 2076 static inline bool should_suppress_show_mem(void)
2075 { 2077 {
2076 bool ret = false; 2078 bool ret = false;
2077 2079
2078 #if NODES_SHIFT > 8 2080 #if NODES_SHIFT > 8
2079 ret = in_interrupt(); 2081 ret = in_interrupt();
2080 #endif 2082 #endif
2081 return ret; 2083 return ret;
2082 } 2084 }
2083 2085
2084 static DEFINE_RATELIMIT_STATE(nopage_rs, 2086 static DEFINE_RATELIMIT_STATE(nopage_rs,
2085 DEFAULT_RATELIMIT_INTERVAL, 2087 DEFAULT_RATELIMIT_INTERVAL,
2086 DEFAULT_RATELIMIT_BURST); 2088 DEFAULT_RATELIMIT_BURST);
2087 2089
2088 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2090 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2089 { 2091 {
2090 unsigned int filter = SHOW_MEM_FILTER_NODES; 2092 unsigned int filter = SHOW_MEM_FILTER_NODES;
2091 2093
2092 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2094 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2093 debug_guardpage_minorder() > 0) 2095 debug_guardpage_minorder() > 0)
2094 return; 2096 return;
2095 2097
2096 /* 2098 /*
2097 * Walking all memory to count page types is very expensive and should 2099 * Walking all memory to count page types is very expensive and should
2098 * be inhibited in non-blockable contexts. 2100 * be inhibited in non-blockable contexts.
2099 */ 2101 */
2100 if (!(gfp_mask & __GFP_WAIT)) 2102 if (!(gfp_mask & __GFP_WAIT))
2101 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2103 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2102 2104
2103 /* 2105 /*
2104 * This documents exceptions given to allocations in certain 2106 * This documents exceptions given to allocations in certain
2105 * contexts that are allowed to allocate outside current's set 2107 * contexts that are allowed to allocate outside current's set
2106 * of allowed nodes. 2108 * of allowed nodes.
2107 */ 2109 */
2108 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2110 if (!(gfp_mask & __GFP_NOMEMALLOC))
2109 if (test_thread_flag(TIF_MEMDIE) || 2111 if (test_thread_flag(TIF_MEMDIE) ||
2110 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2112 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2111 filter &= ~SHOW_MEM_FILTER_NODES; 2113 filter &= ~SHOW_MEM_FILTER_NODES;
2112 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2114 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2113 filter &= ~SHOW_MEM_FILTER_NODES; 2115 filter &= ~SHOW_MEM_FILTER_NODES;
2114 2116
2115 if (fmt) { 2117 if (fmt) {
2116 struct va_format vaf; 2118 struct va_format vaf;
2117 va_list args; 2119 va_list args;
2118 2120
2119 va_start(args, fmt); 2121 va_start(args, fmt);
2120 2122
2121 vaf.fmt = fmt; 2123 vaf.fmt = fmt;
2122 vaf.va = &args; 2124 vaf.va = &args;
2123 2125
2124 pr_warn("%pV", &vaf); 2126 pr_warn("%pV", &vaf);
2125 2127
2126 va_end(args); 2128 va_end(args);
2127 } 2129 }
2128 2130
2129 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2131 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2130 current->comm, order, gfp_mask); 2132 current->comm, order, gfp_mask);
2131 2133
2132 dump_stack(); 2134 dump_stack();
2133 if (!should_suppress_show_mem()) 2135 if (!should_suppress_show_mem())
2134 show_mem(filter); 2136 show_mem(filter);
2135 } 2137 }
2136 2138
2137 static inline int 2139 static inline int
2138 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2140 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2139 unsigned long did_some_progress, 2141 unsigned long did_some_progress,
2140 unsigned long pages_reclaimed) 2142 unsigned long pages_reclaimed)
2141 { 2143 {
2142 /* Do not loop if specifically requested */ 2144 /* Do not loop if specifically requested */
2143 if (gfp_mask & __GFP_NORETRY) 2145 if (gfp_mask & __GFP_NORETRY)
2144 return 0; 2146 return 0;
2145 2147
2146 /* Always retry if specifically requested */ 2148 /* Always retry if specifically requested */
2147 if (gfp_mask & __GFP_NOFAIL) 2149 if (gfp_mask & __GFP_NOFAIL)
2148 return 1; 2150 return 1;
2149 2151
2150 /* 2152 /*
2151 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2153 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2152 * making forward progress without invoking OOM. Suspend also disables 2154 * making forward progress without invoking OOM. Suspend also disables
2153 * storage devices so kswapd will not help. Bail if we are suspending. 2155 * storage devices so kswapd will not help. Bail if we are suspending.
2154 */ 2156 */
2155 if (!did_some_progress && pm_suspended_storage()) 2157 if (!did_some_progress && pm_suspended_storage())
2156 return 0; 2158 return 0;
2157 2159
2158 /* 2160 /*
2159 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2161 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2160 * means __GFP_NOFAIL, but that may not be true in other 2162 * means __GFP_NOFAIL, but that may not be true in other
2161 * implementations. 2163 * implementations.
2162 */ 2164 */
2163 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2165 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2164 return 1; 2166 return 1;
2165 2167
2166 /* 2168 /*
2167 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2169 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2168 * specified, then we retry until we no longer reclaim any pages 2170 * specified, then we retry until we no longer reclaim any pages
2169 * (above), or we've reclaimed an order of pages at least as 2171 * (above), or we've reclaimed an order of pages at least as
2170 * large as the allocation's order. In both cases, if the 2172 * large as the allocation's order. In both cases, if the
2171 * allocation still fails, we stop retrying. 2173 * allocation still fails, we stop retrying.
2172 */ 2174 */
2173 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2175 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2174 return 1; 2176 return 1;
2175 2177
2176 return 0; 2178 return 0;
2177 } 2179 }
2178 2180
2179 static inline struct page * 2181 static inline struct page *
2180 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2182 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2181 struct zonelist *zonelist, enum zone_type high_zoneidx, 2183 struct zonelist *zonelist, enum zone_type high_zoneidx,
2182 nodemask_t *nodemask, struct zone *preferred_zone, 2184 nodemask_t *nodemask, struct zone *preferred_zone,
2183 int classzone_idx, int migratetype) 2185 int classzone_idx, int migratetype)
2184 { 2186 {
2185 struct page *page; 2187 struct page *page;
2186 2188
2187 /* Acquire the OOM killer lock for the zones in zonelist */ 2189 /* Acquire the OOM killer lock for the zones in zonelist */
2188 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2190 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2189 schedule_timeout_uninterruptible(1); 2191 schedule_timeout_uninterruptible(1);
2190 return NULL; 2192 return NULL;
2191 } 2193 }
2192 2194
2193 /* 2195 /*
2194 * Go through the zonelist yet one more time, keep very high watermark 2196 * Go through the zonelist yet one more time, keep very high watermark
2195 * here, this is only to catch a parallel oom killing, we must fail if 2197 * here, this is only to catch a parallel oom killing, we must fail if
2196 * we're still under heavy pressure. 2198 * we're still under heavy pressure.
2197 */ 2199 */
2198 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2200 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2199 order, zonelist, high_zoneidx, 2201 order, zonelist, high_zoneidx,
2200 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2202 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2201 preferred_zone, classzone_idx, migratetype); 2203 preferred_zone, classzone_idx, migratetype);
2202 if (page) 2204 if (page)
2203 goto out; 2205 goto out;
2204 2206
2205 if (!(gfp_mask & __GFP_NOFAIL)) { 2207 if (!(gfp_mask & __GFP_NOFAIL)) {
2206 /* The OOM killer will not help higher order allocs */ 2208 /* The OOM killer will not help higher order allocs */
2207 if (order > PAGE_ALLOC_COSTLY_ORDER) 2209 if (order > PAGE_ALLOC_COSTLY_ORDER)
2208 goto out; 2210 goto out;
2209 /* The OOM killer does not needlessly kill tasks for lowmem */ 2211 /* The OOM killer does not needlessly kill tasks for lowmem */
2210 if (high_zoneidx < ZONE_NORMAL) 2212 if (high_zoneidx < ZONE_NORMAL)
2211 goto out; 2213 goto out;
2212 /* 2214 /*
2213 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2215 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2214 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2216 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2215 * The caller should handle page allocation failure by itself if 2217 * The caller should handle page allocation failure by itself if
2216 * it specifies __GFP_THISNODE. 2218 * it specifies __GFP_THISNODE.
2217 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2219 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2218 */ 2220 */
2219 if (gfp_mask & __GFP_THISNODE) 2221 if (gfp_mask & __GFP_THISNODE)
2220 goto out; 2222 goto out;
2221 } 2223 }
2222 /* Exhausted what can be done so it's blamo time */ 2224 /* Exhausted what can be done so it's blamo time */
2223 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2225 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2224 2226
2225 out: 2227 out:
2226 clear_zonelist_oom(zonelist, gfp_mask); 2228 clear_zonelist_oom(zonelist, gfp_mask);
2227 return page; 2229 return page;
2228 } 2230 }
2229 2231
2230 #ifdef CONFIG_COMPACTION 2232 #ifdef CONFIG_COMPACTION
2231 /* Try memory compaction for high-order allocations before reclaim */ 2233 /* Try memory compaction for high-order allocations before reclaim */
2232 static struct page * 2234 static struct page *
2233 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2235 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2234 struct zonelist *zonelist, enum zone_type high_zoneidx, 2236 struct zonelist *zonelist, enum zone_type high_zoneidx,
2235 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2237 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2236 int classzone_idx, int migratetype, enum migrate_mode mode, 2238 int classzone_idx, int migratetype, enum migrate_mode mode,
2237 bool *contended_compaction, bool *deferred_compaction, 2239 bool *contended_compaction, bool *deferred_compaction,
2238 unsigned long *did_some_progress) 2240 unsigned long *did_some_progress)
2239 { 2241 {
2240 if (!order) 2242 if (!order)
2241 return NULL; 2243 return NULL;
2242 2244
2243 if (compaction_deferred(preferred_zone, order)) { 2245 if (compaction_deferred(preferred_zone, order)) {
2244 *deferred_compaction = true; 2246 *deferred_compaction = true;
2245 return NULL; 2247 return NULL;
2246 } 2248 }
2247 2249
2248 current->flags |= PF_MEMALLOC; 2250 current->flags |= PF_MEMALLOC;
2249 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2251 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2250 nodemask, mode, 2252 nodemask, mode,
2251 contended_compaction); 2253 contended_compaction);
2252 current->flags &= ~PF_MEMALLOC; 2254 current->flags &= ~PF_MEMALLOC;
2253 2255
2254 if (*did_some_progress != COMPACT_SKIPPED) { 2256 if (*did_some_progress != COMPACT_SKIPPED) {
2255 struct page *page; 2257 struct page *page;
2256 2258
2257 /* Page migration frees to the PCP lists but we want merging */ 2259 /* Page migration frees to the PCP lists but we want merging */
2258 drain_pages(get_cpu()); 2260 drain_pages(get_cpu());
2259 put_cpu(); 2261 put_cpu();
2260 2262
2261 page = get_page_from_freelist(gfp_mask, nodemask, 2263 page = get_page_from_freelist(gfp_mask, nodemask,
2262 order, zonelist, high_zoneidx, 2264 order, zonelist, high_zoneidx,
2263 alloc_flags & ~ALLOC_NO_WATERMARKS, 2265 alloc_flags & ~ALLOC_NO_WATERMARKS,
2264 preferred_zone, classzone_idx, migratetype); 2266 preferred_zone, classzone_idx, migratetype);
2265 if (page) { 2267 if (page) {
2266 preferred_zone->compact_blockskip_flush = false; 2268 preferred_zone->compact_blockskip_flush = false;
2267 compaction_defer_reset(preferred_zone, order, true); 2269 compaction_defer_reset(preferred_zone, order, true);
2268 count_vm_event(COMPACTSUCCESS); 2270 count_vm_event(COMPACTSUCCESS);
2269 return page; 2271 return page;
2270 } 2272 }
2271 2273
2272 /* 2274 /*
2273 * It's bad if a compaction run occurs and fails. 2275 * It's bad if a compaction run occurs and fails.
2274 * The most likely reason is that pages exist, 2276 * The most likely reason is that pages exist,
2275 * but not enough to satisfy watermarks. 2277 * but not enough to satisfy watermarks.
2276 */ 2278 */
2277 count_vm_event(COMPACTFAIL); 2279 count_vm_event(COMPACTFAIL);
2278 2280
2279 /* 2281 /*
2280 * As async compaction considers a subset of pageblocks, only 2282 * As async compaction considers a subset of pageblocks, only
2281 * defer if the failure was a sync compaction failure. 2283 * defer if the failure was a sync compaction failure.
2282 */ 2284 */
2283 if (mode != MIGRATE_ASYNC) 2285 if (mode != MIGRATE_ASYNC)
2284 defer_compaction(preferred_zone, order); 2286 defer_compaction(preferred_zone, order);
2285 2287
2286 cond_resched(); 2288 cond_resched();
2287 } 2289 }
2288 2290
2289 return NULL; 2291 return NULL;
2290 } 2292 }
2291 #else 2293 #else
2292 static inline struct page * 2294 static inline struct page *
2293 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2295 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2294 struct zonelist *zonelist, enum zone_type high_zoneidx, 2296 struct zonelist *zonelist, enum zone_type high_zoneidx,
2295 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2297 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2296 int classzone_idx, int migratetype, 2298 int classzone_idx, int migratetype,
2297 enum migrate_mode mode, bool *contended_compaction, 2299 enum migrate_mode mode, bool *contended_compaction,
2298 bool *deferred_compaction, unsigned long *did_some_progress) 2300 bool *deferred_compaction, unsigned long *did_some_progress)
2299 { 2301 {
2300 return NULL; 2302 return NULL;
2301 } 2303 }
2302 #endif /* CONFIG_COMPACTION */ 2304 #endif /* CONFIG_COMPACTION */
2303 2305
2304 /* Perform direct synchronous page reclaim */ 2306 /* Perform direct synchronous page reclaim */
2305 static int 2307 static int
2306 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2308 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2307 nodemask_t *nodemask) 2309 nodemask_t *nodemask)
2308 { 2310 {
2309 struct reclaim_state reclaim_state; 2311 struct reclaim_state reclaim_state;
2310 int progress; 2312 int progress;
2311 2313
2312 cond_resched(); 2314 cond_resched();
2313 2315
2314 /* We now go into synchronous reclaim */ 2316 /* We now go into synchronous reclaim */
2315 cpuset_memory_pressure_bump(); 2317 cpuset_memory_pressure_bump();
2316 current->flags |= PF_MEMALLOC; 2318 current->flags |= PF_MEMALLOC;
2317 lockdep_set_current_reclaim_state(gfp_mask); 2319 lockdep_set_current_reclaim_state(gfp_mask);
2318 reclaim_state.reclaimed_slab = 0; 2320 reclaim_state.reclaimed_slab = 0;
2319 current->reclaim_state = &reclaim_state; 2321 current->reclaim_state = &reclaim_state;
2320 2322
2321 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2323 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2322 2324
2323 current->reclaim_state = NULL; 2325 current->reclaim_state = NULL;
2324 lockdep_clear_current_reclaim_state(); 2326 lockdep_clear_current_reclaim_state();
2325 current->flags &= ~PF_MEMALLOC; 2327 current->flags &= ~PF_MEMALLOC;
2326 2328
2327 cond_resched(); 2329 cond_resched();
2328 2330
2329 return progress; 2331 return progress;
2330 } 2332 }
2331 2333
2332 /* The really slow allocator path where we enter direct reclaim */ 2334 /* The really slow allocator path where we enter direct reclaim */
2333 static inline struct page * 2335 static inline struct page *
2334 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2336 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2335 struct zonelist *zonelist, enum zone_type high_zoneidx, 2337 struct zonelist *zonelist, enum zone_type high_zoneidx,
2336 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2338 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2337 int classzone_idx, int migratetype, unsigned long *did_some_progress) 2339 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2338 { 2340 {
2339 struct page *page = NULL; 2341 struct page *page = NULL;
2340 bool drained = false; 2342 bool drained = false;
2341 2343
2342 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2344 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2343 nodemask); 2345 nodemask);
2344 if (unlikely(!(*did_some_progress))) 2346 if (unlikely(!(*did_some_progress)))
2345 return NULL; 2347 return NULL;
2346 2348
2347 /* After successful reclaim, reconsider all zones for allocation */ 2349 /* After successful reclaim, reconsider all zones for allocation */
2348 if (IS_ENABLED(CONFIG_NUMA)) 2350 if (IS_ENABLED(CONFIG_NUMA))
2349 zlc_clear_zones_full(zonelist); 2351 zlc_clear_zones_full(zonelist);
2350 2352
2351 retry: 2353 retry:
2352 page = get_page_from_freelist(gfp_mask, nodemask, order, 2354 page = get_page_from_freelist(gfp_mask, nodemask, order,
2353 zonelist, high_zoneidx, 2355 zonelist, high_zoneidx,
2354 alloc_flags & ~ALLOC_NO_WATERMARKS, 2356 alloc_flags & ~ALLOC_NO_WATERMARKS,
2355 preferred_zone, classzone_idx, 2357 preferred_zone, classzone_idx,
2356 migratetype); 2358 migratetype);
2357 2359
2358 /* 2360 /*
2359 * If an allocation failed after direct reclaim, it could be because 2361 * If an allocation failed after direct reclaim, it could be because
2360 * pages are pinned on the per-cpu lists. Drain them and try again 2362 * pages are pinned on the per-cpu lists. Drain them and try again
2361 */ 2363 */
2362 if (!page && !drained) { 2364 if (!page && !drained) {
2363 drain_all_pages(); 2365 drain_all_pages();
2364 drained = true; 2366 drained = true;
2365 goto retry; 2367 goto retry;
2366 } 2368 }
2367 2369
2368 return page; 2370 return page;
2369 } 2371 }
2370 2372
2371 /* 2373 /*
2372 * This is called in the allocator slow-path if the allocation request is of 2374 * This is called in the allocator slow-path if the allocation request is of
2373 * sufficient urgency to ignore watermarks and take other desperate measures 2375 * sufficient urgency to ignore watermarks and take other desperate measures
2374 */ 2376 */
2375 static inline struct page * 2377 static inline struct page *
2376 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2378 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2377 struct zonelist *zonelist, enum zone_type high_zoneidx, 2379 struct zonelist *zonelist, enum zone_type high_zoneidx,
2378 nodemask_t *nodemask, struct zone *preferred_zone, 2380 nodemask_t *nodemask, struct zone *preferred_zone,
2379 int classzone_idx, int migratetype) 2381 int classzone_idx, int migratetype)
2380 { 2382 {
2381 struct page *page; 2383 struct page *page;
2382 2384
2383 do { 2385 do {
2384 page = get_page_from_freelist(gfp_mask, nodemask, order, 2386 page = get_page_from_freelist(gfp_mask, nodemask, order,
2385 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2387 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2386 preferred_zone, classzone_idx, migratetype); 2388 preferred_zone, classzone_idx, migratetype);
2387 2389
2388 if (!page && gfp_mask & __GFP_NOFAIL) 2390 if (!page && gfp_mask & __GFP_NOFAIL)
2389 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2391 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2390 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2392 } while (!page && (gfp_mask & __GFP_NOFAIL));
2391 2393
2392 return page; 2394 return page;
2393 } 2395 }
2394 2396
2395 static void reset_alloc_batches(struct zonelist *zonelist, 2397 static void reset_alloc_batches(struct zonelist *zonelist,
2396 enum zone_type high_zoneidx, 2398 enum zone_type high_zoneidx,
2397 struct zone *preferred_zone) 2399 struct zone *preferred_zone)
2398 { 2400 {
2399 struct zoneref *z; 2401 struct zoneref *z;
2400 struct zone *zone; 2402 struct zone *zone;
2401 2403
2402 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2404 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2403 /* 2405 /*
2404 * Only reset the batches of zones that were actually 2406 * Only reset the batches of zones that were actually
2405 * considered in the fairness pass, we don't want to 2407 * considered in the fairness pass, we don't want to
2406 * trash fairness information for zones that are not 2408 * trash fairness information for zones that are not
2407 * actually part of this zonelist's round-robin cycle. 2409 * actually part of this zonelist's round-robin cycle.
2408 */ 2410 */
2409 if (!zone_local(preferred_zone, zone)) 2411 if (!zone_local(preferred_zone, zone))
2410 continue; 2412 continue;
2411 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2413 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2412 high_wmark_pages(zone) - low_wmark_pages(zone) - 2414 high_wmark_pages(zone) - low_wmark_pages(zone) -
2413 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2415 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2414 } 2416 }
2415 } 2417 }
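The mod_zone_page_state() call above adds (high_wmark - low_wmark - current_batch) to the per-zone counter, which lands the batch at exactly high_wmark - low_wmark no matter how far it had been driven negative. A small sketch with made-up numbers, just to show the arithmetic:

#include <stdio.h>

int main(void)
{
        long high_wmark = 400, low_wmark = 300;
        long batch = -20;                       /* fairness batch already exhausted */

        /* Same adjustment as reset_alloc_batches() applies per zone. */
        batch += high_wmark - low_wmark - batch;

        printf("batch reset to %ld\n", batch);  /* prints 100 */
        return 0;
}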
2416 2418
2417 static void wake_all_kswapds(unsigned int order, 2419 static void wake_all_kswapds(unsigned int order,
2418 struct zonelist *zonelist, 2420 struct zonelist *zonelist,
2419 enum zone_type high_zoneidx, 2421 enum zone_type high_zoneidx,
2420 struct zone *preferred_zone) 2422 struct zone *preferred_zone)
2421 { 2423 {
2422 struct zoneref *z; 2424 struct zoneref *z;
2423 struct zone *zone; 2425 struct zone *zone;
2424 2426
2425 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2427 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2426 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2428 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2427 } 2429 }
2428 2430
2429 static inline int 2431 static inline int
2430 gfp_to_alloc_flags(gfp_t gfp_mask) 2432 gfp_to_alloc_flags(gfp_t gfp_mask)
2431 { 2433 {
2432 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2434 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2433 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); 2435 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
2434 2436
2435 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2437 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2436 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2438 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2437 2439
2438 /* 2440 /*
2439 * The caller may dip into page reserves a bit more if the caller 2441 * The caller may dip into page reserves a bit more if the caller
2440 * cannot run direct reclaim, or if the caller has realtime scheduling 2442 * cannot run direct reclaim, or if the caller has realtime scheduling
2441 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2443 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2442 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 2444 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
2443 */ 2445 */
2444 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2446 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2445 2447
2446 if (atomic) { 2448 if (atomic) {
2447 /* 2449 /*
2448 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 2450 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
2449 * if it can't schedule. 2451 * if it can't schedule.
2450 */ 2452 */
2451 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2453 if (!(gfp_mask & __GFP_NOMEMALLOC))
2452 alloc_flags |= ALLOC_HARDER; 2454 alloc_flags |= ALLOC_HARDER;
2453 /* 2455 /*
2454 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2456 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2455 * comment for __cpuset_node_allowed_softwall(). 2457 * comment for __cpuset_node_allowed_softwall().
2456 */ 2458 */
2457 alloc_flags &= ~ALLOC_CPUSET; 2459 alloc_flags &= ~ALLOC_CPUSET;
2458 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2460 } else if (unlikely(rt_task(current)) && !in_interrupt())
2459 alloc_flags |= ALLOC_HARDER; 2461 alloc_flags |= ALLOC_HARDER;
2460 2462
2461 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2463 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2462 if (gfp_mask & __GFP_MEMALLOC) 2464 if (gfp_mask & __GFP_MEMALLOC)
2463 alloc_flags |= ALLOC_NO_WATERMARKS; 2465 alloc_flags |= ALLOC_NO_WATERMARKS;
2464 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2466 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2465 alloc_flags |= ALLOC_NO_WATERMARKS; 2467 alloc_flags |= ALLOC_NO_WATERMARKS;
2466 else if (!in_interrupt() && 2468 else if (!in_interrupt() &&
2467 ((current->flags & PF_MEMALLOC) || 2469 ((current->flags & PF_MEMALLOC) ||
2468 unlikely(test_thread_flag(TIF_MEMDIE)))) 2470 unlikely(test_thread_flag(TIF_MEMDIE))))
2469 alloc_flags |= ALLOC_NO_WATERMARKS; 2471 alloc_flags |= ALLOC_NO_WATERMARKS;
2470 } 2472 }
2471 #ifdef CONFIG_CMA 2473 #ifdef CONFIG_CMA
2472 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2474 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2473 alloc_flags |= ALLOC_CMA; 2475 alloc_flags |= ALLOC_CMA;
2474 #endif 2476 #endif
2475 return alloc_flags; 2477 return alloc_flags;
2476 } 2478 }
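The BUILD_BUG_ON() near the top of gfp_to_alloc_flags() documents that __GFP_HIGH and ALLOC_HIGH share a bit value, so the flag can be copied with a mask rather than a conditional. A compile-time sketch of that trick, using illustrative flag names instead of the real kernel definitions:

#include <assert.h>

#define GFPISH_HIGH     0x20    /* stand-in for a request flag */
#define ALLOCISH_HIGH   0x20    /* stand-in for an internal allocator flag */

static int to_alloc_flags(int gfp)
{
        /* Fails the build if the two namespaces ever drift apart. */
        _Static_assert(GFPISH_HIGH == ALLOCISH_HIGH, "flag bits must match");

        return gfp & GFPISH_HIGH;       /* branch-free transfer of the bit */
}

int main(void)
{
        assert(to_alloc_flags(GFPISH_HIGH) == ALLOCISH_HIGH);
        return 0;
}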
2477 2479
2478 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2480 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2479 { 2481 {
2480 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2482 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2481 } 2483 }
2482 2484
2483 static inline struct page * 2485 static inline struct page *
2484 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2486 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2485 struct zonelist *zonelist, enum zone_type high_zoneidx, 2487 struct zonelist *zonelist, enum zone_type high_zoneidx,
2486 nodemask_t *nodemask, struct zone *preferred_zone, 2488 nodemask_t *nodemask, struct zone *preferred_zone,
2487 int classzone_idx, int migratetype) 2489 int classzone_idx, int migratetype)
2488 { 2490 {
2489 const gfp_t wait = gfp_mask & __GFP_WAIT; 2491 const gfp_t wait = gfp_mask & __GFP_WAIT;
2490 struct page *page = NULL; 2492 struct page *page = NULL;
2491 int alloc_flags; 2493 int alloc_flags;
2492 unsigned long pages_reclaimed = 0; 2494 unsigned long pages_reclaimed = 0;
2493 unsigned long did_some_progress; 2495 unsigned long did_some_progress;
2494 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2496 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2495 bool deferred_compaction = false; 2497 bool deferred_compaction = false;
2496 bool contended_compaction = false; 2498 bool contended_compaction = false;
2497 2499
2498 /* 2500 /*
2499 * In the slowpath, we sanity check order to avoid ever trying to 2501 * In the slowpath, we sanity check order to avoid ever trying to
2500 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2502 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2501 * be using allocators in order of preference for an area that is 2503 * be using allocators in order of preference for an area that is
2502 * too large. 2504 * too large.
2503 */ 2505 */
2504 if (order >= MAX_ORDER) { 2506 if (order >= MAX_ORDER) {
2505 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2507 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2506 return NULL; 2508 return NULL;
2507 } 2509 }
2508 2510
2509 /* 2511 /*
2510 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2512 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2511 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2513 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2512 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2514 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2513 * using a larger set of nodes after it has established that the 2515 * using a larger set of nodes after it has established that the
2514 * allowed per node queues are empty and that nodes are 2516 * allowed per node queues are empty and that nodes are
2515 * over allocated. 2517 * over allocated.
2516 */ 2518 */
2517 if (IS_ENABLED(CONFIG_NUMA) && 2519 if (IS_ENABLED(CONFIG_NUMA) &&
2518 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2520 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2519 goto nopage; 2521 goto nopage;
2520 2522
2521 restart: 2523 restart:
2522 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2524 if (!(gfp_mask & __GFP_NO_KSWAPD))
2523 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2525 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
2524 2526
2525 /* 2527 /*
2526 * OK, we're below the kswapd watermark and have kicked background 2528 * OK, we're below the kswapd watermark and have kicked background
2527 * reclaim. Now things get more complex, so set up alloc_flags according 2529 * reclaim. Now things get more complex, so set up alloc_flags according
2528 * to how we want to proceed. 2530 * to how we want to proceed.
2529 */ 2531 */
2530 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2532 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2531 2533
2532 /* 2534 /*
2533 * Find the true preferred zone if the allocation is unconstrained by 2535 * Find the true preferred zone if the allocation is unconstrained by
2534 * cpusets. 2536 * cpusets.
2535 */ 2537 */
2536 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { 2538 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
2537 struct zoneref *preferred_zoneref; 2539 struct zoneref *preferred_zoneref;
2538 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2540 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2539 NULL, 2541 NULL,
2540 &preferred_zone); 2542 &preferred_zone);
2541 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2543 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2542 } 2544 }
2543 2545
2544 rebalance: 2546 rebalance:
2545 /* This is the last chance, in general, before the goto nopage. */ 2547 /* This is the last chance, in general, before the goto nopage. */
2546 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2548 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2547 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2549 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2548 preferred_zone, classzone_idx, migratetype); 2550 preferred_zone, classzone_idx, migratetype);
2549 if (page) 2551 if (page)
2550 goto got_pg; 2552 goto got_pg;
2551 2553
2552 /* Allocate without watermarks if the context allows */ 2554 /* Allocate without watermarks if the context allows */
2553 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2555 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2554 /* 2556 /*
2555 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2557 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2556 * the allocation is high priority and these types of 2558 * the allocation is high priority and these types of
2557 * allocations are system rather than user oriented 2559 * allocations are system rather than user oriented
2558 */ 2560 */
2559 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2561 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2560 2562
2561 page = __alloc_pages_high_priority(gfp_mask, order, 2563 page = __alloc_pages_high_priority(gfp_mask, order,
2562 zonelist, high_zoneidx, nodemask, 2564 zonelist, high_zoneidx, nodemask,
2563 preferred_zone, classzone_idx, migratetype); 2565 preferred_zone, classzone_idx, migratetype);
2564 if (page) { 2566 if (page) {
2565 goto got_pg; 2567 goto got_pg;
2566 } 2568 }
2567 } 2569 }
2568 2570
2569 /* Atomic allocations - we can't balance anything */ 2571 /* Atomic allocations - we can't balance anything */
2570 if (!wait) 2572 if (!wait)
2571 goto nopage; 2573 goto nopage;
2572 2574
2573 /* Avoid recursion of direct reclaim */ 2575 /* Avoid recursion of direct reclaim */
2574 if (current->flags & PF_MEMALLOC) 2576 if (current->flags & PF_MEMALLOC)
2575 goto nopage; 2577 goto nopage;
2576 2578
2577 /* Avoid allocations with no watermarks from looping endlessly */ 2579 /* Avoid allocations with no watermarks from looping endlessly */
2578 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2580 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2579 goto nopage; 2581 goto nopage;
2580 2582
2581 /* 2583 /*
2582 * Try direct compaction. The first pass is asynchronous. Subsequent 2584 * Try direct compaction. The first pass is asynchronous. Subsequent
2583 * attempts after direct reclaim are synchronous 2585 * attempts after direct reclaim are synchronous
2584 */ 2586 */
2585 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2587 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2586 high_zoneidx, nodemask, alloc_flags, 2588 high_zoneidx, nodemask, alloc_flags,
2587 preferred_zone, 2589 preferred_zone,
2588 classzone_idx, migratetype, 2590 classzone_idx, migratetype,
2589 migration_mode, &contended_compaction, 2591 migration_mode, &contended_compaction,
2590 &deferred_compaction, 2592 &deferred_compaction,
2591 &did_some_progress); 2593 &did_some_progress);
2592 if (page) 2594 if (page)
2593 goto got_pg; 2595 goto got_pg;
2594 migration_mode = MIGRATE_SYNC_LIGHT; 2596 migration_mode = MIGRATE_SYNC_LIGHT;
2595 2597
2596 /* 2598 /*
2597 * If compaction is deferred for high-order allocations, it is because 2599 * If compaction is deferred for high-order allocations, it is because
2598 * sync compaction recently failed. If this is the case and the caller 2600 * sync compaction recently failed. If this is the case and the caller
2599 * requested a movable allocation that does not heavily disrupt the 2601 * requested a movable allocation that does not heavily disrupt the
2600 * system then fail the allocation instead of entering direct reclaim. 2602 * system then fail the allocation instead of entering direct reclaim.
2601 */ 2603 */
2602 if ((deferred_compaction || contended_compaction) && 2604 if ((deferred_compaction || contended_compaction) &&
2603 (gfp_mask & __GFP_NO_KSWAPD)) 2605 (gfp_mask & __GFP_NO_KSWAPD))
2604 goto nopage; 2606 goto nopage;
2605 2607
2606 /* Try direct reclaim and then allocating */ 2608 /* Try direct reclaim and then allocating */
2607 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2609 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2608 zonelist, high_zoneidx, 2610 zonelist, high_zoneidx,
2609 nodemask, 2611 nodemask,
2610 alloc_flags, preferred_zone, 2612 alloc_flags, preferred_zone,
2611 classzone_idx, migratetype, 2613 classzone_idx, migratetype,
2612 &did_some_progress); 2614 &did_some_progress);
2613 if (page) 2615 if (page)
2614 goto got_pg; 2616 goto got_pg;
2615 2617
2616 /* 2618 /*
2617 * If we failed to make any progress reclaiming, then we are 2619 * If we failed to make any progress reclaiming, then we are
2618 * running out of options and have to consider going OOM 2620 * running out of options and have to consider going OOM
2619 */ 2621 */
2620 if (!did_some_progress) { 2622 if (!did_some_progress) {
2621 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2623 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2622 if (oom_killer_disabled) 2624 if (oom_killer_disabled)
2623 goto nopage; 2625 goto nopage;
2624 /* Coredumps can quickly deplete all memory reserves */ 2626 /* Coredumps can quickly deplete all memory reserves */
2625 if ((current->flags & PF_DUMPCORE) && 2627 if ((current->flags & PF_DUMPCORE) &&
2626 !(gfp_mask & __GFP_NOFAIL)) 2628 !(gfp_mask & __GFP_NOFAIL))
2627 goto nopage; 2629 goto nopage;
2628 page = __alloc_pages_may_oom(gfp_mask, order, 2630 page = __alloc_pages_may_oom(gfp_mask, order,
2629 zonelist, high_zoneidx, 2631 zonelist, high_zoneidx,
2630 nodemask, preferred_zone, 2632 nodemask, preferred_zone,
2631 classzone_idx, migratetype); 2633 classzone_idx, migratetype);
2632 if (page) 2634 if (page)
2633 goto got_pg; 2635 goto got_pg;
2634 2636
2635 if (!(gfp_mask & __GFP_NOFAIL)) { 2637 if (!(gfp_mask & __GFP_NOFAIL)) {
2636 /* 2638 /*
2637 * The oom killer is not called for high-order 2639 * The oom killer is not called for high-order
2638 * allocations that may fail, so if no progress 2640 * allocations that may fail, so if no progress
2639 * is being made, there are no other options and 2641 * is being made, there are no other options and
2640 * retrying is unlikely to help. 2642 * retrying is unlikely to help.
2641 */ 2643 */
2642 if (order > PAGE_ALLOC_COSTLY_ORDER) 2644 if (order > PAGE_ALLOC_COSTLY_ORDER)
2643 goto nopage; 2645 goto nopage;
2644 /* 2646 /*
2645 * The oom killer is not called for lowmem 2647 * The oom killer is not called for lowmem
2646 * allocations to prevent needlessly killing 2648 * allocations to prevent needlessly killing
2647 * innocent tasks. 2649 * innocent tasks.
2648 */ 2650 */
2649 if (high_zoneidx < ZONE_NORMAL) 2651 if (high_zoneidx < ZONE_NORMAL)
2650 goto nopage; 2652 goto nopage;
2651 } 2653 }
2652 2654
2653 goto restart; 2655 goto restart;
2654 } 2656 }
2655 } 2657 }
2656 2658
2657 /* Check if we should retry the allocation */ 2659 /* Check if we should retry the allocation */
2658 pages_reclaimed += did_some_progress; 2660 pages_reclaimed += did_some_progress;
2659 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2661 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2660 pages_reclaimed)) { 2662 pages_reclaimed)) {
2661 /* Wait for some write requests to complete then retry */ 2663 /* Wait for some write requests to complete then retry */
2662 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2664 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2663 goto rebalance; 2665 goto rebalance;
2664 } else { 2666 } else {
2665 /* 2667 /*
2666 * High-order allocations do not necessarily loop after 2668 * High-order allocations do not necessarily loop after
2667 * direct reclaim and reclaim/compaction depends on compaction 2669 * direct reclaim and reclaim/compaction depends on compaction
2668 * being called after reclaim so call directly if necessary 2670 * being called after reclaim so call directly if necessary
2669 */ 2671 */
2670 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2672 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2671 high_zoneidx, nodemask, alloc_flags, 2673 high_zoneidx, nodemask, alloc_flags,
2672 preferred_zone, 2674 preferred_zone,
2673 classzone_idx, migratetype, 2675 classzone_idx, migratetype,
2674 migration_mode, &contended_compaction, 2676 migration_mode, &contended_compaction,
2675 &deferred_compaction, 2677 &deferred_compaction,
2676 &did_some_progress); 2678 &did_some_progress);
2677 if (page) 2679 if (page)
2678 goto got_pg; 2680 goto got_pg;
2679 } 2681 }
2680 2682
2681 nopage: 2683 nopage:
2682 warn_alloc_failed(gfp_mask, order, NULL); 2684 warn_alloc_failed(gfp_mask, order, NULL);
2683 return page; 2685 return page;
2684 got_pg: 2686 got_pg:
2685 if (kmemcheck_enabled) 2687 if (kmemcheck_enabled)
2686 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2688 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2687 2689
2688 return page; 2690 return page;
2689 } 2691 }
2690 2692
2691 /* 2693 /*
2692 * This is the 'heart' of the zoned buddy allocator. 2694 * This is the 'heart' of the zoned buddy allocator.
2693 */ 2695 */
2694 struct page * 2696 struct page *
2695 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2697 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2696 struct zonelist *zonelist, nodemask_t *nodemask) 2698 struct zonelist *zonelist, nodemask_t *nodemask)
2697 { 2699 {
2698 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2700 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2699 struct zone *preferred_zone; 2701 struct zone *preferred_zone;
2700 struct zoneref *preferred_zoneref; 2702 struct zoneref *preferred_zoneref;
2701 struct page *page = NULL; 2703 struct page *page = NULL;
2702 int migratetype = allocflags_to_migratetype(gfp_mask); 2704 int migratetype = allocflags_to_migratetype(gfp_mask);
2703 unsigned int cpuset_mems_cookie; 2705 unsigned int cpuset_mems_cookie;
2704 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2706 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2705 struct mem_cgroup *memcg = NULL; 2707 struct mem_cgroup *memcg = NULL;
2706 int classzone_idx; 2708 int classzone_idx;
2707 2709
2708 gfp_mask &= gfp_allowed_mask; 2710 gfp_mask &= gfp_allowed_mask;
2709 2711
2710 lockdep_trace_alloc(gfp_mask); 2712 lockdep_trace_alloc(gfp_mask);
2711 2713
2712 might_sleep_if(gfp_mask & __GFP_WAIT); 2714 might_sleep_if(gfp_mask & __GFP_WAIT);
2713 2715
2714 if (should_fail_alloc_page(gfp_mask, order)) 2716 if (should_fail_alloc_page(gfp_mask, order))
2715 return NULL; 2717 return NULL;
2716 2718
2717 /* 2719 /*
2718 * Check the zones suitable for the gfp_mask contain at least one 2720 * Check the zones suitable for the gfp_mask contain at least one
2719 * valid zone. It's possible to have an empty zonelist as a result 2721 * valid zone. It's possible to have an empty zonelist as a result
2720 * of GFP_THISNODE and a memoryless node 2722 * of GFP_THISNODE and a memoryless node
2721 */ 2723 */
2722 if (unlikely(!zonelist->_zonerefs->zone)) 2724 if (unlikely(!zonelist->_zonerefs->zone))
2723 return NULL; 2725 return NULL;
2724 2726
2725 /* 2727 /*
2726 * Will only have any effect when __GFP_KMEMCG is set. This is 2728 * Will only have any effect when __GFP_KMEMCG is set. This is
2727 * verified in the (always inline) callee 2729 * verified in the (always inline) callee
2728 */ 2730 */
2729 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2731 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2730 return NULL; 2732 return NULL;
2731 2733
2732 retry_cpuset: 2734 retry_cpuset:
2733 cpuset_mems_cookie = read_mems_allowed_begin(); 2735 cpuset_mems_cookie = read_mems_allowed_begin();
2734 2736
2735 /* The preferred zone is used for statistics later */ 2737 /* The preferred zone is used for statistics later */
2736 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2738 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2737 nodemask ? : &cpuset_current_mems_allowed, 2739 nodemask ? : &cpuset_current_mems_allowed,
2738 &preferred_zone); 2740 &preferred_zone);
2739 if (!preferred_zone) 2741 if (!preferred_zone)
2740 goto out; 2742 goto out;
2741 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2743 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2742 2744
2743 #ifdef CONFIG_CMA 2745 #ifdef CONFIG_CMA
2744 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2746 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2745 alloc_flags |= ALLOC_CMA; 2747 alloc_flags |= ALLOC_CMA;
2746 #endif 2748 #endif
2747 retry: 2749 retry:
2748 /* First allocation attempt */ 2750 /* First allocation attempt */
2749 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2751 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2750 zonelist, high_zoneidx, alloc_flags, 2752 zonelist, high_zoneidx, alloc_flags,
2751 preferred_zone, classzone_idx, migratetype); 2753 preferred_zone, classzone_idx, migratetype);
2752 if (unlikely(!page)) { 2754 if (unlikely(!page)) {
2753 /* 2755 /*
2754 * The first pass makes sure allocations are spread 2756 * The first pass makes sure allocations are spread
2755 * fairly within the local node. However, the local 2757 * fairly within the local node. However, the local
2756 * node might have free pages left after the fairness 2758 * node might have free pages left after the fairness
2757 * batches are exhausted, and remote zones haven't 2759 * batches are exhausted, and remote zones haven't
2758 * even been considered yet. Try once more without 2760 * even been considered yet. Try once more without
2759 * fairness, and include remote zones now, before 2761 * fairness, and include remote zones now, before
2760 * entering the slowpath and waking kswapd: prefer 2762 * entering the slowpath and waking kswapd: prefer
2761 * spilling to a remote zone over swapping locally. 2763 * spilling to a remote zone over swapping locally.
2762 */ 2764 */
2763 if (alloc_flags & ALLOC_FAIR) { 2765 if (alloc_flags & ALLOC_FAIR) {
2764 reset_alloc_batches(zonelist, high_zoneidx, 2766 reset_alloc_batches(zonelist, high_zoneidx,
2765 preferred_zone); 2767 preferred_zone);
2766 alloc_flags &= ~ALLOC_FAIR; 2768 alloc_flags &= ~ALLOC_FAIR;
2767 goto retry; 2769 goto retry;
2768 } 2770 }
2769 /* 2771 /*
2770 * Runtime PM, block IO and its error handling path 2772 * Runtime PM, block IO and its error handling path
2771 * can deadlock because I/O on the device might not 2773 * can deadlock because I/O on the device might not
2772 * complete. 2774 * complete.
2773 */ 2775 */
2774 gfp_mask = memalloc_noio_flags(gfp_mask); 2776 gfp_mask = memalloc_noio_flags(gfp_mask);
2775 page = __alloc_pages_slowpath(gfp_mask, order, 2777 page = __alloc_pages_slowpath(gfp_mask, order,
2776 zonelist, high_zoneidx, nodemask, 2778 zonelist, high_zoneidx, nodemask,
2777 preferred_zone, classzone_idx, migratetype); 2779 preferred_zone, classzone_idx, migratetype);
2778 } 2780 }
2779 2781
2780 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2782 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2781 2783
2782 out: 2784 out:
2783 /* 2785 /*
2784 * When updating a task's mems_allowed, it is possible to race with 2786 * When updating a task's mems_allowed, it is possible to race with
2785 * parallel threads in such a way that an allocation can fail while 2787 * parallel threads in such a way that an allocation can fail while
2786 * the mask is being updated. If a page allocation is about to fail, 2788 * the mask is being updated. If a page allocation is about to fail,
2787 * check if the cpuset changed during allocation and if so, retry. 2789 * check if the cpuset changed during allocation and if so, retry.
2788 */ 2790 */
2789 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2791 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2790 goto retry_cpuset; 2792 goto retry_cpuset;
2791 2793
2792 memcg_kmem_commit_charge(page, memcg, order); 2794 memcg_kmem_commit_charge(page, memcg, order);
2793 2795
2794 return page; 2796 return page;
2795 } 2797 }
2796 EXPORT_SYMBOL(__alloc_pages_nodemask); 2798 EXPORT_SYMBOL(__alloc_pages_nodemask);
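Callers normally reach __alloc_pages_nodemask() through wrappers such as alloc_pages() and alloc_pages_node() rather than invoking it directly. A minimal sketch of such a caller, assuming an ordinary process-context kernel setting; the function name and its body are illustrative only, not part of this file:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/* Illustrative only: allocate one page for kernel use and release it. */
static int demo_alloc_one_page(void)
{
        struct page *page;

        /* GFP_KERNEL may sleep, so this must run in process context. */
        page = alloc_pages(GFP_KERNEL, 0);      /* order 0: a single page */
        if (!page)
                return -ENOMEM;

        /* page_address(page) gives the directly mapped kernel address. */

        __free_pages(page, 0);
        return 0;
}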
2797 2799
2798 /* 2800 /*
2799 * Common helper functions. 2801 * Common helper functions.
2800 */ 2802 */
2801 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2803 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2802 { 2804 {
2803 struct page *page; 2805 struct page *page;
2804 2806
2805 /* 2807 /*
2806 * __get_free_pages() returns a kernel virtual address, which cannot represent 2808 * __get_free_pages() returns a kernel virtual address, which cannot represent
2807 * a highmem page 2809 * a highmem page
2808 */ 2810 */
2809 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2811 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2810 2812
2811 page = alloc_pages(gfp_mask, order); 2813 page = alloc_pages(gfp_mask, order);
2812 if (!page) 2814 if (!page)
2813 return 0; 2815 return 0;
2814 return (unsigned long) page_address(page); 2816 return (unsigned long) page_address(page);
2815 } 2817 }
2816 EXPORT_SYMBOL(__get_free_pages); 2818 EXPORT_SYMBOL(__get_free_pages);
2817 2819
2818 unsigned long get_zeroed_page(gfp_t gfp_mask) 2820 unsigned long get_zeroed_page(gfp_t gfp_mask)
2819 { 2821 {
2820 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2822 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2821 } 2823 }
2822 EXPORT_SYMBOL(get_zeroed_page); 2824 EXPORT_SYMBOL(get_zeroed_page);
2823 2825
2824 void __free_pages(struct page *page, unsigned int order) 2826 void __free_pages(struct page *page, unsigned int order)
2825 { 2827 {
2826 if (put_page_testzero(page)) { 2828 if (put_page_testzero(page)) {
2827 if (order == 0) 2829 if (order == 0)
2828 free_hot_cold_page(page, 0); 2830 free_hot_cold_page(page, 0);
2829 else 2831 else
2830 __free_pages_ok(page, order); 2832 __free_pages_ok(page, order);
2831 } 2833 }
2832 } 2834 }
2833 2835
2834 EXPORT_SYMBOL(__free_pages); 2836 EXPORT_SYMBOL(__free_pages);
2835 2837
2836 void free_pages(unsigned long addr, unsigned int order) 2838 void free_pages(unsigned long addr, unsigned int order)
2837 { 2839 {
2838 if (addr != 0) { 2840 if (addr != 0) {
2839 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2841 VM_BUG_ON(!virt_addr_valid((void *)addr));
2840 __free_pages(virt_to_page((void *)addr), order); 2842 __free_pages(virt_to_page((void *)addr), order);
2841 } 2843 }
2842 } 2844 }
2843 2845
2844 EXPORT_SYMBOL(free_pages); 2846 EXPORT_SYMBOL(free_pages);
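A hedged usage sketch for the address-based helpers above: __get_free_pages() and get_zeroed_page() hand back a kernel virtual address (hence the __GFP_HIGHMEM check), and the buffer is released with free_pages() at the same order. The function name below is illustrative only:

#include <linux/errno.h>
#include <linux/gfp.h>

static int demo_zeroed_scratch_page(void)
{
        unsigned long buf;

        buf = get_zeroed_page(GFP_KERNEL);      /* order-0, pre-zeroed page */
        if (!buf)
                return -ENOMEM;

        /* buf is a page-sized, zero-filled scratch area. */

        free_pages(buf, 0);                     /* order must match the allocation */
        return 0;
}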
2845 2847
2846 /* 2848 /*
2847 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2849 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2848 * pages allocated with __GFP_KMEMCG. 2850 * pages allocated with __GFP_KMEMCG.
2849 * 2851 *
2850 * Those pages are accounted to a particular memcg, embedded in the 2852 * Those pages are accounted to a particular memcg, embedded in the
2851 * corresponding page_cgroup. To avoid adding a hit in the allocator to search 2853 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2852 * for that information only to find out that it is NULL for users who have no 2854 * for that information only to find out that it is NULL for users who have no
2853 * interest in that whatsoever, we provide these functions. 2855 * interest in that whatsoever, we provide these functions.
2854 * 2856 *
2855 * The caller knows better which flags it relies on. 2857 * The caller knows better which flags it relies on.
2856 */ 2858 */
2857 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2859 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2858 { 2860 {
2859 memcg_kmem_uncharge_pages(page, order); 2861 memcg_kmem_uncharge_pages(page, order);
2860 __free_pages(page, order); 2862 __free_pages(page, order);
2861 } 2863 }
2862 2864
2863 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2865 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2864 { 2866 {
2865 if (addr != 0) { 2867 if (addr != 0) {
2866 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2868 VM_BUG_ON(!virt_addr_valid((void *)addr));
2867 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2869 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2868 } 2870 }
2869 } 2871 }
2870 2872
2871 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2873 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2872 { 2874 {
2873 if (addr) { 2875 if (addr) {
2874 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2876 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2875 unsigned long used = addr + PAGE_ALIGN(size); 2877 unsigned long used = addr + PAGE_ALIGN(size);
2876 2878
2877 split_page(virt_to_page((void *)addr), order); 2879 split_page(virt_to_page((void *)addr), order);
2878 while (used < alloc_end) { 2880 while (used < alloc_end) {
2879 free_page(used); 2881 free_page(used);
2880 used += PAGE_SIZE; 2882 used += PAGE_SIZE;
2881 } 2883 }
2882 } 2884 }
2883 return (void *)addr; 2885 return (void *)addr;
2884 } 2886 }
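To make the trimming in make_alloc_exact() concrete: a request for 5 * PAGE_SIZE bytes is first satisfied by an order-3 allocation (8 pages, the smallest power of two that fits). split_page() then turns that block into eight independent order-0 pages, and the loop frees the three pages lying between addr + PAGE_ALIGN(size) and the end of the block, so the caller is left holding exactly the five pages it asked for.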
2885 2887
2886 /** 2888 /**
2887 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2889 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2888 * @size: the number of bytes to allocate 2890 * @size: the number of bytes to allocate
2889 * @gfp_mask: GFP flags for the allocation 2891 * @gfp_mask: GFP flags for the allocation
2890 * 2892 *
2891 * This function is similar to alloc_pages(), except that it allocates the 2893 * This function is similar to alloc_pages(), except that it allocates the
2892 * minimum number of pages to satisfy the request. alloc_pages() can only 2894 * minimum number of pages to satisfy the request. alloc_pages() can only
2893 * allocate memory in power-of-two pages. 2895 * allocate memory in power-of-two pages.
2894 * 2896 *
2895 * This function is also limited by MAX_ORDER. 2897 * This function is also limited by MAX_ORDER.
2896 * 2898 *
2897 * Memory allocated by this function must be released by free_pages_exact(). 2899 * Memory allocated by this function must be released by free_pages_exact().
2898 */ 2900 */
2899 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2901 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2900 { 2902 {
2901 unsigned int order = get_order(size); 2903 unsigned int order = get_order(size);
2902 unsigned long addr; 2904 unsigned long addr;
2903 2905
2904 addr = __get_free_pages(gfp_mask, order); 2906 addr = __get_free_pages(gfp_mask, order);
2905 return make_alloc_exact(addr, order, size); 2907 return make_alloc_exact(addr, order, size);
2906 } 2908 }
2907 EXPORT_SYMBOL(alloc_pages_exact); 2909 EXPORT_SYMBOL(alloc_pages_exact);
2908 2910
2909 /** 2911 /**
2910 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2912 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2911 * pages on a node. 2913 * pages on a node.
2912 * @nid: the preferred node ID where memory should be allocated 2914 * @nid: the preferred node ID where memory should be allocated
2913 * @size: the number of bytes to allocate 2915 * @size: the number of bytes to allocate
2914 * @gfp_mask: GFP flags for the allocation 2916 * @gfp_mask: GFP flags for the allocation
2915 * 2917 *
2916 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2918 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2917 * back. 2919 * back.
2918 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2920 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2919 * but is not exact. 2921 * but is not exact.
2920 */ 2922 */
2921 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2923 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2922 { 2924 {
2923 unsigned order = get_order(size); 2925 unsigned order = get_order(size);
2924 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2926 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2925 if (!p) 2927 if (!p)
2926 return NULL; 2928 return NULL;
2927 return make_alloc_exact((unsigned long)page_address(p), order, size); 2929 return make_alloc_exact((unsigned long)page_address(p), order, size);
2928 } 2930 }
2929 EXPORT_SYMBOL(alloc_pages_exact_nid); 2931 EXPORT_SYMBOL(alloc_pages_exact_nid);
2930 2932
2931 /** 2933 /**
2932 * free_pages_exact - release memory allocated via alloc_pages_exact() 2934 * free_pages_exact - release memory allocated via alloc_pages_exact()
2933 * @virt: the value returned by alloc_pages_exact. 2935 * @virt: the value returned by alloc_pages_exact.
2934 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2936 * @size: size of allocation, same value as passed to alloc_pages_exact().
2935 * 2937 *
2936 * Release the memory allocated by a previous call to alloc_pages_exact. 2938 * Release the memory allocated by a previous call to alloc_pages_exact.
2937 */ 2939 */
2938 void free_pages_exact(void *virt, size_t size) 2940 void free_pages_exact(void *virt, size_t size)
2939 { 2941 {
2940 unsigned long addr = (unsigned long)virt; 2942 unsigned long addr = (unsigned long)virt;
2941 unsigned long end = addr + PAGE_ALIGN(size); 2943 unsigned long end = addr + PAGE_ALIGN(size);
2942 2944
2943 while (addr < end) { 2945 while (addr < end) {
2944 free_page(addr); 2946 free_page(addr);
2945 addr += PAGE_SIZE; 2947 addr += PAGE_SIZE;
2946 } 2948 }
2947 } 2949 }
2948 EXPORT_SYMBOL(free_pages_exact); 2950 EXPORT_SYMBOL(free_pages_exact);
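A minimal usage sketch for the exact-size pair above, assuming a kernel context; the constant and function name are illustrative only:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

#define DEMO_BYTES      (5 * PAGE_SIZE) /* not a power-of-two number of pages */

static int demo_pages_exact(void)
{
        void *buf;

        buf = alloc_pages_exact(DEMO_BYTES, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        /* Exactly DIV_ROUND_UP(DEMO_BYTES, PAGE_SIZE) pages are held here. */

        free_pages_exact(buf, DEMO_BYTES);      /* pass the same size back */
        return 0;
}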
2949 2951
2950 /** 2952 /**
2951 * nr_free_zone_pages - count number of pages beyond high watermark 2953 * nr_free_zone_pages - count number of pages beyond high watermark
2952 * @offset: The zone index of the highest zone 2954 * @offset: The zone index of the highest zone
2953 * 2955 *
2954 * nr_free_zone_pages() counts the number of pages which are beyond the 2956 * nr_free_zone_pages() counts the number of pages which are beyond the
2955 * high watermark within all zones at or below a given zone index. For each 2957 * high watermark within all zones at or below a given zone index. For each
2956 * zone, the number of pages is calculated as: 2958 * zone, the number of pages is calculated as:
2957 * managed_pages - high_pages 2959 * managed_pages - high_pages
2958 */ 2960 */
2959 static unsigned long nr_free_zone_pages(int offset) 2961 static unsigned long nr_free_zone_pages(int offset)
2960 { 2962 {
2961 struct zoneref *z; 2963 struct zoneref *z;
2962 struct zone *zone; 2964 struct zone *zone;
2963 2965
2964 /* Just pick one node, since fallback list is circular */ 2966 /* Just pick one node, since fallback list is circular */
2965 unsigned long sum = 0; 2967 unsigned long sum = 0;
2966 2968
2967 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2969 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2968 2970
2969 for_each_zone_zonelist(zone, z, zonelist, offset) { 2971 for_each_zone_zonelist(zone, z, zonelist, offset) {
2970 unsigned long size = zone->managed_pages; 2972 unsigned long size = zone->managed_pages;
2971 unsigned long high = high_wmark_pages(zone); 2973 unsigned long high = high_wmark_pages(zone);
2972 if (size > high) 2974 if (size > high)
2973 sum += size - high; 2975 sum += size - high;
2974 } 2976 }
2975 2977
2976 return sum; 2978 return sum;
2977 } 2979 }
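As a concrete (hypothetical) reading of the loop above: if the zonelist walk visits ZONE_DMA with managed_pages = 4000 and a high watermark of 500, and ZONE_NORMAL with managed_pages = 200000 and a high watermark of 4000, nr_free_zone_pages() returns (4000 - 500) + (200000 - 4000) = 199500; a zone whose managed_pages does not exceed its high watermark contributes nothing.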
2978 2980
2979 /** 2981 /**
2980 * nr_free_buffer_pages - count number of pages beyond high watermark 2982 * nr_free_buffer_pages - count number of pages beyond high watermark
2981 * 2983 *
2982 * nr_free_buffer_pages() counts the number of pages which are beyond the high 2984 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2983 * watermark within ZONE_DMA and ZONE_NORMAL. 2985 * watermark within ZONE_DMA and ZONE_NORMAL.
2984 */ 2986 */
2985 unsigned long nr_free_buffer_pages(void) 2987 unsigned long nr_free_buffer_pages(void)
2986 { 2988 {
2987 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2989 return nr_free_zone_pages(gfp_zone(GFP_USER));
2988 } 2990 }
2989 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2991 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2990 2992
2991 /** 2993 /**
2992 * nr_free_pagecache_pages - count number of pages beyond high watermark 2994 * nr_free_pagecache_pages - count number of pages beyond high watermark
2993 * 2995 *
2994 * nr_free_pagecache_pages() counts the number of pages which are beyond the 2996 * nr_free_pagecache_pages() counts the number of pages which are beyond the
2995 * high watermark within all zones. 2997 * high watermark within all zones.
2996 */ 2998 */
2997 unsigned long nr_free_pagecache_pages(void) 2999 unsigned long nr_free_pagecache_pages(void)
2998 { 3000 {
2999 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 3001 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
3000 } 3002 }
3001 3003
3002 static inline void show_node(struct zone *zone) 3004 static inline void show_node(struct zone *zone)
3003 { 3005 {
3004 if (IS_ENABLED(CONFIG_NUMA)) 3006 if (IS_ENABLED(CONFIG_NUMA))
3005 printk("Node %d ", zone_to_nid(zone)); 3007 printk("Node %d ", zone_to_nid(zone));
3006 } 3008 }
3007 3009
3008 void si_meminfo(struct sysinfo *val) 3010 void si_meminfo(struct sysinfo *val)
3009 { 3011 {
3010 val->totalram = totalram_pages; 3012 val->totalram = totalram_pages;
3011 val->sharedram = 0; 3013 val->sharedram = 0;
3012 val->freeram = global_page_state(NR_FREE_PAGES); 3014 val->freeram = global_page_state(NR_FREE_PAGES);
3013 val->bufferram = nr_blockdev_pages(); 3015 val->bufferram = nr_blockdev_pages();
3014 val->totalhigh = totalhigh_pages; 3016 val->totalhigh = totalhigh_pages;
3015 val->freehigh = nr_free_highpages(); 3017 val->freehigh = nr_free_highpages();
3016 val->mem_unit = PAGE_SIZE; 3018 val->mem_unit = PAGE_SIZE;
3017 } 3019 }
3018 3020
3019 EXPORT_SYMBOL(si_meminfo); 3021 EXPORT_SYMBOL(si_meminfo);
3020 3022
3021 #ifdef CONFIG_NUMA 3023 #ifdef CONFIG_NUMA
3022 void si_meminfo_node(struct sysinfo *val, int nid) 3024 void si_meminfo_node(struct sysinfo *val, int nid)
3023 { 3025 {
3024 int zone_type; /* needs to be signed */ 3026 int zone_type; /* needs to be signed */
3025 unsigned long managed_pages = 0; 3027 unsigned long managed_pages = 0;
3026 pg_data_t *pgdat = NODE_DATA(nid); 3028 pg_data_t *pgdat = NODE_DATA(nid);
3027 3029
3028 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3030 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3029 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3031 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3030 val->totalram = managed_pages; 3032 val->totalram = managed_pages;
3031 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3033 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3032 #ifdef CONFIG_HIGHMEM 3034 #ifdef CONFIG_HIGHMEM
3033 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3035 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
3034 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 3036 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
3035 NR_FREE_PAGES); 3037 NR_FREE_PAGES);
3036 #else 3038 #else
3037 val->totalhigh = 0; 3039 val->totalhigh = 0;
3038 val->freehigh = 0; 3040 val->freehigh = 0;
3039 #endif 3041 #endif
3040 val->mem_unit = PAGE_SIZE; 3042 val->mem_unit = PAGE_SIZE;
3041 } 3043 }
3042 #endif 3044 #endif
3043 3045
3044 /* 3046 /*
3045 * Determine whether the node should be displayed or not, depending on whether 3047 * Determine whether the node should be displayed or not, depending on whether
3046 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 3048 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
3047 */ 3049 */
3048 bool skip_free_areas_node(unsigned int flags, int nid) 3050 bool skip_free_areas_node(unsigned int flags, int nid)
3049 { 3051 {
3050 bool ret = false; 3052 bool ret = false;
3051 unsigned int cpuset_mems_cookie; 3053 unsigned int cpuset_mems_cookie;
3052 3054
3053 if (!(flags & SHOW_MEM_FILTER_NODES)) 3055 if (!(flags & SHOW_MEM_FILTER_NODES))
3054 goto out; 3056 goto out;
3055 3057
3056 do { 3058 do {
3057 cpuset_mems_cookie = read_mems_allowed_begin(); 3059 cpuset_mems_cookie = read_mems_allowed_begin();
3058 ret = !node_isset(nid, cpuset_current_mems_allowed); 3060 ret = !node_isset(nid, cpuset_current_mems_allowed);
3059 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 3061 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3060 out: 3062 out:
3061 return ret; 3063 return ret;
3062 } 3064 }
3063 3065
3064 #define K(x) ((x) << (PAGE_SHIFT-10)) 3066 #define K(x) ((x) << (PAGE_SHIFT-10))
3065 3067
3066 static void show_migration_types(unsigned char type) 3068 static void show_migration_types(unsigned char type)
3067 { 3069 {
3068 static const char types[MIGRATE_TYPES] = { 3070 static const char types[MIGRATE_TYPES] = {
3069 [MIGRATE_UNMOVABLE] = 'U', 3071 [MIGRATE_UNMOVABLE] = 'U',
3070 [MIGRATE_RECLAIMABLE] = 'E', 3072 [MIGRATE_RECLAIMABLE] = 'E',
3071 [MIGRATE_MOVABLE] = 'M', 3073 [MIGRATE_MOVABLE] = 'M',
3072 [MIGRATE_RESERVE] = 'R', 3074 [MIGRATE_RESERVE] = 'R',
3073 #ifdef CONFIG_CMA 3075 #ifdef CONFIG_CMA
3074 [MIGRATE_CMA] = 'C', 3076 [MIGRATE_CMA] = 'C',
3075 #endif 3077 #endif
3076 #ifdef CONFIG_MEMORY_ISOLATION 3078 #ifdef CONFIG_MEMORY_ISOLATION
3077 [MIGRATE_ISOLATE] = 'I', 3079 [MIGRATE_ISOLATE] = 'I',
3078 #endif 3080 #endif
3079 }; 3081 };
3080 char tmp[MIGRATE_TYPES + 1]; 3082 char tmp[MIGRATE_TYPES + 1];
3081 char *p = tmp; 3083 char *p = tmp;
3082 int i; 3084 int i;
3083 3085
3084 for (i = 0; i < MIGRATE_TYPES; i++) { 3086 for (i = 0; i < MIGRATE_TYPES; i++) {
3085 if (type & (1 << i)) 3087 if (type & (1 << i))
3086 *p++ = types[i]; 3088 *p++ = types[i];
3087 } 3089 }
3088 3090
3089 *p = '\0'; 3091 *p = '\0';
3090 printk("(%s) ", tmp); 3092 printk("(%s) ", tmp);
3091 } 3093 }
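For example, if some order has free blocks on both the unmovable and movable free lists, the bits for those two types are set in that order's bitmap and show_migration_types() prints "(UM) " after the order's count in the buddy listing further below.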
3092 3094
3093 /* 3095 /*
3094 * Show free area list (used inside shift_scroll-lock stuff) 3096 * Show free area list (used inside shift_scroll-lock stuff)
3095 * We also calculate the percentage fragmentation. We do this by counting the 3097 * We also calculate the percentage fragmentation. We do this by counting the
3096 * memory on each free list with the exception of the first item on the list. 3098 * memory on each free list with the exception of the first item on the list.
3097 * Suppresses nodes that are not allowed by current's cpuset if 3099 * Suppresses nodes that are not allowed by current's cpuset if
3098 * SHOW_MEM_FILTER_NODES is passed. 3100 * SHOW_MEM_FILTER_NODES is passed.
3099 */ 3101 */
3100 void show_free_areas(unsigned int filter) 3102 void show_free_areas(unsigned int filter)
3101 { 3103 {
3102 int cpu; 3104 int cpu;
3103 struct zone *zone; 3105 struct zone *zone;
3104 3106
3105 for_each_populated_zone(zone) { 3107 for_each_populated_zone(zone) {
3106 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3108 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3107 continue; 3109 continue;
3108 show_node(zone); 3110 show_node(zone);
3109 printk("%s per-cpu:\n", zone->name); 3111 printk("%s per-cpu:\n", zone->name);
3110 3112
3111 for_each_online_cpu(cpu) { 3113 for_each_online_cpu(cpu) {
3112 struct per_cpu_pageset *pageset; 3114 struct per_cpu_pageset *pageset;
3113 3115
3114 pageset = per_cpu_ptr(zone->pageset, cpu); 3116 pageset = per_cpu_ptr(zone->pageset, cpu);
3115 3117
3116 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 3118 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3117 cpu, pageset->pcp.high, 3119 cpu, pageset->pcp.high,
3118 pageset->pcp.batch, pageset->pcp.count); 3120 pageset->pcp.batch, pageset->pcp.count);
3119 } 3121 }
3120 } 3122 }
3121 3123
3122 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3124 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3123 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3125 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3124 " unevictable:%lu" 3126 " unevictable:%lu"
3125 " dirty:%lu writeback:%lu unstable:%lu\n" 3127 " dirty:%lu writeback:%lu unstable:%lu\n"
3126 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 3128 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3127 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3129 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3128 " free_cma:%lu\n", 3130 " free_cma:%lu\n",
3129 global_page_state(NR_ACTIVE_ANON), 3131 global_page_state(NR_ACTIVE_ANON),
3130 global_page_state(NR_INACTIVE_ANON), 3132 global_page_state(NR_INACTIVE_ANON),
3131 global_page_state(NR_ISOLATED_ANON), 3133 global_page_state(NR_ISOLATED_ANON),
3132 global_page_state(NR_ACTIVE_FILE), 3134 global_page_state(NR_ACTIVE_FILE),
3133 global_page_state(NR_INACTIVE_FILE), 3135 global_page_state(NR_INACTIVE_FILE),
3134 global_page_state(NR_ISOLATED_FILE), 3136 global_page_state(NR_ISOLATED_FILE),
3135 global_page_state(NR_UNEVICTABLE), 3137 global_page_state(NR_UNEVICTABLE),
3136 global_page_state(NR_FILE_DIRTY), 3138 global_page_state(NR_FILE_DIRTY),
3137 global_page_state(NR_WRITEBACK), 3139 global_page_state(NR_WRITEBACK),
3138 global_page_state(NR_UNSTABLE_NFS), 3140 global_page_state(NR_UNSTABLE_NFS),
3139 global_page_state(NR_FREE_PAGES), 3141 global_page_state(NR_FREE_PAGES),
3140 global_page_state(NR_SLAB_RECLAIMABLE), 3142 global_page_state(NR_SLAB_RECLAIMABLE),
3141 global_page_state(NR_SLAB_UNRECLAIMABLE), 3143 global_page_state(NR_SLAB_UNRECLAIMABLE),
3142 global_page_state(NR_FILE_MAPPED), 3144 global_page_state(NR_FILE_MAPPED),
3143 global_page_state(NR_SHMEM), 3145 global_page_state(NR_SHMEM),
3144 global_page_state(NR_PAGETABLE), 3146 global_page_state(NR_PAGETABLE),
3145 global_page_state(NR_BOUNCE), 3147 global_page_state(NR_BOUNCE),
3146 global_page_state(NR_FREE_CMA_PAGES)); 3148 global_page_state(NR_FREE_CMA_PAGES));
3147 3149
3148 for_each_populated_zone(zone) { 3150 for_each_populated_zone(zone) {
3149 int i; 3151 int i;
3150 3152
3151 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3153 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3152 continue; 3154 continue;
3153 show_node(zone); 3155 show_node(zone);
3154 printk("%s" 3156 printk("%s"
3155 " free:%lukB" 3157 " free:%lukB"
3156 " min:%lukB" 3158 " min:%lukB"
3157 " low:%lukB" 3159 " low:%lukB"
3158 " high:%lukB" 3160 " high:%lukB"
3159 " active_anon:%lukB" 3161 " active_anon:%lukB"
3160 " inactive_anon:%lukB" 3162 " inactive_anon:%lukB"
3161 " active_file:%lukB" 3163 " active_file:%lukB"
3162 " inactive_file:%lukB" 3164 " inactive_file:%lukB"
3163 " unevictable:%lukB" 3165 " unevictable:%lukB"
3164 " isolated(anon):%lukB" 3166 " isolated(anon):%lukB"
3165 " isolated(file):%lukB" 3167 " isolated(file):%lukB"
3166 " present:%lukB" 3168 " present:%lukB"
3167 " managed:%lukB" 3169 " managed:%lukB"
3168 " mlocked:%lukB" 3170 " mlocked:%lukB"
3169 " dirty:%lukB" 3171 " dirty:%lukB"
3170 " writeback:%lukB" 3172 " writeback:%lukB"
3171 " mapped:%lukB" 3173 " mapped:%lukB"
3172 " shmem:%lukB" 3174 " shmem:%lukB"
3173 " slab_reclaimable:%lukB" 3175 " slab_reclaimable:%lukB"
3174 " slab_unreclaimable:%lukB" 3176 " slab_unreclaimable:%lukB"
3175 " kernel_stack:%lukB" 3177 " kernel_stack:%lukB"
3176 " pagetables:%lukB" 3178 " pagetables:%lukB"
3177 " unstable:%lukB" 3179 " unstable:%lukB"
3178 " bounce:%lukB" 3180 " bounce:%lukB"
3179 " free_cma:%lukB" 3181 " free_cma:%lukB"
3180 " writeback_tmp:%lukB" 3182 " writeback_tmp:%lukB"
3181 " pages_scanned:%lu" 3183 " pages_scanned:%lu"
3182 " all_unreclaimable? %s" 3184 " all_unreclaimable? %s"
3183 "\n", 3185 "\n",
3184 zone->name, 3186 zone->name,
3185 K(zone_page_state(zone, NR_FREE_PAGES)), 3187 K(zone_page_state(zone, NR_FREE_PAGES)),
3186 K(min_wmark_pages(zone)), 3188 K(min_wmark_pages(zone)),
3187 K(low_wmark_pages(zone)), 3189 K(low_wmark_pages(zone)),
3188 K(high_wmark_pages(zone)), 3190 K(high_wmark_pages(zone)),
3189 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3191 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3190 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3192 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3191 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3193 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3192 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3194 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3193 K(zone_page_state(zone, NR_UNEVICTABLE)), 3195 K(zone_page_state(zone, NR_UNEVICTABLE)),
3194 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3196 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3195 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3197 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3196 K(zone->present_pages), 3198 K(zone->present_pages),
3197 K(zone->managed_pages), 3199 K(zone->managed_pages),
3198 K(zone_page_state(zone, NR_MLOCK)), 3200 K(zone_page_state(zone, NR_MLOCK)),
3199 K(zone_page_state(zone, NR_FILE_DIRTY)), 3201 K(zone_page_state(zone, NR_FILE_DIRTY)),
3200 K(zone_page_state(zone, NR_WRITEBACK)), 3202 K(zone_page_state(zone, NR_WRITEBACK)),
3201 K(zone_page_state(zone, NR_FILE_MAPPED)), 3203 K(zone_page_state(zone, NR_FILE_MAPPED)),
3202 K(zone_page_state(zone, NR_SHMEM)), 3204 K(zone_page_state(zone, NR_SHMEM)),
3203 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3205 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3204 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3206 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3205 zone_page_state(zone, NR_KERNEL_STACK) * 3207 zone_page_state(zone, NR_KERNEL_STACK) *
3206 THREAD_SIZE / 1024, 3208 THREAD_SIZE / 1024,
3207 K(zone_page_state(zone, NR_PAGETABLE)), 3209 K(zone_page_state(zone, NR_PAGETABLE)),
3208 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3210 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3209 K(zone_page_state(zone, NR_BOUNCE)), 3211 K(zone_page_state(zone, NR_BOUNCE)),
3210 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3212 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3211 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3213 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3212 zone->pages_scanned, 3214 zone->pages_scanned,
3213 (!zone_reclaimable(zone) ? "yes" : "no") 3215 (!zone_reclaimable(zone) ? "yes" : "no")
3214 ); 3216 );
3215 printk("lowmem_reserve[]:"); 3217 printk("lowmem_reserve[]:");
3216 for (i = 0; i < MAX_NR_ZONES; i++) 3218 for (i = 0; i < MAX_NR_ZONES; i++)
3217 printk(" %lu", zone->lowmem_reserve[i]); 3219 printk(" %lu", zone->lowmem_reserve[i]);
3218 printk("\n"); 3220 printk("\n");
3219 } 3221 }
3220 3222
3221 for_each_populated_zone(zone) { 3223 for_each_populated_zone(zone) {
3222 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3224 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3223 unsigned char types[MAX_ORDER]; 3225 unsigned char types[MAX_ORDER];
3224 3226
3225 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3227 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3226 continue; 3228 continue;
3227 show_node(zone); 3229 show_node(zone);
3228 printk("%s: ", zone->name); 3230 printk("%s: ", zone->name);
3229 3231
3230 spin_lock_irqsave(&zone->lock, flags); 3232 spin_lock_irqsave(&zone->lock, flags);
3231 for (order = 0; order < MAX_ORDER; order++) { 3233 for (order = 0; order < MAX_ORDER; order++) {
3232 struct free_area *area = &zone->free_area[order]; 3234 struct free_area *area = &zone->free_area[order];
3233 int type; 3235 int type;
3234 3236
3235 nr[order] = area->nr_free; 3237 nr[order] = area->nr_free;
3236 total += nr[order] << order; 3238 total += nr[order] << order;
3237 3239
3238 types[order] = 0; 3240 types[order] = 0;
3239 for (type = 0; type < MIGRATE_TYPES; type++) { 3241 for (type = 0; type < MIGRATE_TYPES; type++) {
3240 if (!list_empty(&area->free_list[type])) 3242 if (!list_empty(&area->free_list[type]))
3241 types[order] |= 1 << type; 3243 types[order] |= 1 << type;
3242 } 3244 }
3243 } 3245 }
3244 spin_unlock_irqrestore(&zone->lock, flags); 3246 spin_unlock_irqrestore(&zone->lock, flags);
3245 for (order = 0; order < MAX_ORDER; order++) { 3247 for (order = 0; order < MAX_ORDER; order++) {
3246 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3248 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3247 if (nr[order]) 3249 if (nr[order])
3248 show_migration_types(types[order]); 3250 show_migration_types(types[order]);
3249 } 3251 }
3250 printk("= %lukB\n", K(total)); 3252 printk("= %lukB\n", K(total));
3251 } 3253 }
3252 3254
3253 hugetlb_show_meminfo(); 3255 hugetlb_show_meminfo();
3254 3256
3255 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3257 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3256 3258
3257 show_swap_cache_info(); 3259 show_swap_cache_info();
3258 } 3260 }
3259 3261
3260 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3262 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3261 { 3263 {
3262 zoneref->zone = zone; 3264 zoneref->zone = zone;
3263 zoneref->zone_idx = zone_idx(zone); 3265 zoneref->zone_idx = zone_idx(zone);
3264 } 3266 }
3265 3267
3266 /* 3268 /*
3267 * Builds allocation fallback zone lists. 3269 * Builds allocation fallback zone lists.
3268 * 3270 *
3269 * Add all populated zones of a node to the zonelist. 3271 * Add all populated zones of a node to the zonelist.
3270 */ 3272 */
3271 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3273 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3272 int nr_zones) 3274 int nr_zones)
3273 { 3275 {
3274 struct zone *zone; 3276 struct zone *zone;
3275 enum zone_type zone_type = MAX_NR_ZONES; 3277 enum zone_type zone_type = MAX_NR_ZONES;
3276 3278
3277 do { 3279 do {
3278 zone_type--; 3280 zone_type--;
3279 zone = pgdat->node_zones + zone_type; 3281 zone = pgdat->node_zones + zone_type;
3280 if (populated_zone(zone)) { 3282 if (populated_zone(zone)) {
3281 zoneref_set_zone(zone, 3283 zoneref_set_zone(zone,
3282 &zonelist->_zonerefs[nr_zones++]); 3284 &zonelist->_zonerefs[nr_zones++]);
3283 check_highest_zone(zone_type); 3285 check_highest_zone(zone_type);
3284 } 3286 }
3285 } while (zone_type); 3287 } while (zone_type);
3286 3288
3287 return nr_zones; 3289 return nr_zones;
3288 } 3290 }
3289 3291
3290 3292
3291 /* 3293 /*
3292 * zonelist_order: 3294 * zonelist_order:
3293 * 0 = automatic detection of better ordering. 3295 * 0 = automatic detection of better ordering.
3294 * 1 = order by ([node] distance, -zonetype) 3296 * 1 = order by ([node] distance, -zonetype)
3295 * 2 = order by (-zonetype, [node] distance) 3297 * 2 = order by (-zonetype, [node] distance)
3296 * 3298 *
3297 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3299 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3298 * the same zonelist. So only NUMA can configure this param. 3300 * the same zonelist. So only NUMA can configure this param.
3299 */ 3301 */
3300 #define ZONELIST_ORDER_DEFAULT 0 3302 #define ZONELIST_ORDER_DEFAULT 0
3301 #define ZONELIST_ORDER_NODE 1 3303 #define ZONELIST_ORDER_NODE 1
3302 #define ZONELIST_ORDER_ZONE 2 3304 #define ZONELIST_ORDER_ZONE 2
3303 3305
3304 /* zonelist order in the kernel. 3306 /* zonelist order in the kernel.
3305 * set_zonelist_order() will set this to NODE or ZONE. 3307 * set_zonelist_order() will set this to NODE or ZONE.
3306 */ 3308 */
3307 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3309 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3308 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3310 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3309 3311
3310 3312
3311 #ifdef CONFIG_NUMA 3313 #ifdef CONFIG_NUMA
3312 /* The value the user specified, possibly changed by config */ 3314 /* The value the user specified, possibly changed by config */
3313 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3315 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3314 /* string for sysctl */ 3316 /* string for sysctl */
3315 #define NUMA_ZONELIST_ORDER_LEN 16 3317 #define NUMA_ZONELIST_ORDER_LEN 16
3316 char numa_zonelist_order[16] = "default"; 3318 char numa_zonelist_order[16] = "default";
3317 3319
3318 /* 3320 /*
3319 * interface for configuring zonelist ordering. 3321 * interface for configuring zonelist ordering.
3320 * command line option "numa_zonelist_order" 3322 * command line option "numa_zonelist_order"
3321 * = "[dD]efault" - default, automatic configuration. 3323 * = "[dD]efault" - default, automatic configuration.
3322 * = "[nN]ode" - order by node locality, then by zone within node 3324 * = "[nN]ode" - order by node locality, then by zone within node
3323 * = "[zZ]one" - order by zone, then by locality within zone 3325 * = "[zZ]one" - order by zone, then by locality within zone
3324 */ 3326 */
3325 3327
3326 static int __parse_numa_zonelist_order(char *s) 3328 static int __parse_numa_zonelist_order(char *s)
3327 { 3329 {
3328 if (*s == 'd' || *s == 'D') { 3330 if (*s == 'd' || *s == 'D') {
3329 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3331 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3330 } else if (*s == 'n' || *s == 'N') { 3332 } else if (*s == 'n' || *s == 'N') {
3331 user_zonelist_order = ZONELIST_ORDER_NODE; 3333 user_zonelist_order = ZONELIST_ORDER_NODE;
3332 } else if (*s == 'z' || *s == 'Z') { 3334 } else if (*s == 'z' || *s == 'Z') {
3333 user_zonelist_order = ZONELIST_ORDER_ZONE; 3335 user_zonelist_order = ZONELIST_ORDER_ZONE;
3334 } else { 3336 } else {
3335 printk(KERN_WARNING 3337 printk(KERN_WARNING
3336 "Ignoring invalid numa_zonelist_order value: " 3338 "Ignoring invalid numa_zonelist_order value: "
3337 "%s\n", s); 3339 "%s\n", s);
3338 return -EINVAL; 3340 return -EINVAL;
3339 } 3341 }
3340 return 0; 3342 return 0;
3341 } 3343 }
3342 3344
3343 static __init int setup_numa_zonelist_order(char *s) 3345 static __init int setup_numa_zonelist_order(char *s)
3344 { 3346 {
3345 int ret; 3347 int ret;
3346 3348
3347 if (!s) 3349 if (!s)
3348 return 0; 3350 return 0;
3349 3351
3350 ret = __parse_numa_zonelist_order(s); 3352 ret = __parse_numa_zonelist_order(s);
3351 if (ret == 0) 3353 if (ret == 0)
3352 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3354 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3353 3355
3354 return ret; 3356 return ret;
3355 } 3357 }
3356 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3358 early_param("numa_zonelist_order", setup_numa_zonelist_order);
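In practice the ordering is selected either at boot time, e.g. by adding numa_zonelist_order=zone to the kernel command line, or at run time by writing "default", "node" or "zone" to /proc/sys/vm/numa_zonelist_order, which is serviced by the sysctl handler that follows.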
3357 3359
3358 /* 3360 /*
3359 * sysctl handler for numa_zonelist_order 3361 * sysctl handler for numa_zonelist_order
3360 */ 3362 */
3361 int numa_zonelist_order_handler(ctl_table *table, int write, 3363 int numa_zonelist_order_handler(ctl_table *table, int write,
3362 void __user *buffer, size_t *length, 3364 void __user *buffer, size_t *length,
3363 loff_t *ppos) 3365 loff_t *ppos)
3364 { 3366 {
3365 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3367 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3366 int ret; 3368 int ret;
3367 static DEFINE_MUTEX(zl_order_mutex); 3369 static DEFINE_MUTEX(zl_order_mutex);
3368 3370
3369 mutex_lock(&zl_order_mutex); 3371 mutex_lock(&zl_order_mutex);
3370 if (write) { 3372 if (write) {
3371 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 3373 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3372 ret = -EINVAL; 3374 ret = -EINVAL;
3373 goto out; 3375 goto out;
3374 } 3376 }
3375 strcpy(saved_string, (char *)table->data); 3377 strcpy(saved_string, (char *)table->data);
3376 } 3378 }
3377 ret = proc_dostring(table, write, buffer, length, ppos); 3379 ret = proc_dostring(table, write, buffer, length, ppos);
3378 if (ret) 3380 if (ret)
3379 goto out; 3381 goto out;
3380 if (write) { 3382 if (write) {
3381 int oldval = user_zonelist_order; 3383 int oldval = user_zonelist_order;
3382 3384
3383 ret = __parse_numa_zonelist_order((char *)table->data); 3385 ret = __parse_numa_zonelist_order((char *)table->data);
3384 if (ret) { 3386 if (ret) {
3385 /* 3387 /*
3386 * bogus value. restore saved string 3388 * bogus value. restore saved string
3387 */ 3389 */
3388 strncpy((char *)table->data, saved_string, 3390 strncpy((char *)table->data, saved_string,
3389 NUMA_ZONELIST_ORDER_LEN); 3391 NUMA_ZONELIST_ORDER_LEN);
3390 user_zonelist_order = oldval; 3392 user_zonelist_order = oldval;
3391 } else if (oldval != user_zonelist_order) { 3393 } else if (oldval != user_zonelist_order) {
3392 mutex_lock(&zonelists_mutex); 3394 mutex_lock(&zonelists_mutex);
3393 build_all_zonelists(NULL, NULL); 3395 build_all_zonelists(NULL, NULL);
3394 mutex_unlock(&zonelists_mutex); 3396 mutex_unlock(&zonelists_mutex);
3395 } 3397 }
3396 } 3398 }
3397 out: 3399 out:
3398 mutex_unlock(&zl_order_mutex); 3400 mutex_unlock(&zl_order_mutex);
3399 return ret; 3401 return ret;
3400 } 3402 }
3401 3403
3402 3404
3403 #define MAX_NODE_LOAD (nr_online_nodes) 3405 #define MAX_NODE_LOAD (nr_online_nodes)
3404 static int node_load[MAX_NUMNODES]; 3406 static int node_load[MAX_NUMNODES];
3405 3407
3406 /** 3408 /**
3407 * find_next_best_node - find the next node that should appear in a given node's fallback list 3409 * find_next_best_node - find the next node that should appear in a given node's fallback list
3408 * @node: node whose fallback list we're appending 3410 * @node: node whose fallback list we're appending
3409 * @used_node_mask: nodemask_t of already used nodes 3411 * @used_node_mask: nodemask_t of already used nodes
3410 * 3412 *
3411 * We use a number of factors to determine which is the next node that should 3413 * We use a number of factors to determine which is the next node that should
3412 * appear on a given node's fallback list. The node should not have appeared 3414 * appear on a given node's fallback list. The node should not have appeared
3413 * already in @node's fallback list, and it should be the next closest node 3415 * already in @node's fallback list, and it should be the next closest node
3414 * according to the distance array (which contains arbitrary distance values 3416 * according to the distance array (which contains arbitrary distance values
3415 * from each node to each node in the system), and should also prefer nodes 3417 * from each node to each node in the system), and should also prefer nodes
3416 * with no CPUs, since presumably they'll have very little allocation pressure 3418 * with no CPUs, since presumably they'll have very little allocation pressure
3417 * on them otherwise. 3419 * on them otherwise.
3418 * It returns -1 if no node is found. 3420 * It returns -1 if no node is found.
3419 */ 3421 */
3420 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3422 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3421 { 3423 {
3422 int n, val; 3424 int n, val;
3423 int min_val = INT_MAX; 3425 int min_val = INT_MAX;
3424 int best_node = NUMA_NO_NODE; 3426 int best_node = NUMA_NO_NODE;
3425 const struct cpumask *tmp = cpumask_of_node(0); 3427 const struct cpumask *tmp = cpumask_of_node(0);
3426 3428
3427 /* Use the local node if we haven't already */ 3429 /* Use the local node if we haven't already */
3428 if (!node_isset(node, *used_node_mask)) { 3430 if (!node_isset(node, *used_node_mask)) {
3429 node_set(node, *used_node_mask); 3431 node_set(node, *used_node_mask);
3430 return node; 3432 return node;
3431 } 3433 }
3432 3434
3433 for_each_node_state(n, N_MEMORY) { 3435 for_each_node_state(n, N_MEMORY) {
3434 3436
3435 /* Don't want a node to appear more than once */ 3437 /* Don't want a node to appear more than once */
3436 if (node_isset(n, *used_node_mask)) 3438 if (node_isset(n, *used_node_mask))
3437 continue; 3439 continue;
3438 3440
3439 /* Use the distance array to find the distance */ 3441 /* Use the distance array to find the distance */
3440 val = node_distance(node, n); 3442 val = node_distance(node, n);
3441 3443
3442 /* Penalize nodes under us ("prefer the next node") */ 3444 /* Penalize nodes under us ("prefer the next node") */
3443 val += (n < node); 3445 val += (n < node);
3444 3446
3445 /* Give preference to headless and unused nodes */ 3447 /* Give preference to headless and unused nodes */
3446 tmp = cpumask_of_node(n); 3448 tmp = cpumask_of_node(n);
3447 if (!cpumask_empty(tmp)) 3449 if (!cpumask_empty(tmp))
3448 val += PENALTY_FOR_NODE_WITH_CPUS; 3450 val += PENALTY_FOR_NODE_WITH_CPUS;
3449 3451
3450 /* Slight preference for less loaded node */ 3452 /* Slight preference for less loaded node */
3451 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3453 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3452 val += node_load[n]; 3454 val += node_load[n];
3453 3455
3454 if (val < min_val) { 3456 if (val < min_val) {
3455 min_val = val; 3457 min_val = val;
3456 best_node = n; 3458 best_node = n;
3457 } 3459 }
3458 } 3460 }
3459 3461
3460 if (best_node >= 0) 3462 if (best_node >= 0)
3461 node_set(best_node, *used_node_mask); 3463 node_set(best_node, *used_node_mask);
3462 3464
3463 return best_node; 3465 return best_node;
3464 } 3466 }
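A note on the scoring above: the distance, the prefer-next-node bias and the has-CPUs penalty are combined first and then scaled by MAX_NODE_LOAD * MAX_NUMNODES, while node_load is added only after that scaling; node_load can therefore only break ties between candidates whose penalised distances are equal, which is what gives less-loaded nodes their slight preference.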
3465 3467
3466 3468
3467 /* 3469 /*
3468 * Build zonelists ordered by node and zones within node. 3470 * Build zonelists ordered by node and zones within node.
3469 * This results in maximum locality--normal zone overflows into local 3471 * This results in maximum locality--normal zone overflows into local
3470 * DMA zone, if any--but risks exhausting DMA zone. 3472 * DMA zone, if any--but risks exhausting DMA zone.
3471 */ 3473 */
3472 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3474 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3473 { 3475 {
3474 int j; 3476 int j;
3475 struct zonelist *zonelist; 3477 struct zonelist *zonelist;
3476 3478
3477 zonelist = &pgdat->node_zonelists[0]; 3479 zonelist = &pgdat->node_zonelists[0];
3478 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3480 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3479 ; 3481 ;
3480 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3482 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3481 zonelist->_zonerefs[j].zone = NULL; 3483 zonelist->_zonerefs[j].zone = NULL;
3482 zonelist->_zonerefs[j].zone_idx = 0; 3484 zonelist->_zonerefs[j].zone_idx = 0;
3483 } 3485 }
3484 3486
3485 /* 3487 /*
3486 * Build gfp_thisnode zonelists 3488 * Build gfp_thisnode zonelists
3487 */ 3489 */
3488 static void build_thisnode_zonelists(pg_data_t *pgdat) 3490 static void build_thisnode_zonelists(pg_data_t *pgdat)
3489 { 3491 {
3490 int j; 3492 int j;
3491 struct zonelist *zonelist; 3493 struct zonelist *zonelist;
3492 3494
3493 zonelist = &pgdat->node_zonelists[1]; 3495 zonelist = &pgdat->node_zonelists[1];
3494 j = build_zonelists_node(pgdat, zonelist, 0); 3496 j = build_zonelists_node(pgdat, zonelist, 0);
3495 zonelist->_zonerefs[j].zone = NULL; 3497 zonelist->_zonerefs[j].zone = NULL;
3496 zonelist->_zonerefs[j].zone_idx = 0; 3498 zonelist->_zonerefs[j].zone_idx = 0;
3497 } 3499 }
3498 3500
3499 /* 3501 /*
3500 * Build zonelists ordered by zone and nodes within zones. 3502 * Build zonelists ordered by zone and nodes within zones.
3501 * This results in conserving DMA zone[s] until all Normal memory is 3503 * This results in conserving DMA zone[s] until all Normal memory is
3502 * exhausted, but results in overflowing to a remote node while memory 3504 * exhausted, but results in overflowing to a remote node while memory
3503 * may still exist in local DMA zone. 3505 * may still exist in local DMA zone.
3504 */ 3506 */
3505 static int node_order[MAX_NUMNODES]; 3507 static int node_order[MAX_NUMNODES];
3506 3508
3507 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3509 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3508 { 3510 {
3509 int pos, j, node; 3511 int pos, j, node;
3510 int zone_type; /* needs to be signed */ 3512 int zone_type; /* needs to be signed */
3511 struct zone *z; 3513 struct zone *z;
3512 struct zonelist *zonelist; 3514 struct zonelist *zonelist;
3513 3515
3514 zonelist = &pgdat->node_zonelists[0]; 3516 zonelist = &pgdat->node_zonelists[0];
3515 pos = 0; 3517 pos = 0;
3516 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3518 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3517 for (j = 0; j < nr_nodes; j++) { 3519 for (j = 0; j < nr_nodes; j++) {
3518 node = node_order[j]; 3520 node = node_order[j];
3519 z = &NODE_DATA(node)->node_zones[zone_type]; 3521 z = &NODE_DATA(node)->node_zones[zone_type];
3520 if (populated_zone(z)) { 3522 if (populated_zone(z)) {
3521 zoneref_set_zone(z, 3523 zoneref_set_zone(z,
3522 &zonelist->_zonerefs[pos++]); 3524 &zonelist->_zonerefs[pos++]);
3523 check_highest_zone(zone_type); 3525 check_highest_zone(zone_type);
3524 } 3526 }
3525 } 3527 }
3526 } 3528 }
3527 zonelist->_zonerefs[pos].zone = NULL; 3529 zonelist->_zonerefs[pos].zone = NULL;
3528 zonelist->_zonerefs[pos].zone_idx = 0; 3530 zonelist->_zonerefs[pos].zone_idx = 0;
3529 } 3531 }
3530 3532
3531 static int default_zonelist_order(void) 3533 static int default_zonelist_order(void)
3532 { 3534 {
3533 int nid, zone_type; 3535 int nid, zone_type;
3534 unsigned long low_kmem_size, total_size; 3536 unsigned long low_kmem_size, total_size;
3535 struct zone *z; 3537 struct zone *z;
3536 int average_size; 3538 int average_size;
3537 /* 3539 /*
3538 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. 3540 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
3539 * If they are really small and used heavily, the system can fall 3541 * If they are really small and used heavily, the system can fall
3540 * into OOM very easily. 3542 * into OOM very easily.
3541 * This function detects the ZONE_DMA/DMA32 size and configures the zone order. 3543 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
3542 */ 3544 */
3543 /* Is there ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */ 3545 /* Is there ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */
3544 low_kmem_size = 0; 3546 low_kmem_size = 0;
3545 total_size = 0; 3547 total_size = 0;
3546 for_each_online_node(nid) { 3548 for_each_online_node(nid) {
3547 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3549 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3548 z = &NODE_DATA(nid)->node_zones[zone_type]; 3550 z = &NODE_DATA(nid)->node_zones[zone_type];
3549 if (populated_zone(z)) { 3551 if (populated_zone(z)) {
3550 if (zone_type < ZONE_NORMAL) 3552 if (zone_type < ZONE_NORMAL)
3551 low_kmem_size += z->managed_pages; 3553 low_kmem_size += z->managed_pages;
3552 total_size += z->managed_pages; 3554 total_size += z->managed_pages;
3553 } else if (zone_type == ZONE_NORMAL) { 3555 } else if (zone_type == ZONE_NORMAL) {
3554 /* 3556 /*
3555 * If any node has only lowmem, then node order 3557 * If any node has only lowmem, then node order
3556 * is preferred to allow kernel allocations 3558 * is preferred to allow kernel allocations
3557 * locally; otherwise, they can easily infringe 3559 * locally; otherwise, they can easily infringe
3558 * on other nodes when there is an abundance of 3560 * on other nodes when there is an abundance of
3559 * lowmem available to allocate from. 3561 * lowmem available to allocate from.
3560 */ 3562 */
3561 return ZONELIST_ORDER_NODE; 3563 return ZONELIST_ORDER_NODE;
3562 } 3564 }
3563 } 3565 }
3564 } 3566 }
3565 if (!low_kmem_size || /* there is no DMA area. */ 3567 if (!low_kmem_size || /* there is no DMA area. */
3566 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3568 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3567 return ZONELIST_ORDER_NODE; 3569 return ZONELIST_ORDER_NODE;
3568 /* 3570 /*
3569 * look into each node's config. 3571 * look into each node's config.
3570 * If there is a node whose DMA/DMA32 memory makes up a very large share of 3572 * If there is a node whose DMA/DMA32 memory makes up a very large share of
3571 * its local memory, NODE_ORDER may be suitable. 3573 * its local memory, NODE_ORDER may be suitable.
3572 */ 3574 */
3573 average_size = total_size / 3575 average_size = total_size /
3574 (nodes_weight(node_states[N_MEMORY]) + 1); 3576 (nodes_weight(node_states[N_MEMORY]) + 1);
3575 for_each_online_node(nid) { 3577 for_each_online_node(nid) {
3576 low_kmem_size = 0; 3578 low_kmem_size = 0;
3577 total_size = 0; 3579 total_size = 0;
3578 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3580 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3579 z = &NODE_DATA(nid)->node_zones[zone_type]; 3581 z = &NODE_DATA(nid)->node_zones[zone_type];
3580 if (populated_zone(z)) { 3582 if (populated_zone(z)) {
3581 if (zone_type < ZONE_NORMAL) 3583 if (zone_type < ZONE_NORMAL)
3582 low_kmem_size += z->present_pages; 3584 low_kmem_size += z->present_pages;
3583 total_size += z->present_pages; 3585 total_size += z->present_pages;
3584 } 3586 }
3585 } 3587 }
3586 if (low_kmem_size && 3588 if (low_kmem_size &&
3587 total_size > average_size && /* ignore small node */ 3589 total_size > average_size && /* ignore small node */
3588 low_kmem_size > total_size * 70/100) 3590 low_kmem_size > total_size * 70/100)
3589 return ZONELIST_ORDER_NODE; 3591 return ZONELIST_ORDER_NODE;
3590 } 3592 }
3591 return ZONELIST_ORDER_ZONE; 3593 return ZONELIST_ORDER_ZONE;
3592 } 3594 }
3593 3595
3594 static void set_zonelist_order(void) 3596 static void set_zonelist_order(void)
3595 { 3597 {
3596 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3598 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3597 current_zonelist_order = default_zonelist_order(); 3599 current_zonelist_order = default_zonelist_order();
3598 else 3600 else
3599 current_zonelist_order = user_zonelist_order; 3601 current_zonelist_order = user_zonelist_order;
3600 } 3602 }
3601 3603
3602 static void build_zonelists(pg_data_t *pgdat) 3604 static void build_zonelists(pg_data_t *pgdat)
3603 { 3605 {
3604 int j, node, load; 3606 int j, node, load;
3605 enum zone_type i; 3607 enum zone_type i;
3606 nodemask_t used_mask; 3608 nodemask_t used_mask;
3607 int local_node, prev_node; 3609 int local_node, prev_node;
3608 struct zonelist *zonelist; 3610 struct zonelist *zonelist;
3609 int order = current_zonelist_order; 3611 int order = current_zonelist_order;
3610 3612
3611 /* initialize zonelists */ 3613 /* initialize zonelists */
3612 for (i = 0; i < MAX_ZONELISTS; i++) { 3614 for (i = 0; i < MAX_ZONELISTS; i++) {
3613 zonelist = pgdat->node_zonelists + i; 3615 zonelist = pgdat->node_zonelists + i;
3614 zonelist->_zonerefs[0].zone = NULL; 3616 zonelist->_zonerefs[0].zone = NULL;
3615 zonelist->_zonerefs[0].zone_idx = 0; 3617 zonelist->_zonerefs[0].zone_idx = 0;
3616 } 3618 }
3617 3619
3618 /* NUMA-aware ordering of nodes */ 3620 /* NUMA-aware ordering of nodes */
3619 local_node = pgdat->node_id; 3621 local_node = pgdat->node_id;
3620 load = nr_online_nodes; 3622 load = nr_online_nodes;
3621 prev_node = local_node; 3623 prev_node = local_node;
3622 nodes_clear(used_mask); 3624 nodes_clear(used_mask);
3623 3625
3624 memset(node_order, 0, sizeof(node_order)); 3626 memset(node_order, 0, sizeof(node_order));
3625 j = 0; 3627 j = 0;
3626 3628
3627 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3629 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3628 /* 3630 /*
3629 * We don't want to pressure a particular node. 3631 * We don't want to pressure a particular node.
3630 * So add a penalty to the first node in the same 3632 * So add a penalty to the first node in the same
3631 * distance group to make it round-robin. 3633 * distance group to make it round-robin.
3632 */ 3634 */
3633 if (node_distance(local_node, node) != 3635 if (node_distance(local_node, node) !=
3634 node_distance(local_node, prev_node)) 3636 node_distance(local_node, prev_node))
3635 node_load[node] = load; 3637 node_load[node] = load;
3636 3638
3637 prev_node = node; 3639 prev_node = node;
3638 load--; 3640 load--;
3639 if (order == ZONELIST_ORDER_NODE) 3641 if (order == ZONELIST_ORDER_NODE)
3640 build_zonelists_in_node_order(pgdat, node); 3642 build_zonelists_in_node_order(pgdat, node);
3641 else 3643 else
3642 node_order[j++] = node; /* remember order */ 3644 node_order[j++] = node; /* remember order */
3643 } 3645 }
3644 3646
3645 if (order == ZONELIST_ORDER_ZONE) { 3647 if (order == ZONELIST_ORDER_ZONE) {
3646 /* calculate node order -- i.e., DMA last! */ 3648 /* calculate node order -- i.e., DMA last! */
3647 build_zonelists_in_zone_order(pgdat, j); 3649 build_zonelists_in_zone_order(pgdat, j);
3648 } 3650 }
3649 3651
3650 build_thisnode_zonelists(pgdat); 3652 build_thisnode_zonelists(pgdat);
3651 } 3653 }
3652 3654
3653 /* Construct the zonelist performance cache - see further mmzone.h */ 3655 /* Construct the zonelist performance cache - see further mmzone.h */
3654 static void build_zonelist_cache(pg_data_t *pgdat) 3656 static void build_zonelist_cache(pg_data_t *pgdat)
3655 { 3657 {
3656 struct zonelist *zonelist; 3658 struct zonelist *zonelist;
3657 struct zonelist_cache *zlc; 3659 struct zonelist_cache *zlc;
3658 struct zoneref *z; 3660 struct zoneref *z;
3659 3661
3660 zonelist = &pgdat->node_zonelists[0]; 3662 zonelist = &pgdat->node_zonelists[0];
3661 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3663 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3662 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3664 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3663 for (z = zonelist->_zonerefs; z->zone; z++) 3665 for (z = zonelist->_zonerefs; z->zone; z++)
3664 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3666 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3665 } 3667 }
3666 3668
3667 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3669 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3668 /* 3670 /*
3669 * Return node id of node used for "local" allocations. 3671 * Return node id of node used for "local" allocations.
3670 * I.e., first node id of first zone in arg node's generic zonelist. 3672 * I.e., first node id of first zone in arg node's generic zonelist.
3671 * Used for initializing percpu 'numa_mem', which is used primarily 3673 * Used for initializing percpu 'numa_mem', which is used primarily
3672 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3674 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3673 */ 3675 */
3674 int local_memory_node(int node) 3676 int local_memory_node(int node)
3675 { 3677 {
3676 struct zone *zone; 3678 struct zone *zone;
3677 3679
3678 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3680 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3679 gfp_zone(GFP_KERNEL), 3681 gfp_zone(GFP_KERNEL),
3680 NULL, 3682 NULL,
3681 &zone); 3683 &zone);
3682 return zone->node; 3684 return zone->node;
3683 } 3685 }
3684 #endif 3686 #endif
3685 3687
3686 #else /* CONFIG_NUMA */ 3688 #else /* CONFIG_NUMA */
3687 3689
3688 static void set_zonelist_order(void) 3690 static void set_zonelist_order(void)
3689 { 3691 {
3690 current_zonelist_order = ZONELIST_ORDER_ZONE; 3692 current_zonelist_order = ZONELIST_ORDER_ZONE;
3691 } 3693 }
3692 3694
3693 static void build_zonelists(pg_data_t *pgdat) 3695 static void build_zonelists(pg_data_t *pgdat)
3694 { 3696 {
3695 int node, local_node; 3697 int node, local_node;
3696 enum zone_type j; 3698 enum zone_type j;
3697 struct zonelist *zonelist; 3699 struct zonelist *zonelist;
3698 3700
3699 local_node = pgdat->node_id; 3701 local_node = pgdat->node_id;
3700 3702
3701 zonelist = &pgdat->node_zonelists[0]; 3703 zonelist = &pgdat->node_zonelists[0];
3702 j = build_zonelists_node(pgdat, zonelist, 0); 3704 j = build_zonelists_node(pgdat, zonelist, 0);
3703 3705
3704 /* 3706 /*
3705 * Now we build the zonelist so that it contains the zones 3707 * Now we build the zonelist so that it contains the zones
3706 * of all the other nodes. 3708 * of all the other nodes.
3707 * We don't want to pressure a particular node, so when 3709 * We don't want to pressure a particular node, so when
3708 * building the zones for node N, we make sure that the 3710 * building the zones for node N, we make sure that the
3709 * zones coming right after the local ones are those from 3711 * zones coming right after the local ones are those from
3710 * node N+1 (modulo the number of nodes) 3712 * node N+1 (modulo the number of nodes)
3711 */ 3713 */
3712 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3714 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3713 if (!node_online(node)) 3715 if (!node_online(node))
3714 continue; 3716 continue;
3715 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3717 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3716 } 3718 }
3717 for (node = 0; node < local_node; node++) { 3719 for (node = 0; node < local_node; node++) {
3718 if (!node_online(node)) 3720 if (!node_online(node))
3719 continue; 3721 continue;
3720 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3722 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3721 } 3723 }
3722 3724
3723 zonelist->_zonerefs[j].zone = NULL; 3725 zonelist->_zonerefs[j].zone = NULL;
3724 zonelist->_zonerefs[j].zone_idx = 0; 3726 zonelist->_zonerefs[j].zone_idx = 0;
3725 } 3727 }
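[Editorial note] The rotation described in the comment above is easy to see in a standalone sketch; the node count and local node below are hypothetical example values:

#include <stdio.h>

#define NR_NODES 4	/* hypothetical node count, for illustration only */

/*
 * Sketch of the ordering built above: the local node first, then the
 * remaining nodes rotated so node N is followed by N+1, N+2, ...
 * wrapping around to 0.
 */
int main(void)
{
	int local_node = 2, node;

	printf("%d", local_node);
	for (node = local_node + 1; node < NR_NODES; node++)
		printf(" %d", node);
	for (node = 0; node < local_node; node++)
		printf(" %d", node);
	printf("\n");	/* prints: 2 3 0 1 */
	return 0;
}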
3726 3728
3727 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3729 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3728 static void build_zonelist_cache(pg_data_t *pgdat) 3730 static void build_zonelist_cache(pg_data_t *pgdat)
3729 { 3731 {
3730 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3732 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3731 } 3733 }
3732 3734
3733 #endif /* CONFIG_NUMA */ 3735 #endif /* CONFIG_NUMA */
3734 3736
3735 /* 3737 /*
3736 * Boot pageset table. One per cpu which is going to be used for all 3738 * Boot pageset table. One per cpu which is going to be used for all
3737 * zones and all nodes. The parameters will be set in such a way 3739 * zones and all nodes. The parameters will be set in such a way
3738 * that an item put on a list will immediately be handed over to 3740 * that an item put on a list will immediately be handed over to
3739 * the buddy list. This is safe since pageset manipulation is done 3741 * the buddy list. This is safe since pageset manipulation is done
3740 * with interrupts disabled. 3742 * with interrupts disabled.
3741 * 3743 *
3742 * The boot_pagesets must be kept even after bootup is complete for 3744 * The boot_pagesets must be kept even after bootup is complete for
3743 * unused processors and/or zones. They do play a role for bootstrapping 3745 * unused processors and/or zones. They do play a role for bootstrapping
3744 * hotplugged processors. 3746 * hotplugged processors.
3745 * 3747 *
3746 * zoneinfo_show() and maybe other functions do 3748 * zoneinfo_show() and maybe other functions do
3747 * not check if the processor is online before following the pageset pointer. 3749 * not check if the processor is online before following the pageset pointer.
3748 * Other parts of the kernel may not check if the zone is available. 3750 * Other parts of the kernel may not check if the zone is available.
3749 */ 3751 */
3750 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3752 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3751 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3753 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3752 static void setup_zone_pageset(struct zone *zone); 3754 static void setup_zone_pageset(struct zone *zone);
3753 3755
3754 /* 3756 /*
3755 * Global mutex to protect against size modification of zonelists 3757 * Global mutex to protect against size modification of zonelists
3756 * as well as to serialize pageset setup for the new populated zone. 3758 * as well as to serialize pageset setup for the new populated zone.
3757 */ 3759 */
3758 DEFINE_MUTEX(zonelists_mutex); 3760 DEFINE_MUTEX(zonelists_mutex);
3759 3761
3760 /* the return value is int only because stop_machine() requires it */ 3762 /* the return value is int only because stop_machine() requires it */
3761 static int __build_all_zonelists(void *data) 3763 static int __build_all_zonelists(void *data)
3762 { 3764 {
3763 int nid; 3765 int nid;
3764 int cpu; 3766 int cpu;
3765 pg_data_t *self = data; 3767 pg_data_t *self = data;
3766 3768
3767 #ifdef CONFIG_NUMA 3769 #ifdef CONFIG_NUMA
3768 memset(node_load, 0, sizeof(node_load)); 3770 memset(node_load, 0, sizeof(node_load));
3769 #endif 3771 #endif
3770 3772
3771 if (self && !node_online(self->node_id)) { 3773 if (self && !node_online(self->node_id)) {
3772 build_zonelists(self); 3774 build_zonelists(self);
3773 build_zonelist_cache(self); 3775 build_zonelist_cache(self);
3774 } 3776 }
3775 3777
3776 for_each_online_node(nid) { 3778 for_each_online_node(nid) {
3777 pg_data_t *pgdat = NODE_DATA(nid); 3779 pg_data_t *pgdat = NODE_DATA(nid);
3778 3780
3779 build_zonelists(pgdat); 3781 build_zonelists(pgdat);
3780 build_zonelist_cache(pgdat); 3782 build_zonelist_cache(pgdat);
3781 } 3783 }
3782 3784
3783 /* 3785 /*
3784 * Initialize the boot_pagesets that are going to be used 3786 * Initialize the boot_pagesets that are going to be used
3785 * for bootstrapping processors. The real pagesets for 3787 * for bootstrapping processors. The real pagesets for
3786 * each zone will be allocated later when the per cpu 3788 * each zone will be allocated later when the per cpu
3787 * allocator is available. 3789 * allocator is available.
3788 * 3790 *
3789 * boot_pagesets are used also for bootstrapping offline 3791 * boot_pagesets are used also for bootstrapping offline
3790 * cpus if the system is already booted because the pagesets 3792 * cpus if the system is already booted because the pagesets
3791 * are needed to initialize allocators on a specific cpu too. 3793 * are needed to initialize allocators on a specific cpu too.
3792 * F.e. the percpu allocator needs the page allocator which 3794 * F.e. the percpu allocator needs the page allocator which
3793 * needs the percpu allocator in order to allocate its pagesets 3795 * needs the percpu allocator in order to allocate its pagesets
3794 * (a chicken-egg dilemma). 3796 * (a chicken-egg dilemma).
3795 */ 3797 */
3796 for_each_possible_cpu(cpu) { 3798 for_each_possible_cpu(cpu) {
3797 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3799 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3798 3800
3799 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3801 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3800 /* 3802 /*
3801 * We now know the "local memory node" for each node-- 3803 * We now know the "local memory node" for each node--
3802 * i.e., the node of the first zone in the generic zonelist. 3804 * i.e., the node of the first zone in the generic zonelist.
3803 * Set up numa_mem percpu variable for on-line cpus. During 3805 * Set up numa_mem percpu variable for on-line cpus. During
3804 * boot, only the boot cpu should be on-line; we'll init the 3806 * boot, only the boot cpu should be on-line; we'll init the
3805 * secondary cpus' numa_mem as they come on-line. During 3807 * secondary cpus' numa_mem as they come on-line. During
3806 * node/memory hotplug, we'll fixup all on-line cpus. 3808 * node/memory hotplug, we'll fixup all on-line cpus.
3807 */ 3809 */
3808 if (cpu_online(cpu)) 3810 if (cpu_online(cpu))
3809 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3811 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3810 #endif 3812 #endif
3811 } 3813 }
3812 3814
3813 return 0; 3815 return 0;
3814 } 3816 }
3815 3817
3816 /* 3818 /*
3817 * Called with zonelists_mutex held always 3819 * Called with zonelists_mutex held always
3818 * unless system_state == SYSTEM_BOOTING. 3820 * unless system_state == SYSTEM_BOOTING.
3819 */ 3821 */
3820 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3822 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3821 { 3823 {
3822 set_zonelist_order(); 3824 set_zonelist_order();
3823 3825
3824 if (system_state == SYSTEM_BOOTING) { 3826 if (system_state == SYSTEM_BOOTING) {
3825 __build_all_zonelists(NULL); 3827 __build_all_zonelists(NULL);
3826 mminit_verify_zonelist(); 3828 mminit_verify_zonelist();
3827 cpuset_init_current_mems_allowed(); 3829 cpuset_init_current_mems_allowed();
3828 } else { 3830 } else {
3829 #ifdef CONFIG_MEMORY_HOTPLUG 3831 #ifdef CONFIG_MEMORY_HOTPLUG
3830 if (zone) 3832 if (zone)
3831 setup_zone_pageset(zone); 3833 setup_zone_pageset(zone);
3832 #endif 3834 #endif
3833 /* we have to stop all cpus to guarantee there is no user 3835 /* we have to stop all cpus to guarantee there is no user
3834 of the zonelist */ 3836 of the zonelist */
3835 stop_machine(__build_all_zonelists, pgdat, NULL); 3837 stop_machine(__build_all_zonelists, pgdat, NULL);
3836 /* cpuset refresh routine should be here */ 3838 /* cpuset refresh routine should be here */
3837 } 3839 }
3838 vm_total_pages = nr_free_pagecache_pages(); 3840 vm_total_pages = nr_free_pagecache_pages();
3839 /* 3841 /*
3840 * Disable grouping by mobility if the number of pages in the 3842 * Disable grouping by mobility if the number of pages in the
3841 * system is too low to allow the mechanism to work. It would be 3843 * system is too low to allow the mechanism to work. It would be
3842 * more accurate, but expensive to check per-zone. This check is 3844 * more accurate, but expensive to check per-zone. This check is
3843 * made on memory-hotadd so a system can start with mobility 3845 * made on memory-hotadd so a system can start with mobility
3844 * disabled and enable it later 3846 * disabled and enable it later
3845 */ 3847 */
3846 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3848 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3847 page_group_by_mobility_disabled = 1; 3849 page_group_by_mobility_disabled = 1;
3848 else 3850 else
3849 page_group_by_mobility_disabled = 0; 3851 page_group_by_mobility_disabled = 0;
3850 3852
3851 printk("Built %i zonelists in %s order, mobility grouping %s. " 3853 printk("Built %i zonelists in %s order, mobility grouping %s. "
3852 "Total pages: %ld\n", 3854 "Total pages: %ld\n",
3853 nr_online_nodes, 3855 nr_online_nodes,
3854 zonelist_order_name[current_zonelist_order], 3856 zonelist_order_name[current_zonelist_order],
3855 page_group_by_mobility_disabled ? "off" : "on", 3857 page_group_by_mobility_disabled ? "off" : "on",
3856 vm_total_pages); 3858 vm_total_pages);
3857 #ifdef CONFIG_NUMA 3859 #ifdef CONFIG_NUMA
3858 printk("Policy zone: %s\n", zone_names[policy_zone]); 3860 printk("Policy zone: %s\n", zone_names[policy_zone]);
3859 #endif 3861 #endif
3860 } 3862 }
3861 3863
3862 /* 3864 /*
3863 * Helper functions to size the waitqueue hash table. 3865 * Helper functions to size the waitqueue hash table.
3864 * Essentially these want to choose hash table sizes sufficiently 3866 * Essentially these want to choose hash table sizes sufficiently
3865 * large so that collisions trying to wait on pages are rare. 3867 * large so that collisions trying to wait on pages are rare.
3866 * But in fact, the number of active page waitqueues on typical 3868 * But in fact, the number of active page waitqueues on typical
3867 * systems is ridiculously low, less than 200. So this is even 3869 * systems is ridiculously low, less than 200. So this is even
3868 * conservative, even though it seems large. 3870 * conservative, even though it seems large.
3869 * 3871 *
3870 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3872 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3871 * waitqueues, i.e. the size of the waitq table given the number of pages. 3873 * waitqueues, i.e. the size of the waitq table given the number of pages.
3872 */ 3874 */
3873 #define PAGES_PER_WAITQUEUE 256 3875 #define PAGES_PER_WAITQUEUE 256
3874 3876
3875 #ifndef CONFIG_MEMORY_HOTPLUG 3877 #ifndef CONFIG_MEMORY_HOTPLUG
3876 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3878 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3877 { 3879 {
3878 unsigned long size = 1; 3880 unsigned long size = 1;
3879 3881
3880 pages /= PAGES_PER_WAITQUEUE; 3882 pages /= PAGES_PER_WAITQUEUE;
3881 3883
3882 while (size < pages) 3884 while (size < pages)
3883 size <<= 1; 3885 size <<= 1;
3884 3886
3885 /* 3887 /*
3886 * Once we have dozens or even hundreds of threads sleeping 3888 * Once we have dozens or even hundreds of threads sleeping
3887 * on IO we've got bigger problems than wait queue collision. 3889 * on IO we've got bigger problems than wait queue collision.
3888 * Limit the size of the wait table to a reasonable size. 3890 * Limit the size of the wait table to a reasonable size.
3889 */ 3891 */
3890 size = min(size, 4096UL); 3892 size = min(size, 4096UL);
3891 3893
3892 return max(size, 4UL); 3894 return max(size, 4UL);
3893 } 3895 }
3894 #else 3896 #else
3895 /* 3897 /*
3896 * A zone's size might be changed by hot-add, so it is not possible to determine 3898 * A zone's size might be changed by hot-add, so it is not possible to determine
3897 * a suitable size for its wait_table. So we use the maximum size now. 3899 * a suitable size for its wait_table. So we use the maximum size now.
3898 * 3900 *
3899 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3901 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3900 * 3902 *
3901 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3903 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3902 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3904 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3903 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3905 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3904 * 3906 *
3905 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3907 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3906 * or more by the traditional way. (See above). It equals: 3908 * or more by the traditional way. (See above). It equals:
3907 * 3909 *
3908 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3910 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3909 * ia64(16K page size) : = ( 8G + 4M)byte. 3911 * ia64(16K page size) : = ( 8G + 4M)byte.
3910 * powerpc (64K page size) : = (32G +16M)byte. 3912 * powerpc (64K page size) : = (32G +16M)byte.
3911 */ 3913 */
3912 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3914 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3913 { 3915 {
3914 return 4096UL; 3916 return 4096UL;
3915 } 3917 }
3916 #endif 3918 #endif
3917 3919
3918 /* 3920 /*
3919 * This is an integer logarithm so that shifts can be used later 3921 * This is an integer logarithm so that shifts can be used later
3920 * to extract the more random high bits from the multiplicative 3922 * to extract the more random high bits from the multiplicative
3921 * hash function before the remainder is taken. 3923 * hash function before the remainder is taken.
3922 */ 3924 */
3923 static inline unsigned long wait_table_bits(unsigned long size) 3925 static inline unsigned long wait_table_bits(unsigned long size)
3924 { 3926 {
3925 return ffz(~size); 3927 return ffz(~size);
3926 } 3928 }
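[Editorial note] To make the sizing concrete, here is a userspace sketch of the two helpers above. ffz() is reimplemented with a simple loop that is assumed to match its behaviour for power-of-two sizes; a 1 GB zone of 4 KB pages (262144 pages) ends up with a 1024-entry table and 10 hash bits.

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* Sketch of wait_table_hash_nr_entries() for the !MEMORY_HOTPLUG case. */
static unsigned long table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

/* Stand-in for ffz(~size): the base-2 log of a power-of-two size. */
static unsigned long table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	unsigned long entries = table_entries(262144);	/* 1GB of 4KB pages */

	printf("%lu entries, %lu bits\n", entries, table_bits(entries));	/* 1024, 10 */
	return 0;
}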
3927 3929
3928 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3930 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3929 3931
3930 /* 3932 /*
3931 * Check if a pageblock contains reserved pages 3933 * Check if a pageblock contains reserved pages
3932 */ 3934 */
3933 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3935 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3934 { 3936 {
3935 unsigned long pfn; 3937 unsigned long pfn;
3936 3938
3937 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3939 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3938 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3940 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3939 return 1; 3941 return 1;
3940 } 3942 }
3941 return 0; 3943 return 0;
3942 } 3944 }
3943 3945
3944 /* 3946 /*
3945 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3947 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3946 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3948 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3947 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3949 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3948 * higher will lead to a bigger reserve which will get freed as contiguous 3950 * higher will lead to a bigger reserve which will get freed as contiguous
3949 * blocks as reclaim kicks in 3951 * blocks as reclaim kicks in
3950 */ 3952 */
3951 static void setup_zone_migrate_reserve(struct zone *zone) 3953 static void setup_zone_migrate_reserve(struct zone *zone)
3952 { 3954 {
3953 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3955 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3954 struct page *page; 3956 struct page *page;
3955 unsigned long block_migratetype; 3957 unsigned long block_migratetype;
3956 int reserve; 3958 int reserve;
3957 int old_reserve; 3959 int old_reserve;
3958 3960
3959 /* 3961 /*
3960 * Get the start pfn, end pfn and the number of blocks to reserve 3962 * Get the start pfn, end pfn and the number of blocks to reserve
3961 * We have to be careful to be aligned to pageblock_nr_pages to 3963 * We have to be careful to be aligned to pageblock_nr_pages to
3962 * make sure that we always check pfn_valid for the first page in 3964 * make sure that we always check pfn_valid for the first page in
3963 * the block. 3965 * the block.
3964 */ 3966 */
3965 start_pfn = zone->zone_start_pfn; 3967 start_pfn = zone->zone_start_pfn;
3966 end_pfn = zone_end_pfn(zone); 3968 end_pfn = zone_end_pfn(zone);
3967 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3969 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3968 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3970 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3969 pageblock_order; 3971 pageblock_order;
3970 3972
3971 /* 3973 /*
3972 * Reserve blocks are generally in place to help high-order atomic 3974 * Reserve blocks are generally in place to help high-order atomic
3973 * allocations that are short-lived. A min_free_kbytes value that 3975 * allocations that are short-lived. A min_free_kbytes value that
3974 * would result in more than 2 reserve blocks for atomic allocations 3976 * would result in more than 2 reserve blocks for atomic allocations
3975 * is assumed to be in place to help anti-fragmentation for the 3977 * is assumed to be in place to help anti-fragmentation for the
3976 * future allocation of hugepages at runtime. 3978 * future allocation of hugepages at runtime.
3977 */ 3979 */
3978 reserve = min(2, reserve); 3980 reserve = min(2, reserve);
3979 old_reserve = zone->nr_migrate_reserve_block; 3981 old_reserve = zone->nr_migrate_reserve_block;
3980 3982
3981 /* When memory hot-add, we almost always need to do nothing */ 3983 /* When memory hot-add, we almost always need to do nothing */
3982 if (reserve == old_reserve) 3984 if (reserve == old_reserve)
3983 return; 3985 return;
3984 zone->nr_migrate_reserve_block = reserve; 3986 zone->nr_migrate_reserve_block = reserve;
3985 3987
3986 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3988 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3987 if (!pfn_valid(pfn)) 3989 if (!pfn_valid(pfn))
3988 continue; 3990 continue;
3989 page = pfn_to_page(pfn); 3991 page = pfn_to_page(pfn);
3990 3992
3991 /* Watch out for overlapping nodes */ 3993 /* Watch out for overlapping nodes */
3992 if (page_to_nid(page) != zone_to_nid(zone)) 3994 if (page_to_nid(page) != zone_to_nid(zone))
3993 continue; 3995 continue;
3994 3996
3995 block_migratetype = get_pageblock_migratetype(page); 3997 block_migratetype = get_pageblock_migratetype(page);
3996 3998
3997 /* Only test what is necessary when the reserves are not met */ 3999 /* Only test what is necessary when the reserves are not met */
3998 if (reserve > 0) { 4000 if (reserve > 0) {
3999 /* 4001 /*
4000 * Blocks with reserved pages will never be freed, skip 4002 * Blocks with reserved pages will never be freed, skip
4001 * them. 4003 * them.
4002 */ 4004 */
4003 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 4005 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
4004 if (pageblock_is_reserved(pfn, block_end_pfn)) 4006 if (pageblock_is_reserved(pfn, block_end_pfn))
4005 continue; 4007 continue;
4006 4008
4007 /* If this block is reserved, account for it */ 4009 /* If this block is reserved, account for it */
4008 if (block_migratetype == MIGRATE_RESERVE) { 4010 if (block_migratetype == MIGRATE_RESERVE) {
4009 reserve--; 4011 reserve--;
4010 continue; 4012 continue;
4011 } 4013 }
4012 4014
4013 /* Suitable for reserving if this block is movable */ 4015 /* Suitable for reserving if this block is movable */
4014 if (block_migratetype == MIGRATE_MOVABLE) { 4016 if (block_migratetype == MIGRATE_MOVABLE) {
4015 set_pageblock_migratetype(page, 4017 set_pageblock_migratetype(page,
4016 MIGRATE_RESERVE); 4018 MIGRATE_RESERVE);
4017 move_freepages_block(zone, page, 4019 move_freepages_block(zone, page,
4018 MIGRATE_RESERVE); 4020 MIGRATE_RESERVE);
4019 reserve--; 4021 reserve--;
4020 continue; 4022 continue;
4021 } 4023 }
4022 } else if (!old_reserve) { 4024 } else if (!old_reserve) {
4023 /* 4025 /*
4024 * At boot time we don't need to scan the whole zone 4026 * At boot time we don't need to scan the whole zone
4025 * for turning off MIGRATE_RESERVE. 4027 * for turning off MIGRATE_RESERVE.
4026 */ 4028 */
4027 break; 4029 break;
4028 } 4030 }
4029 4031
4030 /* 4032 /*
4031 * If the reserve is met and this is a previous reserved block, 4033 * If the reserve is met and this is a previous reserved block,
4032 * take it back 4034 * take it back
4033 */ 4035 */
4034 if (block_migratetype == MIGRATE_RESERVE) { 4036 if (block_migratetype == MIGRATE_RESERVE) {
4035 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4037 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4036 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4038 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4037 } 4039 }
4038 } 4040 }
4039 } 4041 }
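[Editorial note] For illustration, the number of MIGRATE_RESERVE blocks computed at the top of the function can be reproduced in a few lines. The pageblock order of 9 and the 1400-page min watermark below are assumed example values, not taken from this file:

#include <stdio.h>

#define PAGEBLOCK_ORDER		9	/* assumed: 512-page pageblocks */
#define PAGEBLOCK_NR_PAGES	(1UL << PAGEBLOCK_ORDER)

/* Round up to a multiple of 'to', mirroring the roundup() used above. */
static unsigned long roundup_pages(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

int main(void)
{
	unsigned long min_wmark = 1400;	/* hypothetical min watermark in pages */
	long reserve;

	reserve = roundup_pages(min_wmark, PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;
	if (reserve > 2)	/* capped at two blocks for atomic allocations */
		reserve = 2;
	printf("%ld reserve pageblocks\n", reserve);	/* prints 2 */
	return 0;
}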
4040 4042
4041 /* 4043 /*
4042 * Initially all pages are reserved - free ones are freed 4044 * Initially all pages are reserved - free ones are freed
4043 * up by free_all_bootmem() once the early boot process is 4045 * up by free_all_bootmem() once the early boot process is
4044 * done. Non-atomic initialization, single-pass. 4046 * done. Non-atomic initialization, single-pass.
4045 */ 4047 */
4046 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4048 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4047 unsigned long start_pfn, enum memmap_context context) 4049 unsigned long start_pfn, enum memmap_context context)
4048 { 4050 {
4049 struct page *page; 4051 struct page *page;
4050 unsigned long end_pfn = start_pfn + size; 4052 unsigned long end_pfn = start_pfn + size;
4051 unsigned long pfn; 4053 unsigned long pfn;
4052 struct zone *z; 4054 struct zone *z;
4053 4055
4054 if (highest_memmap_pfn < end_pfn - 1) 4056 if (highest_memmap_pfn < end_pfn - 1)
4055 highest_memmap_pfn = end_pfn - 1; 4057 highest_memmap_pfn = end_pfn - 1;
4056 4058
4057 z = &NODE_DATA(nid)->node_zones[zone]; 4059 z = &NODE_DATA(nid)->node_zones[zone];
4058 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4060 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4059 /* 4061 /*
4060 * There can be holes in boot-time mem_map[]s 4062 * There can be holes in boot-time mem_map[]s
4061 * handed to this function. They do not 4063 * handed to this function. They do not
4062 * exist on hotplugged memory. 4064 * exist on hotplugged memory.
4063 */ 4065 */
4064 if (context == MEMMAP_EARLY) { 4066 if (context == MEMMAP_EARLY) {
4065 if (!early_pfn_valid(pfn)) 4067 if (!early_pfn_valid(pfn))
4066 continue; 4068 continue;
4067 if (!early_pfn_in_nid(pfn, nid)) 4069 if (!early_pfn_in_nid(pfn, nid))
4068 continue; 4070 continue;
4069 } 4071 }
4070 page = pfn_to_page(pfn); 4072 page = pfn_to_page(pfn);
4071 set_page_links(page, zone, nid, pfn); 4073 set_page_links(page, zone, nid, pfn);
4072 mminit_verify_page_links(page, zone, nid, pfn); 4074 mminit_verify_page_links(page, zone, nid, pfn);
4073 init_page_count(page); 4075 init_page_count(page);
4074 page_mapcount_reset(page); 4076 page_mapcount_reset(page);
4075 page_nid_reset_last(page); 4077 page_nid_reset_last(page);
4076 SetPageReserved(page); 4078 SetPageReserved(page);
4077 /* 4079 /*
4078 * Mark the block movable so that blocks are reserved for 4080 * Mark the block movable so that blocks are reserved for
4079 * movable at startup. This will force kernel allocations 4081 * movable at startup. This will force kernel allocations
4080 * to reserve their blocks rather than leaking throughout 4082 * to reserve their blocks rather than leaking throughout
4081 * the address space during boot when many long-lived 4083 * the address space during boot when many long-lived
4082 * kernel allocations are made. Later some blocks near 4084 * kernel allocations are made. Later some blocks near
4083 * the start are marked MIGRATE_RESERVE by 4085 * the start are marked MIGRATE_RESERVE by
4084 * setup_zone_migrate_reserve() 4086 * setup_zone_migrate_reserve()
4085 * 4087 *
4086 * The bitmap is created for the zone's valid pfn range, but the memmap 4088 * The bitmap is created for the zone's valid pfn range, but the memmap
4087 * can be created for invalid pages (for alignment). 4089 * can be created for invalid pages (for alignment).
4088 * Check here so that set_pageblock_migratetype() is not called on a 4090 * Check here so that set_pageblock_migratetype() is not called on a
4089 * pfn outside the zone. 4091 * pfn outside the zone.
4090 */ 4092 */
4091 if ((z->zone_start_pfn <= pfn) 4093 if ((z->zone_start_pfn <= pfn)
4092 && (pfn < zone_end_pfn(z)) 4094 && (pfn < zone_end_pfn(z))
4093 && !(pfn & (pageblock_nr_pages - 1))) 4095 && !(pfn & (pageblock_nr_pages - 1)))
4094 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4096 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4095 4097
4096 INIT_LIST_HEAD(&page->lru); 4098 INIT_LIST_HEAD(&page->lru);
4097 #ifdef WANT_PAGE_VIRTUAL 4099 #ifdef WANT_PAGE_VIRTUAL
4098 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4100 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4099 if (!is_highmem_idx(zone)) 4101 if (!is_highmem_idx(zone))
4100 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4102 set_page_address(page, __va(pfn << PAGE_SHIFT));
4101 #endif 4103 #endif
4102 } 4104 }
4103 } 4105 }
4104 4106
4105 static void __meminit zone_init_free_lists(struct zone *zone) 4107 static void __meminit zone_init_free_lists(struct zone *zone)
4106 { 4108 {
4107 int order, t; 4109 int order, t;
4108 for_each_migratetype_order(order, t) { 4110 for_each_migratetype_order(order, t) {
4109 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4111 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4110 zone->free_area[order].nr_free = 0; 4112 zone->free_area[order].nr_free = 0;
4111 } 4113 }
4112 } 4114 }
4113 4115
4114 #ifndef __HAVE_ARCH_MEMMAP_INIT 4116 #ifndef __HAVE_ARCH_MEMMAP_INIT
4115 #define memmap_init(size, nid, zone, start_pfn) \ 4117 #define memmap_init(size, nid, zone, start_pfn) \
4116 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4118 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4117 #endif 4119 #endif
4118 4120
4119 static int zone_batchsize(struct zone *zone) 4121 static int zone_batchsize(struct zone *zone)
4120 { 4122 {
4121 #ifdef CONFIG_MMU 4123 #ifdef CONFIG_MMU
4122 int batch; 4124 int batch;
4123 4125
4124 /* 4126 /*
4125 * The per-cpu-pages pools are set to around 1000th of the 4127 * The per-cpu-pages pools are set to around 1000th of the
4126 * size of the zone. But no more than 1/2 of a meg. 4128 * size of the zone. But no more than 1/2 of a meg.
4127 * 4129 *
4128 * OK, so we don't know how big the cache is. So guess. 4130 * OK, so we don't know how big the cache is. So guess.
4129 */ 4131 */
4130 batch = zone->managed_pages / 1024; 4132 batch = zone->managed_pages / 1024;
4131 if (batch * PAGE_SIZE > 512 * 1024) 4133 if (batch * PAGE_SIZE > 512 * 1024)
4132 batch = (512 * 1024) / PAGE_SIZE; 4134 batch = (512 * 1024) / PAGE_SIZE;
4133 batch /= 4; /* We effectively *= 4 below */ 4135 batch /= 4; /* We effectively *= 4 below */
4134 if (batch < 1) 4136 if (batch < 1)
4135 batch = 1; 4137 batch = 1;
4136 4138
4137 /* 4139 /*
4138 * Clamp the batch to a 2^n - 1 value. Having a power 4140 * Clamp the batch to a 2^n - 1 value. Having a power
4139 * of 2 value was found to be more likely to have 4141 * of 2 value was found to be more likely to have
4140 * suboptimal cache aliasing properties in some cases. 4142 * suboptimal cache aliasing properties in some cases.
4141 * 4143 *
4142 * For example if 2 tasks are alternately allocating 4144 * For example if 2 tasks are alternately allocating
4143 * batches of pages, one task can end up with a lot 4145 * batches of pages, one task can end up with a lot
4144 * of pages of one half of the possible page colors 4146 * of pages of one half of the possible page colors
4145 * and the other with pages of the other colors. 4147 * and the other with pages of the other colors.
4146 */ 4148 */
4147 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4149 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4148 4150
4149 return batch; 4151 return batch;
4150 4152
4151 #else 4153 #else
4152 /* The deferral and batching of frees should be suppressed under NOMMU 4154 /* The deferral and batching of frees should be suppressed under NOMMU
4153 * conditions. 4155 * conditions.
4154 * 4156 *
4155 * The problem is that NOMMU needs to be able to allocate large chunks 4157 * The problem is that NOMMU needs to be able to allocate large chunks
4156 * of contiguous memory as there's no hardware page translation to 4158 * of contiguous memory as there's no hardware page translation to
4157 * assemble apparent contiguous memory from discontiguous pages. 4159 * assemble apparent contiguous memory from discontiguous pages.
4158 * 4160 *
4159 * Queueing large contiguous runs of pages for batching, however, 4161 * Queueing large contiguous runs of pages for batching, however,
4160 * causes the pages to actually be freed in smaller chunks. As there 4162 * causes the pages to actually be freed in smaller chunks. As there
4161 * can be a significant delay between the individual batches being 4163 * can be a significant delay between the individual batches being
4162 * recycled, this leads to the once large chunks of space being 4164 * recycled, this leads to the once large chunks of space being
4163 * fragmented and becoming unavailable for high-order allocations. 4165 * fragmented and becoming unavailable for high-order allocations.
4164 */ 4166 */
4165 return 0; 4167 return 0;
4166 #endif 4168 #endif
4167 } 4169 }
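[Editorial note] A worked example of the batch calculation above, as a standalone sketch assuming 4 KB pages: a zone with 1048576 managed pages (about 4 GB) ends up with a per-cpu batch of 31.

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for this example */

/* Userspace stand-in for rounddown_pow_of_two(). */
static unsigned long rounddown_pow2(unsigned long x)
{
	unsigned long p = 1;

	while (p * 2 <= x)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned long managed_pages = 1048576;	/* hypothetical zone size */
	unsigned long batch = managed_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)	/* cap at 1/2 MB */
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;				/* effectively *= 4 later */
	if (batch < 1)
		batch = 1;
	batch = rounddown_pow2(batch + batch / 2) - 1;	/* 2^n - 1 */
	printf("batch = %lu\n", batch);		/* prints batch = 31 */
	return 0;
}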
4168 4170
4169 /* 4171 /*
4170 * pcp->high and pcp->batch values are related and dependent on one another: 4172 * pcp->high and pcp->batch values are related and dependent on one another:
4171 * ->batch must never be higher than ->high. 4173 * ->batch must never be higher than ->high.
4172 * The following function updates them in a safe manner without read side 4174 * The following function updates them in a safe manner without read side
4173 * locking. 4175 * locking.
4174 * 4176 *
4175 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4177 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4176 * those fields changing asynchronously (according to the above rule). 4178 * those fields changing asynchronously (according to the above rule).
4177 * 4179 *
4178 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 4180 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4179 * outside of boot time (or some other assurance that no concurrent updaters 4181 * outside of boot time (or some other assurance that no concurrent updaters
4180 * exist). 4182 * exist).
4181 */ 4183 */
4182 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4184 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4183 unsigned long batch) 4185 unsigned long batch)
4184 { 4186 {
4185 /* start with a fail safe value for batch */ 4187 /* start with a fail safe value for batch */
4186 pcp->batch = 1; 4188 pcp->batch = 1;
4187 smp_wmb(); 4189 smp_wmb();
4188 4190
4189 /* Update high, then batch, in order */ 4191 /* Update high, then batch, in order */
4190 pcp->high = high; 4192 pcp->high = high;
4191 smp_wmb(); 4193 smp_wmb();
4192 4194
4193 pcp->batch = batch; 4195 pcp->batch = batch;
4194 } 4196 }
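[Editorial note] The write ordering above can be mimicked in userspace with C11 release stores standing in for smp_wmb(). This is only a sketch of the write-side idea, that a reader should never observe batch > high, and not kernel code; the pcp values are arbitrary examples.

#include <stdatomic.h>
#include <stdio.h>

struct pcp_like {
	_Atomic unsigned long high;
	_Atomic unsigned long batch;
};

/* Drop batch to the fail-safe value 1, publish the new high, then the
 * new batch, so batch never exceeds high from a reader's point of view. */
static void pageset_update_sketch(struct pcp_like *pcp,
				  unsigned long high, unsigned long batch)
{
	atomic_store_explicit(&pcp->batch, 1, memory_order_release);
	atomic_store_explicit(&pcp->high, high, memory_order_release);
	atomic_store_explicit(&pcp->batch, batch, memory_order_release);
}

int main(void)
{
	struct pcp_like pcp = { 186, 31 };	/* e.g. batch 31, high 6*31 */

	pageset_update_sketch(&pcp, 36, 6);
	printf("high=%lu batch=%lu\n",
	       atomic_load(&pcp.high), atomic_load(&pcp.batch));
	return 0;
}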
4195 4197
4196 /* a companion to pageset_set_high() */ 4198 /* a companion to pageset_set_high() */
4197 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4199 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4198 { 4200 {
4199 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4201 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4200 } 4202 }
4201 4203
4202 static void pageset_init(struct per_cpu_pageset *p) 4204 static void pageset_init(struct per_cpu_pageset *p)
4203 { 4205 {
4204 struct per_cpu_pages *pcp; 4206 struct per_cpu_pages *pcp;
4205 int migratetype; 4207 int migratetype;
4206 4208
4207 memset(p, 0, sizeof(*p)); 4209 memset(p, 0, sizeof(*p));
4208 4210
4209 pcp = &p->pcp; 4211 pcp = &p->pcp;
4210 pcp->count = 0; 4212 pcp->count = 0;
4211 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4213 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4212 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4214 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4213 } 4215 }
4214 4216
4215 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4217 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4216 { 4218 {
4217 pageset_init(p); 4219 pageset_init(p);
4218 pageset_set_batch(p, batch); 4220 pageset_set_batch(p, batch);
4219 } 4221 }
4220 4222
4221 /* 4223 /*
4222 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4224 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4223 * to the value high for the pageset p. 4225 * to the value high for the pageset p.
4224 */ 4226 */
4225 static void pageset_set_high(struct per_cpu_pageset *p, 4227 static void pageset_set_high(struct per_cpu_pageset *p,
4226 unsigned long high) 4228 unsigned long high)
4227 { 4229 {
4228 unsigned long batch = max(1UL, high / 4); 4230 unsigned long batch = max(1UL, high / 4);
4229 if ((high / 4) > (PAGE_SHIFT * 8)) 4231 if ((high / 4) > (PAGE_SHIFT * 8))
4230 batch = PAGE_SHIFT * 8; 4232 batch = PAGE_SHIFT * 8;
4231 4233
4232 pageset_update(&p->pcp, high, batch); 4234 pageset_update(&p->pcp, high, batch);
4233 } 4235 }
4234 4236
4235 static void pageset_set_high_and_batch(struct zone *zone, 4237 static void pageset_set_high_and_batch(struct zone *zone,
4236 struct per_cpu_pageset *pcp) 4238 struct per_cpu_pageset *pcp)
4237 { 4239 {
4238 if (percpu_pagelist_fraction) 4240 if (percpu_pagelist_fraction)
4239 pageset_set_high(pcp, 4241 pageset_set_high(pcp,
4240 (zone->managed_pages / 4242 (zone->managed_pages /
4241 percpu_pagelist_fraction)); 4243 percpu_pagelist_fraction));
4242 else 4244 else
4243 pageset_set_batch(pcp, zone_batchsize(zone)); 4245 pageset_set_batch(pcp, zone_batchsize(zone));
4244 } 4246 }
4245 4247
4246 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4248 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4247 { 4249 {
4248 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4250 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4249 4251
4250 pageset_init(pcp); 4252 pageset_init(pcp);
4251 pageset_set_high_and_batch(zone, pcp); 4253 pageset_set_high_and_batch(zone, pcp);
4252 } 4254 }
4253 4255
4254 static void __meminit setup_zone_pageset(struct zone *zone) 4256 static void __meminit setup_zone_pageset(struct zone *zone)
4255 { 4257 {
4256 int cpu; 4258 int cpu;
4257 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4259 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4258 for_each_possible_cpu(cpu) 4260 for_each_possible_cpu(cpu)
4259 zone_pageset_init(zone, cpu); 4261 zone_pageset_init(zone, cpu);
4260 } 4262 }
4261 4263
4262 /* 4264 /*
4263 * Allocate per cpu pagesets and initialize them. 4265 * Allocate per cpu pagesets and initialize them.
4264 * Before this call only boot pagesets were available. 4266 * Before this call only boot pagesets were available.
4265 */ 4267 */
4266 void __init setup_per_cpu_pageset(void) 4268 void __init setup_per_cpu_pageset(void)
4267 { 4269 {
4268 struct zone *zone; 4270 struct zone *zone;
4269 4271
4270 for_each_populated_zone(zone) 4272 for_each_populated_zone(zone)
4271 setup_zone_pageset(zone); 4273 setup_zone_pageset(zone);
4272 } 4274 }
4273 4275
4274 static noinline __init_refok 4276 static noinline __init_refok
4275 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4277 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4276 { 4278 {
4277 int i; 4279 int i;
4278 struct pglist_data *pgdat = zone->zone_pgdat; 4280 struct pglist_data *pgdat = zone->zone_pgdat;
4279 size_t alloc_size; 4281 size_t alloc_size;
4280 4282
4281 /* 4283 /*
4282 * The per-page waitqueue mechanism uses hashed waitqueues 4284 * The per-page waitqueue mechanism uses hashed waitqueues
4283 * per zone. 4285 * per zone.
4284 */ 4286 */
4285 zone->wait_table_hash_nr_entries = 4287 zone->wait_table_hash_nr_entries =
4286 wait_table_hash_nr_entries(zone_size_pages); 4288 wait_table_hash_nr_entries(zone_size_pages);
4287 zone->wait_table_bits = 4289 zone->wait_table_bits =
4288 wait_table_bits(zone->wait_table_hash_nr_entries); 4290 wait_table_bits(zone->wait_table_hash_nr_entries);
4289 alloc_size = zone->wait_table_hash_nr_entries 4291 alloc_size = zone->wait_table_hash_nr_entries
4290 * sizeof(wait_queue_head_t); 4292 * sizeof(wait_queue_head_t);
4291 4293
4292 if (!slab_is_available()) { 4294 if (!slab_is_available()) {
4293 zone->wait_table = (wait_queue_head_t *) 4295 zone->wait_table = (wait_queue_head_t *)
4294 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4296 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4295 } else { 4297 } else {
4296 /* 4298 /*
4297 * This case means that a zone whose size was 0 gets new memory 4299 * This case means that a zone whose size was 0 gets new memory
4298 * via memory hot-add. 4300 * via memory hot-add.
4299 * But it may be the case that a new node was hot-added. In 4301 * But it may be the case that a new node was hot-added. In
4300 * this case vmalloc() will not be able to use this new node's 4302 * this case vmalloc() will not be able to use this new node's
4301 * memory - the wait_table must be initialized to use the new 4303 * memory - the wait_table must be initialized to use the new
4302 * node itself as well. 4304 * node itself as well.
4303 * To use this new node's memory, further consideration will be 4305 * To use this new node's memory, further consideration will be
4304 * necessary. 4306 * necessary.
4305 */ 4307 */
4306 zone->wait_table = vmalloc(alloc_size); 4308 zone->wait_table = vmalloc(alloc_size);
4307 } 4309 }
4308 if (!zone->wait_table) 4310 if (!zone->wait_table)
4309 return -ENOMEM; 4311 return -ENOMEM;
4310 4312
4311 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4313 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4312 init_waitqueue_head(zone->wait_table + i); 4314 init_waitqueue_head(zone->wait_table + i);
4313 4315
4314 return 0; 4316 return 0;
4315 } 4317 }
4316 4318
4317 static __meminit void zone_pcp_init(struct zone *zone) 4319 static __meminit void zone_pcp_init(struct zone *zone)
4318 { 4320 {
4319 /* 4321 /*
4320 * per cpu subsystem is not up at this point. The following code 4322 * per cpu subsystem is not up at this point. The following code
4321 * relies on the ability of the linker to provide the 4323 * relies on the ability of the linker to provide the
4322 * offset of a (static) per cpu variable into the per cpu area. 4324 * offset of a (static) per cpu variable into the per cpu area.
4323 */ 4325 */
4324 zone->pageset = &boot_pageset; 4326 zone->pageset = &boot_pageset;
4325 4327
4326 if (zone->present_pages) 4328 if (zone->present_pages)
4327 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4329 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4328 zone->name, zone->present_pages, 4330 zone->name, zone->present_pages,
4329 zone_batchsize(zone)); 4331 zone_batchsize(zone));
4330 } 4332 }
4331 4333
4332 int __meminit init_currently_empty_zone(struct zone *zone, 4334 int __meminit init_currently_empty_zone(struct zone *zone,
4333 unsigned long zone_start_pfn, 4335 unsigned long zone_start_pfn,
4334 unsigned long size, 4336 unsigned long size,
4335 enum memmap_context context) 4337 enum memmap_context context)
4336 { 4338 {
4337 struct pglist_data *pgdat = zone->zone_pgdat; 4339 struct pglist_data *pgdat = zone->zone_pgdat;
4338 int ret; 4340 int ret;
4339 ret = zone_wait_table_init(zone, size); 4341 ret = zone_wait_table_init(zone, size);
4340 if (ret) 4342 if (ret)
4341 return ret; 4343 return ret;
4342 pgdat->nr_zones = zone_idx(zone) + 1; 4344 pgdat->nr_zones = zone_idx(zone) + 1;
4343 4345
4344 zone->zone_start_pfn = zone_start_pfn; 4346 zone->zone_start_pfn = zone_start_pfn;
4345 4347
4346 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4348 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4347 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4349 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4348 pgdat->node_id, 4350 pgdat->node_id,
4349 (unsigned long)zone_idx(zone), 4351 (unsigned long)zone_idx(zone),
4350 zone_start_pfn, (zone_start_pfn + size)); 4352 zone_start_pfn, (zone_start_pfn + size));
4351 4353
4352 zone_init_free_lists(zone); 4354 zone_init_free_lists(zone);
4353 4355
4354 return 0; 4356 return 0;
4355 } 4357 }
4356 4358
4357 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4359 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4358 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4360 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4359 /* 4361 /*
4360 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4362 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4361 * Architectures may implement their own version but if add_active_range() 4363 * Architectures may implement their own version but if add_active_range()
4362 * was used and there are no special requirements, this is a convenient 4364 * was used and there are no special requirements, this is a convenient
4363 * alternative 4365 * alternative
4364 */ 4366 */
4365 int __meminit __early_pfn_to_nid(unsigned long pfn) 4367 int __meminit __early_pfn_to_nid(unsigned long pfn)
4366 { 4368 {
4367 unsigned long start_pfn, end_pfn; 4369 unsigned long start_pfn, end_pfn;
4368 int nid; 4370 int nid;
4369 /* 4371 /*
4370 * NOTE: The following SMP-unsafe globals are only used early in boot 4372 * NOTE: The following SMP-unsafe globals are only used early in boot
4371 * when the kernel is running single-threaded. 4373 * when the kernel is running single-threaded.
4372 */ 4374 */
4373 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4375 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4374 static int __meminitdata last_nid; 4376 static int __meminitdata last_nid;
4375 4377
4376 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4378 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4377 return last_nid; 4379 return last_nid;
4378 4380
4379 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4381 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4380 if (nid != -1) { 4382 if (nid != -1) {
4381 last_start_pfn = start_pfn; 4383 last_start_pfn = start_pfn;
4382 last_end_pfn = end_pfn; 4384 last_end_pfn = end_pfn;
4383 last_nid = nid; 4385 last_nid = nid;
4384 } 4386 }
4385 4387
4386 return nid; 4388 return nid;
4387 } 4389 }
4388 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4390 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4389 4391
4390 int __meminit early_pfn_to_nid(unsigned long pfn) 4392 int __meminit early_pfn_to_nid(unsigned long pfn)
4391 { 4393 {
4392 int nid; 4394 int nid;
4393 4395
4394 nid = __early_pfn_to_nid(pfn); 4396 nid = __early_pfn_to_nid(pfn);
4395 if (nid >= 0) 4397 if (nid >= 0)
4396 return nid; 4398 return nid;
4397 /* just returns 0 */ 4399 /* just returns 0 */
4398 return 0; 4400 return 0;
4399 } 4401 }
4400 4402
4401 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4403 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4402 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4404 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4403 { 4405 {
4404 int nid; 4406 int nid;
4405 4407
4406 nid = __early_pfn_to_nid(pfn); 4408 nid = __early_pfn_to_nid(pfn);
4407 if (nid >= 0 && nid != node) 4409 if (nid >= 0 && nid != node)
4408 return false; 4410 return false;
4409 return true; 4411 return true;
4410 } 4412 }
4411 #endif 4413 #endif
4412 4414
4413 /** 4415 /**
4414 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4416 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4415 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4417 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4416 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4418 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4417 * 4419 *
4418 * If an architecture guarantees that all ranges registered with 4420 * If an architecture guarantees that all ranges registered with
4419 * add_active_ranges() contain no holes and may be freed, 4421 * add_active_ranges() contain no holes and may be freed,
4420 * this function may be used instead of calling free_bootmem() manually. 4422 * this function may be used instead of calling free_bootmem() manually.
4421 */ 4423 */
4422 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4424 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4423 { 4425 {
4424 unsigned long start_pfn, end_pfn; 4426 unsigned long start_pfn, end_pfn;
4425 int i, this_nid; 4427 int i, this_nid;
4426 4428
4427 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4429 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4428 start_pfn = min(start_pfn, max_low_pfn); 4430 start_pfn = min(start_pfn, max_low_pfn);
4429 end_pfn = min(end_pfn, max_low_pfn); 4431 end_pfn = min(end_pfn, max_low_pfn);
4430 4432
4431 if (start_pfn < end_pfn) 4433 if (start_pfn < end_pfn)
4432 free_bootmem_node(NODE_DATA(this_nid), 4434 free_bootmem_node(NODE_DATA(this_nid),
4433 PFN_PHYS(start_pfn), 4435 PFN_PHYS(start_pfn),
4434 (end_pfn - start_pfn) << PAGE_SHIFT); 4436 (end_pfn - start_pfn) << PAGE_SHIFT);
4435 } 4437 }
4436 } 4438 }
4437 4439
4438 /** 4440 /**
4439 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4441 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4440 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4442 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4441 * 4443 *
4442 * If an architecture guarantees that all ranges registered with 4444 * If an architecture guarantees that all ranges registered with
4443 * add_active_ranges() contain no holes and may be freed, this 4445 * add_active_ranges() contain no holes and may be freed, this
4444 * function may be used instead of calling memory_present() manually. 4446 * function may be used instead of calling memory_present() manually.
4445 */ 4447 */
4446 void __init sparse_memory_present_with_active_regions(int nid) 4448 void __init sparse_memory_present_with_active_regions(int nid)
4447 { 4449 {
4448 unsigned long start_pfn, end_pfn; 4450 unsigned long start_pfn, end_pfn;
4449 int i, this_nid; 4451 int i, this_nid;
4450 4452
4451 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4453 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4452 memory_present(this_nid, start_pfn, end_pfn); 4454 memory_present(this_nid, start_pfn, end_pfn);
4453 } 4455 }
4454 4456
4455 /** 4457 /**
4456 * get_pfn_range_for_nid - Return the start and end page frames for a node 4458 * get_pfn_range_for_nid - Return the start and end page frames for a node
4457 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4459 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4458 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4460 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4459 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4461 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4460 * 4462 *
4461 * It returns the start and end page frame of a node based on information 4463 * It returns the start and end page frame of a node based on information
4462 * provided by an arch calling add_active_range(). If called for a node 4464 * provided by an arch calling add_active_range(). If called for a node
4463 * with no available memory, a warning is printed and the start and end 4465 * with no available memory, a warning is printed and the start and end
4464 * PFNs will be 0. 4466 * PFNs will be 0.
4465 */ 4467 */
4466 void __meminit get_pfn_range_for_nid(unsigned int nid, 4468 void __meminit get_pfn_range_for_nid(unsigned int nid,
4467 unsigned long *start_pfn, unsigned long *end_pfn) 4469 unsigned long *start_pfn, unsigned long *end_pfn)
4468 { 4470 {
4469 unsigned long this_start_pfn, this_end_pfn; 4471 unsigned long this_start_pfn, this_end_pfn;
4470 int i; 4472 int i;
4471 4473
4472 *start_pfn = -1UL; 4474 *start_pfn = -1UL;
4473 *end_pfn = 0; 4475 *end_pfn = 0;
4474 4476
4475 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4477 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4476 *start_pfn = min(*start_pfn, this_start_pfn); 4478 *start_pfn = min(*start_pfn, this_start_pfn);
4477 *end_pfn = max(*end_pfn, this_end_pfn); 4479 *end_pfn = max(*end_pfn, this_end_pfn);
4478 } 4480 }
4479 4481
4480 if (*start_pfn == -1UL) 4482 if (*start_pfn == -1UL)
4481 *start_pfn = 0; 4483 *start_pfn = 0;
4482 } 4484 }
4483 4485
4484 /* 4486 /*
4485 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4487 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4486 * assumption is made that zones within a node are ordered in monotonically 4488 * assumption is made that zones within a node are ordered in monotonically
4487 * increasing memory addresses so that the "highest" populated zone is used 4489 * increasing memory addresses so that the "highest" populated zone is used
4488 */ 4490 */
4489 static void __init find_usable_zone_for_movable(void) 4491 static void __init find_usable_zone_for_movable(void)
4490 { 4492 {
4491 int zone_index; 4493 int zone_index;
4492 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4494 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4493 if (zone_index == ZONE_MOVABLE) 4495 if (zone_index == ZONE_MOVABLE)
4494 continue; 4496 continue;
4495 4497
4496 if (arch_zone_highest_possible_pfn[zone_index] > 4498 if (arch_zone_highest_possible_pfn[zone_index] >
4497 arch_zone_lowest_possible_pfn[zone_index]) 4499 arch_zone_lowest_possible_pfn[zone_index])
4498 break; 4500 break;
4499 } 4501 }
4500 4502
4501 VM_BUG_ON(zone_index == -1); 4503 VM_BUG_ON(zone_index == -1);
4502 movable_zone = zone_index; 4504 movable_zone = zone_index;
4503 } 4505 }
4504 4506
4505 /* 4507 /*
4506 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4508 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4507 * because it is sized independent of architecture. Unlike the other zones, 4509 * because it is sized independent of architecture. Unlike the other zones,
4508 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4510 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4509 * in each node depending on the size of each node and how evenly kernelcore 4511 * in each node depending on the size of each node and how evenly kernelcore
4510 * is distributed. This helper function adjusts the zone ranges 4512 * is distributed. This helper function adjusts the zone ranges
4511 * provided by the architecture for a given node by using the end of the 4513 * provided by the architecture for a given node by using the end of the
4512 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4514 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4513 * zones within a node are in order of monotonically increasing memory addresses 4515 * zones within a node are in order of monotonically increasing memory addresses
4514 */ 4516 */
4515 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4517 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4516 unsigned long zone_type, 4518 unsigned long zone_type,
4517 unsigned long node_start_pfn, 4519 unsigned long node_start_pfn,
4518 unsigned long node_end_pfn, 4520 unsigned long node_end_pfn,
4519 unsigned long *zone_start_pfn, 4521 unsigned long *zone_start_pfn,
4520 unsigned long *zone_end_pfn) 4522 unsigned long *zone_end_pfn)
4521 { 4523 {
4522 /* Only adjust if ZONE_MOVABLE is on this node */ 4524 /* Only adjust if ZONE_MOVABLE is on this node */
4523 if (zone_movable_pfn[nid]) { 4525 if (zone_movable_pfn[nid]) {
4524 /* Size ZONE_MOVABLE */ 4526 /* Size ZONE_MOVABLE */
4525 if (zone_type == ZONE_MOVABLE) { 4527 if (zone_type == ZONE_MOVABLE) {
4526 *zone_start_pfn = zone_movable_pfn[nid]; 4528 *zone_start_pfn = zone_movable_pfn[nid];
4527 *zone_end_pfn = min(node_end_pfn, 4529 *zone_end_pfn = min(node_end_pfn,
4528 arch_zone_highest_possible_pfn[movable_zone]); 4530 arch_zone_highest_possible_pfn[movable_zone]);
4529 4531
4530 /* Adjust for ZONE_MOVABLE starting within this range */ 4532 /* Adjust for ZONE_MOVABLE starting within this range */
4531 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4533 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4532 *zone_end_pfn > zone_movable_pfn[nid]) { 4534 *zone_end_pfn > zone_movable_pfn[nid]) {
4533 *zone_end_pfn = zone_movable_pfn[nid]; 4535 *zone_end_pfn = zone_movable_pfn[nid];
4534 4536
4535 /* Check if this whole range is within ZONE_MOVABLE */ 4537 /* Check if this whole range is within ZONE_MOVABLE */
4536 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4538 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4537 *zone_start_pfn = *zone_end_pfn; 4539 *zone_start_pfn = *zone_end_pfn;
4538 } 4540 }
4539 } 4541 }
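/*
 * A worked example of the three cases above, using hypothetical PFNs:
 * suppose zone_movable_pfn[nid] == 0x40000 and the node (and the highest
 * usable zone) ends at 0x50000.
 *  - ZONE_MOVABLE itself becomes [0x40000, 0x50000)
 *  - a zone given as [0x20000, 0x50000) straddles the boundary, so its
 *    end is clipped to 0x40000
 *  - a zone given as [0x45000, 0x50000) lies entirely above the boundary,
 *    so it collapses to an empty range (start == end)
 */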
4540 4542
4541 /* 4543 /*
4542 * Return the number of pages a zone spans in a node, including holes 4544 * Return the number of pages a zone spans in a node, including holes
4543 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4545 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4544 */ 4546 */
4545 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4547 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4546 unsigned long zone_type, 4548 unsigned long zone_type,
4547 unsigned long node_start_pfn, 4549 unsigned long node_start_pfn,
4548 unsigned long node_end_pfn, 4550 unsigned long node_end_pfn,
4549 unsigned long *ignored) 4551 unsigned long *ignored)
4550 { 4552 {
4551 unsigned long zone_start_pfn, zone_end_pfn; 4553 unsigned long zone_start_pfn, zone_end_pfn;
4552 4554
4553 /* Get the start and end of the zone */ 4555 /* Get the start and end of the zone */
4554 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4556 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4555 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4557 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4556 adjust_zone_range_for_zone_movable(nid, zone_type, 4558 adjust_zone_range_for_zone_movable(nid, zone_type,
4557 node_start_pfn, node_end_pfn, 4559 node_start_pfn, node_end_pfn,
4558 &zone_start_pfn, &zone_end_pfn); 4560 &zone_start_pfn, &zone_end_pfn);
4559 4561
4560 /* Check that this node has pages within the zone's required range */ 4562 /* Check that this node has pages within the zone's required range */
4561 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4563 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4562 return 0; 4564 return 0;
4563 4565
4564 /* Move the zone boundaries inside the node if necessary */ 4566 /* Move the zone boundaries inside the node if necessary */
4565 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4567 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4566 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4568 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4567 4569
4568 /* Return the spanned pages */ 4570 /* Return the spanned pages */
4569 return zone_end_pfn - zone_start_pfn; 4571 return zone_end_pfn - zone_start_pfn;
4570 } 4572 }
4571 4573
4572 /* 4574 /*
4573 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4575 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4574 * then all holes in the requested range will be accounted for. 4576 * then all holes in the requested range will be accounted for.
4575 */ 4577 */
4576 unsigned long __meminit __absent_pages_in_range(int nid, 4578 unsigned long __meminit __absent_pages_in_range(int nid,
4577 unsigned long range_start_pfn, 4579 unsigned long range_start_pfn,
4578 unsigned long range_end_pfn) 4580 unsigned long range_end_pfn)
4579 { 4581 {
4580 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4582 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4581 unsigned long start_pfn, end_pfn; 4583 unsigned long start_pfn, end_pfn;
4582 int i; 4584 int i;
4583 4585
4584 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4586 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4585 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4587 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4586 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4588 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4587 nr_absent -= end_pfn - start_pfn; 4589 nr_absent -= end_pfn - start_pfn;
4588 } 4590 }
4589 return nr_absent; 4591 return nr_absent;
4590 } 4592 }
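/*
 * A worked example with hypothetical ranges: for the request
 * [0x1000, 0x5000), nr_absent starts at 0x4000. If memblock reports
 * [0x0000, 0x2000) and [0x3000, 0x6000), the clamped intersections
 * [0x1000, 0x2000) and [0x3000, 0x5000) remove 0x1000 + 0x2000 pages,
 * leaving nr_absent == 0x1000, exactly the hole [0x2000, 0x3000).
 */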
4591 4593
4592 /** 4594 /**
4593 * absent_pages_in_range - Return number of page frames in holes within a range 4595 * absent_pages_in_range - Return number of page frames in holes within a range
4594 * @start_pfn: The start PFN to start searching for holes 4596 * @start_pfn: The start PFN to start searching for holes
4595 * @end_pfn: The end PFN to stop searching for holes 4597 * @end_pfn: The end PFN to stop searching for holes
4596 * 4598 *
4597 * It returns the number of page frames in memory holes within a range. 4599 * It returns the number of page frames in memory holes within a range.
4598 */ 4600 */
4599 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4601 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4600 unsigned long end_pfn) 4602 unsigned long end_pfn)
4601 { 4603 {
4602 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4604 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4603 } 4605 }
4604 4606
4605 /* Return the number of page frames in holes in a zone on a node */ 4607 /* Return the number of page frames in holes in a zone on a node */
4606 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4608 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4607 unsigned long zone_type, 4609 unsigned long zone_type,
4608 unsigned long node_start_pfn, 4610 unsigned long node_start_pfn,
4609 unsigned long node_end_pfn, 4611 unsigned long node_end_pfn,
4610 unsigned long *ignored) 4612 unsigned long *ignored)
4611 { 4613 {
4612 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4614 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4613 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4615 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4614 unsigned long zone_start_pfn, zone_end_pfn; 4616 unsigned long zone_start_pfn, zone_end_pfn;
4615 4617
4616 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4618 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4617 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4619 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4618 4620
4619 adjust_zone_range_for_zone_movable(nid, zone_type, 4621 adjust_zone_range_for_zone_movable(nid, zone_type,
4620 node_start_pfn, node_end_pfn, 4622 node_start_pfn, node_end_pfn,
4621 &zone_start_pfn, &zone_end_pfn); 4623 &zone_start_pfn, &zone_end_pfn);
4622 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4624 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4623 } 4625 }
4624 4626
4625 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4627 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4626 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4628 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4627 unsigned long zone_type, 4629 unsigned long zone_type,
4628 unsigned long node_start_pfn, 4630 unsigned long node_start_pfn,
4629 unsigned long node_end_pfn, 4631 unsigned long node_end_pfn,
4630 unsigned long *zones_size) 4632 unsigned long *zones_size)
4631 { 4633 {
4632 return zones_size[zone_type]; 4634 return zones_size[zone_type];
4633 } 4635 }
4634 4636
4635 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4637 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4636 unsigned long zone_type, 4638 unsigned long zone_type,
4637 unsigned long node_start_pfn, 4639 unsigned long node_start_pfn,
4638 unsigned long node_end_pfn, 4640 unsigned long node_end_pfn,
4639 unsigned long *zholes_size) 4641 unsigned long *zholes_size)
4640 { 4642 {
4641 if (!zholes_size) 4643 if (!zholes_size)
4642 return 0; 4644 return 0;
4643 4645
4644 return zholes_size[zone_type]; 4646 return zholes_size[zone_type];
4645 } 4647 }
4646 4648
4647 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4649 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4648 4650
4649 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4651 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4650 unsigned long node_start_pfn, 4652 unsigned long node_start_pfn,
4651 unsigned long node_end_pfn, 4653 unsigned long node_end_pfn,
4652 unsigned long *zones_size, 4654 unsigned long *zones_size,
4653 unsigned long *zholes_size) 4655 unsigned long *zholes_size)
4654 { 4656 {
4655 unsigned long realtotalpages, totalpages = 0; 4657 unsigned long realtotalpages, totalpages = 0;
4656 enum zone_type i; 4658 enum zone_type i;
4657 4659
4658 for (i = 0; i < MAX_NR_ZONES; i++) 4660 for (i = 0; i < MAX_NR_ZONES; i++)
4659 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4661 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4660 node_start_pfn, 4662 node_start_pfn,
4661 node_end_pfn, 4663 node_end_pfn,
4662 zones_size); 4664 zones_size);
4663 pgdat->node_spanned_pages = totalpages; 4665 pgdat->node_spanned_pages = totalpages;
4664 4666
4665 realtotalpages = totalpages; 4667 realtotalpages = totalpages;
4666 for (i = 0; i < MAX_NR_ZONES; i++) 4668 for (i = 0; i < MAX_NR_ZONES; i++)
4667 realtotalpages -= 4669 realtotalpages -=
4668 zone_absent_pages_in_node(pgdat->node_id, i, 4670 zone_absent_pages_in_node(pgdat->node_id, i,
4669 node_start_pfn, node_end_pfn, 4671 node_start_pfn, node_end_pfn,
4670 zholes_size); 4672 zholes_size);
4671 pgdat->node_present_pages = realtotalpages; 4673 pgdat->node_present_pages = realtotalpages;
4672 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4674 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4673 realtotalpages); 4675 realtotalpages);
4674 } 4676 }
4675 4677
4676 #ifndef CONFIG_SPARSEMEM 4678 #ifndef CONFIG_SPARSEMEM
4677 /* 4679 /*
4678 * Calculate the size of the zone->blockflags rounded to an unsigned long 4680 * Calculate the size of the zone->blockflags rounded to an unsigned long
4679 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4681 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4680 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4682 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4681 * round what is now in bits up to the nearest long in bits, then return it in 4683 * round what is now in bits up to the nearest long in bits, then return it in
4682 * bytes. 4684 * bytes.
4683 */ 4685 */
4684 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4686 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4685 { 4687 {
4686 unsigned long usemapsize; 4688 unsigned long usemapsize;
4687 4689
4688 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4690 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4689 usemapsize = roundup(zonesize, pageblock_nr_pages); 4691 usemapsize = roundup(zonesize, pageblock_nr_pages);
4690 usemapsize = usemapsize >> pageblock_order; 4692 usemapsize = usemapsize >> pageblock_order;
4691 usemapsize *= NR_PAGEBLOCK_BITS; 4693 usemapsize *= NR_PAGEBLOCK_BITS;
4692 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4694 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4693 4695
4694 return usemapsize / 8; 4696 return usemapsize / 8;
4695 } 4697 }
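/*
 * A worked example of the arithmetic above, assuming pageblock_order == 9
 * (pageblock_nr_pages == 512), NR_PAGEBLOCK_BITS == 4 and 64-bit longs,
 * with hypothetical inputs zone_start_pfn == 100 and zonesize == 262144:
 *   zonesize += 100 & 511      -> 262244
 *   roundup(262244, 512)       -> 262656
 *   262656 >> 9                -> 513 pageblocks
 *   513 * 4                    -> 2052 bits
 *   roundup(2052, 64)          -> 2112 bits
 *   2112 / 8                   -> 264 bytes of pageblock_flags
 */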
4696 4698
4697 static void __init setup_usemap(struct pglist_data *pgdat, 4699 static void __init setup_usemap(struct pglist_data *pgdat,
4698 struct zone *zone, 4700 struct zone *zone,
4699 unsigned long zone_start_pfn, 4701 unsigned long zone_start_pfn,
4700 unsigned long zonesize) 4702 unsigned long zonesize)
4701 { 4703 {
4702 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4704 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4703 zone->pageblock_flags = NULL; 4705 zone->pageblock_flags = NULL;
4704 if (usemapsize) 4706 if (usemapsize)
4705 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4707 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4706 usemapsize); 4708 usemapsize);
4707 } 4709 }
4708 #else 4710 #else
4709 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4711 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4710 unsigned long zone_start_pfn, unsigned long zonesize) {} 4712 unsigned long zone_start_pfn, unsigned long zonesize) {}
4711 #endif /* CONFIG_SPARSEMEM */ 4713 #endif /* CONFIG_SPARSEMEM */
4712 4714
4713 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4715 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4714 4716
4715 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4717 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4716 void __paginginit set_pageblock_order(void) 4718 void __paginginit set_pageblock_order(void)
4717 { 4719 {
4718 unsigned int order; 4720 unsigned int order;
4719 4721
4720 /* Check that pageblock_nr_pages has not already been set up */ 4722 /* Check that pageblock_nr_pages has not already been set up */
4721 if (pageblock_order) 4723 if (pageblock_order)
4722 return; 4724 return;
4723 4725
4724 if (HPAGE_SHIFT > PAGE_SHIFT) 4726 if (HPAGE_SHIFT > PAGE_SHIFT)
4725 order = HUGETLB_PAGE_ORDER; 4727 order = HUGETLB_PAGE_ORDER;
4726 else 4728 else
4727 order = MAX_ORDER - 1; 4729 order = MAX_ORDER - 1;
4728 4730
4729 /* 4731 /*
4730 * Assume the largest contiguous order of interest is a huge page. 4732 * Assume the largest contiguous order of interest is a huge page.
4731 * This value may be variable depending on boot parameters on IA64 and 4733 * This value may be variable depending on boot parameters on IA64 and
4732 * powerpc. 4734 * powerpc.
4733 */ 4735 */
4734 pageblock_order = order; 4736 pageblock_order = order;
4735 } 4737 }
4736 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4738 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4737 4739
4738 /* 4740 /*
4739 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4741 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4740 * is unused as pageblock_order is set at compile-time. See 4742 * is unused as pageblock_order is set at compile-time. See
4741 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4743 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4742 * the kernel config 4744 * the kernel config
4743 */ 4745 */
4744 void __paginginit set_pageblock_order(void) 4746 void __paginginit set_pageblock_order(void)
4745 { 4747 {
4746 } 4748 }
4747 4749
4748 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4750 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4749 4751
4750 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4752 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4751 unsigned long present_pages) 4753 unsigned long present_pages)
4752 { 4754 {
4753 unsigned long pages = spanned_pages; 4755 unsigned long pages = spanned_pages;
4754 4756
4755 /* 4757 /*
4756 * Provide a more accurate estimation if there are holes within 4758 * Provide a more accurate estimation if there are holes within
4757 * the zone and SPARSEMEM is in use. If there are holes within the 4759 * the zone and SPARSEMEM is in use. If there are holes within the
4758 * zone, each populated memory region may cost us one or two extra 4760 * zone, each populated memory region may cost us one or two extra
4759 * memmap pages due to alignment because memmap pages for each 4761 * memmap pages due to alignment because memmap pages for each
4760 * populated region may not be naturally aligned on a page boundary. 4762 * populated region may not be naturally aligned on a page boundary.
4761 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4763 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4762 */ 4764 */
4763 if (spanned_pages > present_pages + (present_pages >> 4) && 4765 if (spanned_pages > present_pages + (present_pages >> 4) &&
4764 IS_ENABLED(CONFIG_SPARSEMEM)) 4766 IS_ENABLED(CONFIG_SPARSEMEM))
4765 pages = present_pages; 4767 pages = present_pages;
4766 4768
4767 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4769 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4768 } 4770 }
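/*
 * A worked example with hypothetical numbers, assuming 4KiB pages and
 * sizeof(struct page) == 64: a zone spanning 1,048,576 pages with
 * 1,000,000 present pages keeps pages == spanned_pages, because
 * 1,048,576 <= 1,000,000 + (1,000,000 >> 4) == 1,062,500. The memmap
 * then costs PAGE_ALIGN(1,048,576 * 64) >> PAGE_SHIFT == 16,384 pages.
 * Only when the holes exceed roughly 1/16 of present_pages (and
 * SPARSEMEM is enabled) does the estimate fall back to present_pages.
 */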
4769 4771
4770 /* 4772 /*
4771 * Set up the zone data structures: 4773 * Set up the zone data structures:
4772 * - mark all pages reserved 4774 * - mark all pages reserved
4773 * - mark all memory queues empty 4775 * - mark all memory queues empty
4774 * - clear the memory bitmaps 4776 * - clear the memory bitmaps
4775 * 4777 *
4776 * NOTE: pgdat should get zeroed by caller. 4778 * NOTE: pgdat should get zeroed by caller.
4777 */ 4779 */
4778 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4780 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4779 unsigned long node_start_pfn, unsigned long node_end_pfn, 4781 unsigned long node_start_pfn, unsigned long node_end_pfn,
4780 unsigned long *zones_size, unsigned long *zholes_size) 4782 unsigned long *zones_size, unsigned long *zholes_size)
4781 { 4783 {
4782 enum zone_type j; 4784 enum zone_type j;
4783 int nid = pgdat->node_id; 4785 int nid = pgdat->node_id;
4784 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4786 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4785 int ret; 4787 int ret;
4786 4788
4787 pgdat_resize_init(pgdat); 4789 pgdat_resize_init(pgdat);
4788 #ifdef CONFIG_NUMA_BALANCING 4790 #ifdef CONFIG_NUMA_BALANCING
4789 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4791 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4790 pgdat->numabalancing_migrate_nr_pages = 0; 4792 pgdat->numabalancing_migrate_nr_pages = 0;
4791 pgdat->numabalancing_migrate_next_window = jiffies; 4793 pgdat->numabalancing_migrate_next_window = jiffies;
4792 #endif 4794 #endif
4793 init_waitqueue_head(&pgdat->kswapd_wait); 4795 init_waitqueue_head(&pgdat->kswapd_wait);
4794 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4796 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4795 pgdat_page_cgroup_init(pgdat); 4797 pgdat_page_cgroup_init(pgdat);
4796 4798
4797 for (j = 0; j < MAX_NR_ZONES; j++) { 4799 for (j = 0; j < MAX_NR_ZONES; j++) {
4798 struct zone *zone = pgdat->node_zones + j; 4800 struct zone *zone = pgdat->node_zones + j;
4799 unsigned long size, realsize, freesize, memmap_pages; 4801 unsigned long size, realsize, freesize, memmap_pages;
4800 4802
4801 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4803 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4802 node_end_pfn, zones_size); 4804 node_end_pfn, zones_size);
4803 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4805 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4804 node_start_pfn, 4806 node_start_pfn,
4805 node_end_pfn, 4807 node_end_pfn,
4806 zholes_size); 4808 zholes_size);
4807 4809
4808 /* 4810 /*
4809 * Adjust freesize so that it accounts for how much memory 4811 * Adjust freesize so that it accounts for how much memory
4810 * is used by this zone for memmap. This affects the watermark 4812 * is used by this zone for memmap. This affects the watermark
4811 * and per-cpu initialisations 4813 * and per-cpu initialisations
4812 */ 4814 */
4813 memmap_pages = calc_memmap_size(size, realsize); 4815 memmap_pages = calc_memmap_size(size, realsize);
4814 if (freesize >= memmap_pages) { 4816 if (freesize >= memmap_pages) {
4815 freesize -= memmap_pages; 4817 freesize -= memmap_pages;
4816 if (memmap_pages) 4818 if (memmap_pages)
4817 printk(KERN_DEBUG 4819 printk(KERN_DEBUG
4818 " %s zone: %lu pages used for memmap\n", 4820 " %s zone: %lu pages used for memmap\n",
4819 zone_names[j], memmap_pages); 4821 zone_names[j], memmap_pages);
4820 } else 4822 } else
4821 printk(KERN_WARNING 4823 printk(KERN_WARNING
4822 " %s zone: %lu pages exceeds freesize %lu\n", 4824 " %s zone: %lu pages exceeds freesize %lu\n",
4823 zone_names[j], memmap_pages, freesize); 4825 zone_names[j], memmap_pages, freesize);
4824 4826
4825 /* Account for reserved pages */ 4827 /* Account for reserved pages */
4826 if (j == 0 && freesize > dma_reserve) { 4828 if (j == 0 && freesize > dma_reserve) {
4827 freesize -= dma_reserve; 4829 freesize -= dma_reserve;
4828 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4830 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4829 zone_names[0], dma_reserve); 4831 zone_names[0], dma_reserve);
4830 } 4832 }
4831 4833
4832 if (!is_highmem_idx(j)) 4834 if (!is_highmem_idx(j))
4833 nr_kernel_pages += freesize; 4835 nr_kernel_pages += freesize;
4834 /* Charge for highmem memmap if there are enough kernel pages */ 4836 /* Charge for highmem memmap if there are enough kernel pages */
4835 else if (nr_kernel_pages > memmap_pages * 2) 4837 else if (nr_kernel_pages > memmap_pages * 2)
4836 nr_kernel_pages -= memmap_pages; 4838 nr_kernel_pages -= memmap_pages;
4837 nr_all_pages += freesize; 4839 nr_all_pages += freesize;
4838 4840
4839 zone->spanned_pages = size; 4841 zone->spanned_pages = size;
4840 zone->present_pages = realsize; 4842 zone->present_pages = realsize;
4841 /* 4843 /*
4842 * Set an approximate value for lowmem here, it will be adjusted 4844 * Set an approximate value for lowmem here, it will be adjusted
4843 * when the bootmem allocator frees pages into the buddy system. 4845 * when the bootmem allocator frees pages into the buddy system.
4844 * And all highmem pages will be managed by the buddy system. 4846 * And all highmem pages will be managed by the buddy system.
4845 */ 4847 */
4846 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4848 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4847 #ifdef CONFIG_NUMA 4849 #ifdef CONFIG_NUMA
4848 zone->node = nid; 4850 zone->node = nid;
4849 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4851 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4850 / 100; 4852 / 100;
4851 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4853 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4852 #endif 4854 #endif
4853 zone->name = zone_names[j]; 4855 zone->name = zone_names[j];
4854 spin_lock_init(&zone->lock); 4856 spin_lock_init(&zone->lock);
4855 spin_lock_init(&zone->lru_lock); 4857 spin_lock_init(&zone->lru_lock);
4856 zone_seqlock_init(zone); 4858 zone_seqlock_init(zone);
4857 zone->zone_pgdat = pgdat; 4859 zone->zone_pgdat = pgdat;
4858 zone_pcp_init(zone); 4860 zone_pcp_init(zone);
4859 4861
4860 /* For bootup, initialized properly in watermark setup */ 4862 /* For bootup, initialized properly in watermark setup */
4861 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4863 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4862 4864
4863 lruvec_init(&zone->lruvec); 4865 lruvec_init(&zone->lruvec);
4864 if (!size) 4866 if (!size)
4865 continue; 4867 continue;
4866 4868
4867 set_pageblock_order(); 4869 set_pageblock_order();
4868 setup_usemap(pgdat, zone, zone_start_pfn, size); 4870 setup_usemap(pgdat, zone, zone_start_pfn, size);
4869 ret = init_currently_empty_zone(zone, zone_start_pfn, 4871 ret = init_currently_empty_zone(zone, zone_start_pfn,
4870 size, MEMMAP_EARLY); 4872 size, MEMMAP_EARLY);
4871 BUG_ON(ret); 4873 BUG_ON(ret);
4872 memmap_init(size, nid, j, zone_start_pfn); 4874 memmap_init(size, nid, j, zone_start_pfn);
4873 zone_start_pfn += size; 4875 zone_start_pfn += size;
4874 } 4876 }
4875 } 4877 }
4876 4878
4877 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4879 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4878 { 4880 {
4879 /* Skip empty nodes */ 4881 /* Skip empty nodes */
4880 if (!pgdat->node_spanned_pages) 4882 if (!pgdat->node_spanned_pages)
4881 return; 4883 return;
4882 4884
4883 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4885 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4884 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4886 /* ia64 gets its own node_mem_map, before this, without bootmem */
4885 if (!pgdat->node_mem_map) { 4887 if (!pgdat->node_mem_map) {
4886 unsigned long size, start, end; 4888 unsigned long size, start, end;
4887 struct page *map; 4889 struct page *map;
4888 4890
4889 /* 4891 /*
4890 * The zone's endpoints aren't required to be MAX_ORDER 4892 * The zone's endpoints aren't required to be MAX_ORDER
4891 * aligned but the node_mem_map endpoints must be in order 4893 * aligned but the node_mem_map endpoints must be in order
4892 * for the buddy allocator to function correctly. 4894 * for the buddy allocator to function correctly.
4893 */ 4895 */
4894 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4896 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4895 end = pgdat_end_pfn(pgdat); 4897 end = pgdat_end_pfn(pgdat);
4896 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4898 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4897 size = (end - start) * sizeof(struct page); 4899 size = (end - start) * sizeof(struct page);
4898 map = alloc_remap(pgdat->node_id, size); 4900 map = alloc_remap(pgdat->node_id, size);
4899 if (!map) 4901 if (!map)
4900 map = alloc_bootmem_node_nopanic(pgdat, size); 4902 map = alloc_bootmem_node_nopanic(pgdat, size);
4901 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4903 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4902 } 4904 }
4903 #ifndef CONFIG_NEED_MULTIPLE_NODES 4905 #ifndef CONFIG_NEED_MULTIPLE_NODES
4904 /* 4906 /*
4905 * With no DISCONTIG, the global mem_map is just set as node 0's 4907 * With no DISCONTIG, the global mem_map is just set as node 0's
4906 */ 4908 */
4907 if (pgdat == NODE_DATA(0)) { 4909 if (pgdat == NODE_DATA(0)) {
4908 mem_map = NODE_DATA(0)->node_mem_map; 4910 mem_map = NODE_DATA(0)->node_mem_map;
4909 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4911 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4910 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4912 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4911 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4913 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4912 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4914 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4913 } 4915 }
4914 #endif 4916 #endif
4915 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4917 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4916 } 4918 }
4917 4919
4918 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4920 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4919 unsigned long node_start_pfn, unsigned long *zholes_size) 4921 unsigned long node_start_pfn, unsigned long *zholes_size)
4920 { 4922 {
4921 pg_data_t *pgdat = NODE_DATA(nid); 4923 pg_data_t *pgdat = NODE_DATA(nid);
4922 unsigned long start_pfn = 0; 4924 unsigned long start_pfn = 0;
4923 unsigned long end_pfn = 0; 4925 unsigned long end_pfn = 0;
4924 4926
4925 /* pg_data_t should be reset to zero when it's allocated */ 4927 /* pg_data_t should be reset to zero when it's allocated */
4926 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4928 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4927 4929
4928 pgdat->node_id = nid; 4930 pgdat->node_id = nid;
4929 pgdat->node_start_pfn = node_start_pfn; 4931 pgdat->node_start_pfn = node_start_pfn;
4930 if (node_state(nid, N_MEMORY)) 4932 if (node_state(nid, N_MEMORY))
4931 init_zone_allows_reclaim(nid); 4933 init_zone_allows_reclaim(nid);
4932 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4934 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4933 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4935 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4934 #endif 4936 #endif
4935 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4937 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4936 zones_size, zholes_size); 4938 zones_size, zholes_size);
4937 4939
4938 alloc_node_mem_map(pgdat); 4940 alloc_node_mem_map(pgdat);
4939 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4941 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4940 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4942 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4941 nid, (unsigned long)pgdat, 4943 nid, (unsigned long)pgdat,
4942 (unsigned long)pgdat->node_mem_map); 4944 (unsigned long)pgdat->node_mem_map);
4943 #endif 4945 #endif
4944 4946
4945 free_area_init_core(pgdat, start_pfn, end_pfn, 4947 free_area_init_core(pgdat, start_pfn, end_pfn,
4946 zones_size, zholes_size); 4948 zones_size, zholes_size);
4947 } 4949 }
4948 4950
4949 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4951 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4950 4952
4951 #if MAX_NUMNODES > 1 4953 #if MAX_NUMNODES > 1
4952 /* 4954 /*
4953 * Figure out the number of possible node ids. 4955 * Figure out the number of possible node ids.
4954 */ 4956 */
4955 void __init setup_nr_node_ids(void) 4957 void __init setup_nr_node_ids(void)
4956 { 4958 {
4957 unsigned int node; 4959 unsigned int node;
4958 unsigned int highest = 0; 4960 unsigned int highest = 0;
4959 4961
4960 for_each_node_mask(node, node_possible_map) 4962 for_each_node_mask(node, node_possible_map)
4961 highest = node; 4963 highest = node;
4962 nr_node_ids = highest + 1; 4964 nr_node_ids = highest + 1;
4963 } 4965 }
4964 #endif 4966 #endif
4965 4967
4966 /** 4968 /**
4967 * node_map_pfn_alignment - determine the maximum internode alignment 4969 * node_map_pfn_alignment - determine the maximum internode alignment
4968 * 4970 *
4969 * This function should be called after node map is populated and sorted. 4971 * This function should be called after node map is populated and sorted.
4970 * It calculates the maximum power of two alignment which can distinguish 4972 * It calculates the maximum power of two alignment which can distinguish
4971 * all the nodes. 4973 * all the nodes.
4972 * 4974 *
4973 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4975 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4974 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4976 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4975 * nodes are shifted by 256MiB, the result is 256MiB. Note that if only the last node is 4977 * nodes are shifted by 256MiB, the result is 256MiB. Note that if only the last node is
4976 * shifted, 1GiB is enough and this function will indicate so. 4978 * shifted, 1GiB is enough and this function will indicate so.
4977 * 4979 *
4978 * This is used to test whether pfn -> nid mapping of the chosen memory 4980 * This is used to test whether pfn -> nid mapping of the chosen memory
4979 * model has fine enough granularity to avoid incorrect mapping for the 4981 * model has fine enough granularity to avoid incorrect mapping for the
4980 * populated node map. 4982 * populated node map.
4981 * 4983 *
4982 * Returns the determined alignment in PFNs. 0 if there is no alignment 4984 * Returns the determined alignment in PFNs. 0 if there is no alignment
4983 * requirement (single node). 4985 * requirement (single node).
4984 */ 4986 */
4985 unsigned long __init node_map_pfn_alignment(void) 4987 unsigned long __init node_map_pfn_alignment(void)
4986 { 4988 {
4987 unsigned long accl_mask = 0, last_end = 0; 4989 unsigned long accl_mask = 0, last_end = 0;
4988 unsigned long start, end, mask; 4990 unsigned long start, end, mask;
4989 int last_nid = -1; 4991 int last_nid = -1;
4990 int i, nid; 4992 int i, nid;
4991 4993
4992 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4994 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4993 if (!start || last_nid < 0 || last_nid == nid) { 4995 if (!start || last_nid < 0 || last_nid == nid) {
4994 last_nid = nid; 4996 last_nid = nid;
4995 last_end = end; 4997 last_end = end;
4996 continue; 4998 continue;
4997 } 4999 }
4998 5000
4999 /* 5001 /*
5000 * Start with a mask granular enough to pin-point to the 5002 * Start with a mask granular enough to pin-point to the
5001 * start pfn and tick off bits one-by-one until it becomes 5003 * start pfn and tick off bits one-by-one until it becomes
5002 * too coarse to separate the current node from the last. 5004 * too coarse to separate the current node from the last.
5003 */ 5005 */
5004 mask = ~((1 << __ffs(start)) - 1); 5006 mask = ~((1 << __ffs(start)) - 1);
5005 while (mask && last_end <= (start & (mask << 1))) 5007 while (mask && last_end <= (start & (mask << 1)))
5006 mask <<= 1; 5008 mask <<= 1;
5007 5009
5008 /* accumulate all internode masks */ 5010 /* accumulate all internode masks */
5009 accl_mask |= mask; 5011 accl_mask |= mask;
5010 } 5012 }
5011 5013
5012 /* convert mask to number of pages */ 5014 /* convert mask to number of pages */
5013 return ~accl_mask + 1; 5015 return ~accl_mask + 1;
5014 } 5016 }
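/*
 * A worked example of the mask arithmetic above, with two hypothetical
 * 1GiB nodes on 4KiB pages: node 0 covers [0, 0x40000) and node 1 covers
 * [0x40000, 0x80000). At node 1's range, __ffs(0x40000) == 18, so
 * mask == ~0x3ffff; the while loop stops immediately because
 * last_end (0x40000) is greater than (0x40000 & (mask << 1)) == 0.
 * accl_mask becomes ~0x3ffff and the function returns 0x40000 pfns,
 * i.e. 1GiB alignment, matching the example in the comment block above.
 */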
5015 5017
5016 /* Find the lowest pfn for a node */ 5018 /* Find the lowest pfn for a node */
5017 static unsigned long __init find_min_pfn_for_node(int nid) 5019 static unsigned long __init find_min_pfn_for_node(int nid)
5018 { 5020 {
5019 unsigned long min_pfn = ULONG_MAX; 5021 unsigned long min_pfn = ULONG_MAX;
5020 unsigned long start_pfn; 5022 unsigned long start_pfn;
5021 int i; 5023 int i;
5022 5024
5023 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 5025 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
5024 min_pfn = min(min_pfn, start_pfn); 5026 min_pfn = min(min_pfn, start_pfn);
5025 5027
5026 if (min_pfn == ULONG_MAX) { 5028 if (min_pfn == ULONG_MAX) {
5027 printk(KERN_WARNING 5029 printk(KERN_WARNING
5028 "Could not find start_pfn for node %d\n", nid); 5030 "Could not find start_pfn for node %d\n", nid);
5029 return 0; 5031 return 0;
5030 } 5032 }
5031 5033
5032 return min_pfn; 5034 return min_pfn;
5033 } 5035 }
5034 5036
5035 /** 5037 /**
5036 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5038 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5037 * 5039 *
5038 * It returns the minimum PFN based on information provided via 5040 * It returns the minimum PFN based on information provided via
5039 * add_active_range(). 5041 * add_active_range().
5040 */ 5042 */
5041 unsigned long __init find_min_pfn_with_active_regions(void) 5043 unsigned long __init find_min_pfn_with_active_regions(void)
5042 { 5044 {
5043 return find_min_pfn_for_node(MAX_NUMNODES); 5045 return find_min_pfn_for_node(MAX_NUMNODES);
5044 } 5046 }
5045 5047
5046 /* 5048 /*
5047 * early_calculate_totalpages() 5049 * early_calculate_totalpages()
5048 * Sum pages in active regions for movable zone. 5050 * Sum pages in active regions for movable zone.
5049 * Populate N_MEMORY for calculating usable_nodes. 5051 * Populate N_MEMORY for calculating usable_nodes.
5050 */ 5052 */
5051 static unsigned long __init early_calculate_totalpages(void) 5053 static unsigned long __init early_calculate_totalpages(void)
5052 { 5054 {
5053 unsigned long totalpages = 0; 5055 unsigned long totalpages = 0;
5054 unsigned long start_pfn, end_pfn; 5056 unsigned long start_pfn, end_pfn;
5055 int i, nid; 5057 int i, nid;
5056 5058
5057 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5059 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5058 unsigned long pages = end_pfn - start_pfn; 5060 unsigned long pages = end_pfn - start_pfn;
5059 5061
5060 totalpages += pages; 5062 totalpages += pages;
5061 if (pages) 5063 if (pages)
5062 node_set_state(nid, N_MEMORY); 5064 node_set_state(nid, N_MEMORY);
5063 } 5065 }
5064 return totalpages; 5066 return totalpages;
5065 } 5067 }
5066 5068
5067 /* 5069 /*
5068 * Find the PFN the Movable zone begins in each node. Kernel memory 5070 * Find the PFN the Movable zone begins in each node. Kernel memory
5069 * is spread evenly between nodes as long as the nodes have enough 5071 * is spread evenly between nodes as long as the nodes have enough
5070 * memory. When they don't, some nodes will have more kernelcore than 5072 * memory. When they don't, some nodes will have more kernelcore than
5071 * others 5073 * others
5072 */ 5074 */
5073 static void __init find_zone_movable_pfns_for_nodes(void) 5075 static void __init find_zone_movable_pfns_for_nodes(void)
5074 { 5076 {
5075 int i, nid; 5077 int i, nid;
5076 unsigned long usable_startpfn; 5078 unsigned long usable_startpfn;
5077 unsigned long kernelcore_node, kernelcore_remaining; 5079 unsigned long kernelcore_node, kernelcore_remaining;
5078 /* save the state before borrowing the nodemask */ 5080 /* save the state before borrowing the nodemask */
5079 nodemask_t saved_node_state = node_states[N_MEMORY]; 5081 nodemask_t saved_node_state = node_states[N_MEMORY];
5080 unsigned long totalpages = early_calculate_totalpages(); 5082 unsigned long totalpages = early_calculate_totalpages();
5081 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5083 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5082 5084
5083 /* 5085 /*
5084 * If movablecore was specified, calculate what size of 5086 * If movablecore was specified, calculate what size of
5085 * kernelcore that corresponds so that memory usable for 5087 * kernelcore that corresponds so that memory usable for
5086 * any allocation type is evenly spread. If both kernelcore 5088 * any allocation type is evenly spread. If both kernelcore
5087 * and movablecore are specified, then the value of kernelcore 5089 * and movablecore are specified, then the value of kernelcore
5088 * will be used for required_kernelcore if it's greater than 5090 * will be used for required_kernelcore if it's greater than
5089 * what movablecore would have allowed. 5091 * what movablecore would have allowed.
5090 */ 5092 */
5091 if (required_movablecore) { 5093 if (required_movablecore) {
5092 unsigned long corepages; 5094 unsigned long corepages;
5093 5095
5094 /* 5096 /*
5095 * Round-up so that ZONE_MOVABLE is at least as large as what 5097 * Round-up so that ZONE_MOVABLE is at least as large as what
5096 * was requested by the user 5098 * was requested by the user
5097 */ 5099 */
5098 required_movablecore = 5100 required_movablecore =
5099 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5101 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5100 corepages = totalpages - required_movablecore; 5102 corepages = totalpages - required_movablecore;
5101 5103
5102 required_kernelcore = max(required_kernelcore, corepages); 5104 required_kernelcore = max(required_kernelcore, corepages);
5103 } 5105 }
5104 5106
5105 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5107 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5106 if (!required_kernelcore) 5108 if (!required_kernelcore)
5107 goto out; 5109 goto out;
5108 5110
5109 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5111 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5110 find_usable_zone_for_movable(); 5112 find_usable_zone_for_movable();
5111 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5113 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5112 5114
5113 restart: 5115 restart:
5114 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5116 /* Spread kernelcore memory as evenly as possible throughout nodes */
5115 kernelcore_node = required_kernelcore / usable_nodes; 5117 kernelcore_node = required_kernelcore / usable_nodes;
5116 for_each_node_state(nid, N_MEMORY) { 5118 for_each_node_state(nid, N_MEMORY) {
5117 unsigned long start_pfn, end_pfn; 5119 unsigned long start_pfn, end_pfn;
5118 5120
5119 /* 5121 /*
5120 * Recalculate kernelcore_node if the division per node 5122 * Recalculate kernelcore_node if the division per node
5121 * now exceeds what is necessary to satisfy the requested 5123 * now exceeds what is necessary to satisfy the requested
5122 * amount of memory for the kernel 5124 * amount of memory for the kernel
5123 */ 5125 */
5124 if (required_kernelcore < kernelcore_node) 5126 if (required_kernelcore < kernelcore_node)
5125 kernelcore_node = required_kernelcore / usable_nodes; 5127 kernelcore_node = required_kernelcore / usable_nodes;
5126 5128
5127 /* 5129 /*
5128 * As the map is walked, we track how much memory is usable 5130 * As the map is walked, we track how much memory is usable
5129 * by the kernel using kernelcore_remaining. When it is 5131 * by the kernel using kernelcore_remaining. When it is
5130 * 0, the rest of the node is usable by ZONE_MOVABLE 5132 * 0, the rest of the node is usable by ZONE_MOVABLE
5131 */ 5133 */
5132 kernelcore_remaining = kernelcore_node; 5134 kernelcore_remaining = kernelcore_node;
5133 5135
5134 /* Go through each range of PFNs within this node */ 5136 /* Go through each range of PFNs within this node */
5135 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5137 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5136 unsigned long size_pages; 5138 unsigned long size_pages;
5137 5139
5138 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5140 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5139 if (start_pfn >= end_pfn) 5141 if (start_pfn >= end_pfn)
5140 continue; 5142 continue;
5141 5143
5142 /* Account for what is only usable for kernelcore */ 5144 /* Account for what is only usable for kernelcore */
5143 if (start_pfn < usable_startpfn) { 5145 if (start_pfn < usable_startpfn) {
5144 unsigned long kernel_pages; 5146 unsigned long kernel_pages;
5145 kernel_pages = min(end_pfn, usable_startpfn) 5147 kernel_pages = min(end_pfn, usable_startpfn)
5146 - start_pfn; 5148 - start_pfn;
5147 5149
5148 kernelcore_remaining -= min(kernel_pages, 5150 kernelcore_remaining -= min(kernel_pages,
5149 kernelcore_remaining); 5151 kernelcore_remaining);
5150 required_kernelcore -= min(kernel_pages, 5152 required_kernelcore -= min(kernel_pages,
5151 required_kernelcore); 5153 required_kernelcore);
5152 5154
5153 /* Continue if range is now fully accounted */ 5155 /* Continue if range is now fully accounted */
5154 if (end_pfn <= usable_startpfn) { 5156 if (end_pfn <= usable_startpfn) {
5155 5157
5156 /* 5158 /*
5157 * Push zone_movable_pfn to the end so 5159 * Push zone_movable_pfn to the end so
5158 * that if we have to rebalance 5160 * that if we have to rebalance
5159 * kernelcore across nodes, we will 5161 * kernelcore across nodes, we will
5160 * not double account here 5162 * not double account here
5161 */ 5163 */
5162 zone_movable_pfn[nid] = end_pfn; 5164 zone_movable_pfn[nid] = end_pfn;
5163 continue; 5165 continue;
5164 } 5166 }
5165 start_pfn = usable_startpfn; 5167 start_pfn = usable_startpfn;
5166 } 5168 }
5167 5169
5168 /* 5170 /*
5169 * The usable PFN range for ZONE_MOVABLE is from 5171 * The usable PFN range for ZONE_MOVABLE is from
5170 * start_pfn->end_pfn. Calculate size_pages as the 5172 * start_pfn->end_pfn. Calculate size_pages as the
5171 * number of pages used as kernelcore 5173 * number of pages used as kernelcore
5172 */ 5174 */
5173 size_pages = end_pfn - start_pfn; 5175 size_pages = end_pfn - start_pfn;
5174 if (size_pages > kernelcore_remaining) 5176 if (size_pages > kernelcore_remaining)
5175 size_pages = kernelcore_remaining; 5177 size_pages = kernelcore_remaining;
5176 zone_movable_pfn[nid] = start_pfn + size_pages; 5178 zone_movable_pfn[nid] = start_pfn + size_pages;
5177 5179
5178 /* 5180 /*
5179 * Some kernelcore has been met, update counts and 5181 * Some kernelcore has been met, update counts and
5180 * break if the kernelcore for this node has been 5182 * break if the kernelcore for this node has been
5181 * satisfied 5183 * satisfied
5182 */ 5184 */
5183 required_kernelcore -= min(required_kernelcore, 5185 required_kernelcore -= min(required_kernelcore,
5184 size_pages); 5186 size_pages);
5185 kernelcore_remaining -= size_pages; 5187 kernelcore_remaining -= size_pages;
5186 if (!kernelcore_remaining) 5188 if (!kernelcore_remaining)
5187 break; 5189 break;
5188 } 5190 }
5189 } 5191 }
5190 5192
5191 /* 5193 /*
5192 * If there is still required_kernelcore, we do another pass with one 5194 * If there is still required_kernelcore, we do another pass with one
5193 * less node in the count. This will push zone_movable_pfn[nid] further 5195 * less node in the count. This will push zone_movable_pfn[nid] further
5194 * along on the nodes that still have memory until kernelcore is 5196 * along on the nodes that still have memory until kernelcore is
5195 * satisfied 5197 * satisfied
5196 */ 5198 */
5197 usable_nodes--; 5199 usable_nodes--;
5198 if (usable_nodes && required_kernelcore > usable_nodes) 5200 if (usable_nodes && required_kernelcore > usable_nodes)
5199 goto restart; 5201 goto restart;
5200 5202
5201 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5203 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5202 for (nid = 0; nid < MAX_NUMNODES; nid++) 5204 for (nid = 0; nid < MAX_NUMNODES; nid++)
5203 zone_movable_pfn[nid] = 5205 zone_movable_pfn[nid] =
5204 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5206 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5205 5207
5206 out: 5208 out:
5207 /* restore the node_state */ 5209 /* restore the node_state */
5208 node_states[N_MEMORY] = saved_node_state; 5210 node_states[N_MEMORY] = saved_node_state;
5209 } 5211 }
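/*
 * A simplified worked example with hypothetical values: two nodes with
 * memory, each a single 1,000,000-page range lying entirely above
 * usable_startpfn, and kernelcore= asking for 800,000 pages. The first
 * pass gives kernelcore_node == 400,000, so zone_movable_pfn[] for each
 * node ends up at its start_pfn + 400,000 (then rounded up to
 * MAX_ORDER_NR_PAGES), leaving roughly 600,000 pages per node for
 * ZONE_MOVABLE, and required_kernelcore reaches 0 without a restart.
 */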
5210 5212
5211 /* Any regular or high memory on that node? */ 5213 /* Any regular or high memory on that node? */
5212 static void check_for_memory(pg_data_t *pgdat, int nid) 5214 static void check_for_memory(pg_data_t *pgdat, int nid)
5213 { 5215 {
5214 enum zone_type zone_type; 5216 enum zone_type zone_type;
5215 5217
5216 if (N_MEMORY == N_NORMAL_MEMORY) 5218 if (N_MEMORY == N_NORMAL_MEMORY)
5217 return; 5219 return;
5218 5220
5219 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5221 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5220 struct zone *zone = &pgdat->node_zones[zone_type]; 5222 struct zone *zone = &pgdat->node_zones[zone_type];
5221 if (zone->present_pages) { 5223 if (zone->present_pages) {
5222 node_set_state(nid, N_HIGH_MEMORY); 5224 node_set_state(nid, N_HIGH_MEMORY);
5223 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5225 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5224 zone_type <= ZONE_NORMAL) 5226 zone_type <= ZONE_NORMAL)
5225 node_set_state(nid, N_NORMAL_MEMORY); 5227 node_set_state(nid, N_NORMAL_MEMORY);
5226 break; 5228 break;
5227 } 5229 }
5228 } 5230 }
5229 } 5231 }
5230 5232
5231 /** 5233 /**
5232 * free_area_init_nodes - Initialise all pg_data_t and zone data 5234 * free_area_init_nodes - Initialise all pg_data_t and zone data
5233 * @max_zone_pfn: an array of max PFNs for each zone 5235 * @max_zone_pfn: an array of max PFNs for each zone
5234 * 5236 *
5235 * This will call free_area_init_node() for each active node in the system. 5237 * This will call free_area_init_node() for each active node in the system.
5236 * Using the page ranges provided by add_active_range(), the size of each 5238 * Using the page ranges provided by add_active_range(), the size of each
5237 * zone in each node and of its holes is calculated. If the maximum PFNs of 5239 * zone in each node and of its holes is calculated. If the maximum PFNs of
5238 * two adjacent zones match, it is assumed that the higher zone is empty. 5240 * two adjacent zones match, it is assumed that the higher zone is empty.
5239 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5241 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5240 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5242 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5241 * starts where the previous one ended. For example, ZONE_DMA32 starts 5243 * starts where the previous one ended. For example, ZONE_DMA32 starts
5242 * at arch_max_dma_pfn. 5244 * at arch_max_dma_pfn.
5243 */ 5245 */
5244 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5246 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5245 { 5247 {
5246 unsigned long start_pfn, end_pfn; 5248 unsigned long start_pfn, end_pfn;
5247 int i, nid; 5249 int i, nid;
5248 5250
5249 /* Record where the zone boundaries are */ 5251 /* Record where the zone boundaries are */
5250 memset(arch_zone_lowest_possible_pfn, 0, 5252 memset(arch_zone_lowest_possible_pfn, 0,
5251 sizeof(arch_zone_lowest_possible_pfn)); 5253 sizeof(arch_zone_lowest_possible_pfn));
5252 memset(arch_zone_highest_possible_pfn, 0, 5254 memset(arch_zone_highest_possible_pfn, 0,
5253 sizeof(arch_zone_highest_possible_pfn)); 5255 sizeof(arch_zone_highest_possible_pfn));
5254 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5256 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5255 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5257 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5256 for (i = 1; i < MAX_NR_ZONES; i++) { 5258 for (i = 1; i < MAX_NR_ZONES; i++) {
5257 if (i == ZONE_MOVABLE) 5259 if (i == ZONE_MOVABLE)
5258 continue; 5260 continue;
5259 arch_zone_lowest_possible_pfn[i] = 5261 arch_zone_lowest_possible_pfn[i] =
5260 arch_zone_highest_possible_pfn[i-1]; 5262 arch_zone_highest_possible_pfn[i-1];
5261 arch_zone_highest_possible_pfn[i] = 5263 arch_zone_highest_possible_pfn[i] =
5262 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5264 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5263 } 5265 }
5264 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5266 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5265 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5267 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5266 5268
5267 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5269 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5268 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5270 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5269 find_zone_movable_pfns_for_nodes(); 5271 find_zone_movable_pfns_for_nodes();
5270 5272
5271 /* Print out the zone ranges */ 5273 /* Print out the zone ranges */
5272 printk("Zone ranges:\n"); 5274 printk("Zone ranges:\n");
5273 for (i = 0; i < MAX_NR_ZONES; i++) { 5275 for (i = 0; i < MAX_NR_ZONES; i++) {
5274 if (i == ZONE_MOVABLE) 5276 if (i == ZONE_MOVABLE)
5275 continue; 5277 continue;
5276 printk(KERN_CONT " %-8s ", zone_names[i]); 5278 printk(KERN_CONT " %-8s ", zone_names[i]);
5277 if (arch_zone_lowest_possible_pfn[i] == 5279 if (arch_zone_lowest_possible_pfn[i] ==
5278 arch_zone_highest_possible_pfn[i]) 5280 arch_zone_highest_possible_pfn[i])
5279 printk(KERN_CONT "empty\n"); 5281 printk(KERN_CONT "empty\n");
5280 else 5282 else
5281 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5283 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5282 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5284 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5283 (arch_zone_highest_possible_pfn[i] 5285 (arch_zone_highest_possible_pfn[i]
5284 << PAGE_SHIFT) - 1); 5286 << PAGE_SHIFT) - 1);
5285 } 5287 }
5286 5288
5287 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5289 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5288 printk("Movable zone start for each node\n"); 5290 printk("Movable zone start for each node\n");
5289 for (i = 0; i < MAX_NUMNODES; i++) { 5291 for (i = 0; i < MAX_NUMNODES; i++) {
5290 if (zone_movable_pfn[i]) 5292 if (zone_movable_pfn[i])
5291 printk(" Node %d: %#010lx\n", i, 5293 printk(" Node %d: %#010lx\n", i,
5292 zone_movable_pfn[i] << PAGE_SHIFT); 5294 zone_movable_pfn[i] << PAGE_SHIFT);
5293 } 5295 }
5294 5296
5295 /* Print out the early node map */ 5297 /* Print out the early node map */
5296 printk("Early memory node ranges\n"); 5298 printk("Early memory node ranges\n");
5297 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5299 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5298 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5300 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5299 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5301 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5300 5302
5301 /* Initialise every node */ 5303 /* Initialise every node */
5302 mminit_verify_pageflags_layout(); 5304 mminit_verify_pageflags_layout();
5303 setup_nr_node_ids(); 5305 setup_nr_node_ids();
5304 for_each_online_node(nid) { 5306 for_each_online_node(nid) {
5305 pg_data_t *pgdat = NODE_DATA(nid); 5307 pg_data_t *pgdat = NODE_DATA(nid);
5306 free_area_init_node(nid, NULL, 5308 free_area_init_node(nid, NULL,
5307 find_min_pfn_for_node(nid), NULL); 5309 find_min_pfn_for_node(nid), NULL);
5308 5310
5309 /* Any memory on that node */ 5311 /* Any memory on that node */
5310 if (pgdat->node_present_pages) 5312 if (pgdat->node_present_pages)
5311 node_set_state(nid, N_MEMORY); 5313 node_set_state(nid, N_MEMORY);
5312 check_for_memory(pgdat, nid); 5314 check_for_memory(pgdat, nid);
5313 } 5315 }
5314 } 5316 }
5315 5317
5316 static int __init cmdline_parse_core(char *p, unsigned long *core) 5318 static int __init cmdline_parse_core(char *p, unsigned long *core)
5317 { 5319 {
5318 unsigned long long coremem; 5320 unsigned long long coremem;
5319 if (!p) 5321 if (!p)
5320 return -EINVAL; 5322 return -EINVAL;
5321 5323
5322 coremem = memparse(p, &p); 5324 coremem = memparse(p, &p);
5323 *core = coremem >> PAGE_SHIFT; 5325 *core = coremem >> PAGE_SHIFT;
5324 5326
5325 /* Paranoid check that UL is enough for the coremem value */ 5327 /* Paranoid check that UL is enough for the coremem value */
5326 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5328 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5327 5329
5328 return 0; 5330 return 0;
5329 } 5331 }
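/*
 * A worked example, assuming 4KiB pages: booting with "kernelcore=512M"
 * makes memparse() return 536870912, so *core (required_kernelcore) is
 * set to 536870912 >> 12 == 131072 pages. The same parsing serves
 * "movablecore=" below.
 */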
5330 5332
5331 /* 5333 /*
5332 * kernelcore=size sets the amount of memory for use for allocations that 5334 * kernelcore=size sets the amount of memory for use for allocations that
5333 * cannot be reclaimed or migrated. 5335 * cannot be reclaimed or migrated.
5334 */ 5336 */
5335 static int __init cmdline_parse_kernelcore(char *p) 5337 static int __init cmdline_parse_kernelcore(char *p)
5336 { 5338 {
5337 return cmdline_parse_core(p, &required_kernelcore); 5339 return cmdline_parse_core(p, &required_kernelcore);
5338 } 5340 }
5339 5341
5340 /* 5342 /*
5341 * movablecore=size sets the amount of memory for use for allocations that 5343 * movablecore=size sets the amount of memory for use for allocations that
5342 * can be reclaimed or migrated. 5344 * can be reclaimed or migrated.
5343 */ 5345 */
5344 static int __init cmdline_parse_movablecore(char *p) 5346 static int __init cmdline_parse_movablecore(char *p)
5345 { 5347 {
5346 return cmdline_parse_core(p, &required_movablecore); 5348 return cmdline_parse_core(p, &required_movablecore);
5347 } 5349 }
5348 5350
5349 early_param("kernelcore", cmdline_parse_kernelcore); 5351 early_param("kernelcore", cmdline_parse_kernelcore);
5350 early_param("movablecore", cmdline_parse_movablecore); 5352 early_param("movablecore", cmdline_parse_movablecore);
5351 5353
5352 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5354 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
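For a concrete sense of what the kernelcore=/movablecore= hooks above consume: memparse() accepts a size string with an optional K/M/G suffix, and the result is shifted down to a page count. Below is a standalone userspace sketch of that conversion; parse_size() is a hypothetical stand-in for memparse(), SKETCH_PAGE_SHIFT assumes 4 KiB pages, and none of it is kernel code.

#include <stdio.h>
#include <stdlib.h>

#define SKETCH_PAGE_SHIFT 12	/* assumption: 4 KiB pages */

/* crude stand-in for memparse(): a number plus an optional K/M/G suffix */
static unsigned long long parse_size(const char *s)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': v <<= 10;	/* fall through */
	case 'M': case 'm': v <<= 10;	/* fall through */
	case 'K': case 'k': v <<= 10;
	}
	return v;
}

int main(void)
{
	unsigned long long bytes = parse_size("512M");

	/* 512 MiB / 4 KiB = 131072 pages requested as non-movable core */
	printf("required_kernelcore = %llu pages\n",
	       bytes >> SKETCH_PAGE_SHIFT);
	return 0;
}

So booting with kernelcore=512M asks for 131072 pages to be kept for allocations that cannot be reclaimed or migrated.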
5353 5355
5354 void adjust_managed_page_count(struct page *page, long count) 5356 void adjust_managed_page_count(struct page *page, long count)
5355 { 5357 {
5356 spin_lock(&managed_page_count_lock); 5358 spin_lock(&managed_page_count_lock);
5357 page_zone(page)->managed_pages += count; 5359 page_zone(page)->managed_pages += count;
5358 totalram_pages += count; 5360 totalram_pages += count;
5359 #ifdef CONFIG_HIGHMEM 5361 #ifdef CONFIG_HIGHMEM
5360 if (PageHighMem(page)) 5362 if (PageHighMem(page))
5361 totalhigh_pages += count; 5363 totalhigh_pages += count;
5362 #endif 5364 #endif
5363 spin_unlock(&managed_page_count_lock); 5365 spin_unlock(&managed_page_count_lock);
5364 } 5366 }
5365 EXPORT_SYMBOL(adjust_managed_page_count); 5367 EXPORT_SYMBOL(adjust_managed_page_count);
5366 5368
5367 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5369 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5368 { 5370 {
5369 void *pos; 5371 void *pos;
5370 unsigned long pages = 0; 5372 unsigned long pages = 0;
5371 5373
5372 start = (void *)PAGE_ALIGN((unsigned long)start); 5374 start = (void *)PAGE_ALIGN((unsigned long)start);
5373 end = (void *)((unsigned long)end & PAGE_MASK); 5375 end = (void *)((unsigned long)end & PAGE_MASK);
5374 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5376 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5375 if ((unsigned int)poison <= 0xFF) 5377 if ((unsigned int)poison <= 0xFF)
5376 memset(pos, poison, PAGE_SIZE); 5378 memset(pos, poison, PAGE_SIZE);
5377 free_reserved_page(virt_to_page(pos)); 5379 free_reserved_page(virt_to_page(pos));
5378 } 5380 }
5379 5381
5380 if (pages && s) 5382 if (pages && s)
5381 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5383 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5382 s, pages << (PAGE_SHIFT - 10), start, end); 5384 s, pages << (PAGE_SHIFT - 10), start, end);
5383 5385
5384 return pages; 5386 return pages;
5385 } 5387 }
5386 EXPORT_SYMBOL(free_reserved_area); 5388 EXPORT_SYMBOL(free_reserved_area);
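The alignment arithmetic in free_reserved_area() can be tried in isolation. The sketch below uses invented addresses and assumes 4 KiB pages (SK_PAGE_SIZE and the byte range are made up; free_reserved_page() itself is of course kernel-only); it shows how a byte range shrinks to whole pages and where the reported "K" figure comes from.

#include <stdio.h>

#define SK_PAGE_SIZE	4096UL			/* assumption: 4 KiB pages */
#define SK_PAGE_MASK	(~(SK_PAGE_SIZE - 1))
#define SK_PAGE_ALIGN(x) (((x) + SK_PAGE_SIZE - 1) & SK_PAGE_MASK)

int main(void)
{
	/* hypothetical byte range of a reserved region to be released */
	unsigned long start = 0x100123, end = 0x108f00;

	unsigned long first = SK_PAGE_ALIGN(start);	/* round start up */
	unsigned long last  = end & SK_PAGE_MASK;	/* round end down */
	unsigned long pages = (last - first) / SK_PAGE_SIZE;

	/* matches the "Freeing ... memory: %ldK" report: pages << (12 - 10) */
	printf("%lu pages, %luK freed\n", pages, pages << 2);
	return 0;
}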
5387 5389
5388 #ifdef CONFIG_HIGHMEM 5390 #ifdef CONFIG_HIGHMEM
5389 void free_highmem_page(struct page *page) 5391 void free_highmem_page(struct page *page)
5390 { 5392 {
5391 __free_reserved_page(page); 5393 __free_reserved_page(page);
5392 totalram_pages++; 5394 totalram_pages++;
5393 page_zone(page)->managed_pages++; 5395 page_zone(page)->managed_pages++;
5394 totalhigh_pages++; 5396 totalhigh_pages++;
5395 } 5397 }
5396 #endif 5398 #endif
5397 5399
5398 5400
5399 void __init mem_init_print_info(const char *str) 5401 void __init mem_init_print_info(const char *str)
5400 { 5402 {
5401 unsigned long physpages, codesize, datasize, rosize, bss_size; 5403 unsigned long physpages, codesize, datasize, rosize, bss_size;
5402 unsigned long init_code_size, init_data_size; 5404 unsigned long init_code_size, init_data_size;
5403 5405
5404 physpages = get_num_physpages(); 5406 physpages = get_num_physpages();
5405 codesize = _etext - _stext; 5407 codesize = _etext - _stext;
5406 datasize = _edata - _sdata; 5408 datasize = _edata - _sdata;
5407 rosize = __end_rodata - __start_rodata; 5409 rosize = __end_rodata - __start_rodata;
5408 bss_size = __bss_stop - __bss_start; 5410 bss_size = __bss_stop - __bss_start;
5409 init_data_size = __init_end - __init_begin; 5411 init_data_size = __init_end - __init_begin;
5410 init_code_size = _einittext - _sinittext; 5412 init_code_size = _einittext - _sinittext;
5411 5413
5412 /* 5414 /*
5413 * Detect special cases and adjust section sizes accordingly: 5415 * Detect special cases and adjust section sizes accordingly:
5414 * 1) .init.* may be embedded into .data sections 5416 * 1) .init.* may be embedded into .data sections
5415 * 2) .init.text.* may be out of [__init_begin, __init_end], 5417 * 2) .init.text.* may be out of [__init_begin, __init_end],
5416 * please refer to arch/tile/kernel/vmlinux.lds.S. 5418 * please refer to arch/tile/kernel/vmlinux.lds.S.
5417 * 3) .rodata.* may be embedded into .text or .data sections. 5419 * 3) .rodata.* may be embedded into .text or .data sections.
5418 */ 5420 */
5419 #define adj_init_size(start, end, size, pos, adj) \ 5421 #define adj_init_size(start, end, size, pos, adj) \
5420 do { \ 5422 do { \
5421 if (start <= pos && pos < end && size > adj) \ 5423 if (start <= pos && pos < end && size > adj) \
5422 size -= adj; \ 5424 size -= adj; \
5423 } while (0) 5425 } while (0)
5424 5426
5425 adj_init_size(__init_begin, __init_end, init_data_size, 5427 adj_init_size(__init_begin, __init_end, init_data_size,
5426 _sinittext, init_code_size); 5428 _sinittext, init_code_size);
5427 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5429 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5428 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5430 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5429 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5431 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5430 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5432 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5431 5433
5432 #undef adj_init_size 5434 #undef adj_init_size
5433 5435
5434 printk("Memory: %luK/%luK available " 5436 printk("Memory: %luK/%luK available "
5435 "(%luK kernel code, %luK rwdata, %luK rodata, " 5437 "(%luK kernel code, %luK rwdata, %luK rodata, "
5436 "%luK init, %luK bss, %luK reserved" 5438 "%luK init, %luK bss, %luK reserved"
5437 #ifdef CONFIG_HIGHMEM 5439 #ifdef CONFIG_HIGHMEM
5438 ", %luK highmem" 5440 ", %luK highmem"
5439 #endif 5441 #endif
5440 "%s%s)\n", 5442 "%s%s)\n",
5441 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5443 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5442 codesize >> 10, datasize >> 10, rosize >> 10, 5444 codesize >> 10, datasize >> 10, rosize >> 10,
5443 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5445 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5444 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5446 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5445 #ifdef CONFIG_HIGHMEM 5447 #ifdef CONFIG_HIGHMEM
5446 totalhigh_pages << (PAGE_SHIFT-10), 5448 totalhigh_pages << (PAGE_SHIFT-10),
5447 #endif 5449 #endif
5448 str ? ", " : "", str ? str : ""); 5450 str ? ", " : "", str ? str : "");
5449 } 5451 }
5450 5452
5451 /** 5453 /**
5452 * set_dma_reserve - set the specified number of pages reserved in the first zone 5454 * set_dma_reserve - set the specified number of pages reserved in the first zone
5453 * @new_dma_reserve: The number of pages to mark reserved 5455 * @new_dma_reserve: The number of pages to mark reserved
5454 * 5456 *
5455 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5457 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5456 * In the DMA zone, a significant percentage may be consumed by kernel image 5458 * In the DMA zone, a significant percentage may be consumed by kernel image
5457 * and other unfreeable allocations which can skew the watermarks badly. This 5459 * and other unfreeable allocations which can skew the watermarks badly. This
5458 * function may optionally be used to account for unfreeable pages in the 5460 * function may optionally be used to account for unfreeable pages in the
5459 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5461 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5460 * smaller per-cpu batchsize. 5462 * smaller per-cpu batchsize.
5461 */ 5463 */
5462 void __init set_dma_reserve(unsigned long new_dma_reserve) 5464 void __init set_dma_reserve(unsigned long new_dma_reserve)
5463 { 5465 {
5464 dma_reserve = new_dma_reserve; 5466 dma_reserve = new_dma_reserve;
5465 } 5467 }
5466 5468
5467 void __init free_area_init(unsigned long *zones_size) 5469 void __init free_area_init(unsigned long *zones_size)
5468 { 5470 {
5469 free_area_init_node(0, zones_size, 5471 free_area_init_node(0, zones_size,
5470 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5472 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5471 } 5473 }
5472 5474
5473 static int page_alloc_cpu_notify(struct notifier_block *self, 5475 static int page_alloc_cpu_notify(struct notifier_block *self,
5474 unsigned long action, void *hcpu) 5476 unsigned long action, void *hcpu)
5475 { 5477 {
5476 int cpu = (unsigned long)hcpu; 5478 int cpu = (unsigned long)hcpu;
5477 5479
5478 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5480 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5479 lru_add_drain_cpu(cpu); 5481 lru_add_drain_cpu(cpu);
5480 drain_pages(cpu); 5482 drain_pages(cpu);
5481 5483
5482 /* 5484 /*
5483 * Spill the event counters of the dead processor 5485 * Spill the event counters of the dead processor
5484 * into the current processor's event counters. 5486 * into the current processor's event counters.

5485 * This artificially elevates the count of the current 5487 * This artificially elevates the count of the current
5486 * processor. 5488 * processor.
5487 */ 5489 */
5488 vm_events_fold_cpu(cpu); 5490 vm_events_fold_cpu(cpu);
5489 5491
5490 /* 5492 /*
5491 * Zero the differential counters of the dead processor 5493 * Zero the differential counters of the dead processor
5492 * so that the vm statistics are consistent. 5494 * so that the vm statistics are consistent.
5493 * 5495 *
5494 * This is only okay since the processor is dead and cannot 5496 * This is only okay since the processor is dead and cannot
5495 * race with what we are doing. 5497 * race with what we are doing.
5496 */ 5498 */
5497 cpu_vm_stats_fold(cpu); 5499 cpu_vm_stats_fold(cpu);
5498 } 5500 }
5499 return NOTIFY_OK; 5501 return NOTIFY_OK;
5500 } 5502 }
5501 5503
5502 void __init page_alloc_init(void) 5504 void __init page_alloc_init(void)
5503 { 5505 {
5504 hotcpu_notifier(page_alloc_cpu_notify, 0); 5506 hotcpu_notifier(page_alloc_cpu_notify, 0);
5505 } 5507 }
5506 5508
5507 /* 5509 /*
5508 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 5510 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
5509 * or min_free_kbytes changes. 5511 * or min_free_kbytes changes.
5510 */ 5512 */
5511 static void calculate_totalreserve_pages(void) 5513 static void calculate_totalreserve_pages(void)
5512 { 5514 {
5513 struct pglist_data *pgdat; 5515 struct pglist_data *pgdat;
5514 unsigned long reserve_pages = 0; 5516 unsigned long reserve_pages = 0;
5515 enum zone_type i, j; 5517 enum zone_type i, j;
5516 5518
5517 for_each_online_pgdat(pgdat) { 5519 for_each_online_pgdat(pgdat) {
5518 for (i = 0; i < MAX_NR_ZONES; i++) { 5520 for (i = 0; i < MAX_NR_ZONES; i++) {
5519 struct zone *zone = pgdat->node_zones + i; 5521 struct zone *zone = pgdat->node_zones + i;
5520 unsigned long max = 0; 5522 unsigned long max = 0;
5521 5523
5522 /* Find valid and maximum lowmem_reserve in the zone */ 5524 /* Find valid and maximum lowmem_reserve in the zone */
5523 for (j = i; j < MAX_NR_ZONES; j++) { 5525 for (j = i; j < MAX_NR_ZONES; j++) {
5524 if (zone->lowmem_reserve[j] > max) 5526 if (zone->lowmem_reserve[j] > max)
5525 max = zone->lowmem_reserve[j]; 5527 max = zone->lowmem_reserve[j];
5526 } 5528 }
5527 5529
5528 /* we treat the high watermark as reserved pages. */ 5530 /* we treat the high watermark as reserved pages. */
5529 max += high_wmark_pages(zone); 5531 max += high_wmark_pages(zone);
5530 5532
5531 if (max > zone->managed_pages) 5533 if (max > zone->managed_pages)
5532 max = zone->managed_pages; 5534 max = zone->managed_pages;
5533 reserve_pages += max; 5535 reserve_pages += max;
5534 /* 5536 /*
5535 * Lowmem reserves are not available to 5537 * Lowmem reserves are not available to
5536 * GFP_HIGHUSER page cache allocations and 5538 * GFP_HIGHUSER page cache allocations and
5537 * kswapd tries to balance zones to their high 5539 * kswapd tries to balance zones to their high
5538 * watermark. As a result, neither should be 5540 * watermark. As a result, neither should be
5539 * regarded as dirtyable memory, to prevent a 5541 * regarded as dirtyable memory, to prevent a
5540 * situation where reclaim has to clean pages 5542 * situation where reclaim has to clean pages
5541 * in order to balance the zones. 5543 * in order to balance the zones.
5542 */ 5544 */
5543 zone->dirty_balance_reserve = max; 5545 zone->dirty_balance_reserve = max;
5544 } 5546 }
5545 } 5547 }
5546 dirty_balance_reserve = reserve_pages; 5548 dirty_balance_reserve = reserve_pages;
5547 totalreserve_pages = reserve_pages; 5549 totalreserve_pages = reserve_pages;
5548 } 5550 }
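As a worked example with purely illustrative numbers: a zone with managed_pages = 200000, a largest lowmem_reserve[] entry of 56250 and a high watermark of 3258 would contribute min(56250 + 3258, 200000) = 59508 pages to reserve_pages, and that same capped value becomes the zone's dirty_balance_reserve.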
5549 5551
5550 /* 5552 /*
5551 * setup_per_zone_lowmem_reserve - called whenever 5553 * setup_per_zone_lowmem_reserve - called whenever
5552 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 5554 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
5553 * has a correct pages reserved value, so an adequate number of 5555 * has a correct pages reserved value, so an adequate number of
5554 * pages are left in the zone after a successful __alloc_pages(). 5556 * pages are left in the zone after a successful __alloc_pages().
5555 */ 5557 */
5556 static void setup_per_zone_lowmem_reserve(void) 5558 static void setup_per_zone_lowmem_reserve(void)
5557 { 5559 {
5558 struct pglist_data *pgdat; 5560 struct pglist_data *pgdat;
5559 enum zone_type j, idx; 5561 enum zone_type j, idx;
5560 5562
5561 for_each_online_pgdat(pgdat) { 5563 for_each_online_pgdat(pgdat) {
5562 for (j = 0; j < MAX_NR_ZONES; j++) { 5564 for (j = 0; j < MAX_NR_ZONES; j++) {
5563 struct zone *zone = pgdat->node_zones + j; 5565 struct zone *zone = pgdat->node_zones + j;
5564 unsigned long managed_pages = zone->managed_pages; 5566 unsigned long managed_pages = zone->managed_pages;
5565 5567
5566 zone->lowmem_reserve[j] = 0; 5568 zone->lowmem_reserve[j] = 0;
5567 5569
5568 idx = j; 5570 idx = j;
5569 while (idx) { 5571 while (idx) {
5570 struct zone *lower_zone; 5572 struct zone *lower_zone;
5571 5573
5572 idx--; 5574 idx--;
5573 5575
5574 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5576 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5575 sysctl_lowmem_reserve_ratio[idx] = 1; 5577 sysctl_lowmem_reserve_ratio[idx] = 1;
5576 5578
5577 lower_zone = pgdat->node_zones + idx; 5579 lower_zone = pgdat->node_zones + idx;
5578 lower_zone->lowmem_reserve[j] = managed_pages / 5580 lower_zone->lowmem_reserve[j] = managed_pages /
5579 sysctl_lowmem_reserve_ratio[idx]; 5581 sysctl_lowmem_reserve_ratio[idx];
5580 managed_pages += lower_zone->managed_pages; 5582 managed_pages += lower_zone->managed_pages;
5581 } 5583 }
5582 } 5584 }
5583 } 5585 }
5584 5586
5585 /* update totalreserve_pages */ 5587 /* update totalreserve_pages */
5586 calculate_totalreserve_pages(); 5588 calculate_totalreserve_pages();
5587 } 5589 }
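To see how the accumulation above plays out, here is a small userspace sketch with made-up zone sizes and ratios loosely modelled on common defaults (256 for DMA, 32 for Normal); none of the numbers are authoritative and the array names are invented for the example.

#include <stdio.h>

/* illustrative zone layout: DMA, Normal, HighMem (sizes in pages) */
static const char *name[] = { "DMA", "Normal", "HighMem" };
static unsigned long managed[] = { 4000, 200000, 1800000 };
static unsigned long ratio[]   = { 256, 32, 0 };	/* 0: highmem unused */
static unsigned long reserve[3][3];

int main(void)
{
	int j, idx;

	for (j = 0; j < 3; j++) {
		unsigned long pages = managed[j];

		for (idx = j - 1; idx >= 0; idx--) {
			/* lower zone idx holds back pages/ratio[idx] against
			 * allocations that could have gone to zone j */
			reserve[idx][j] = pages / ratio[idx];
			pages += managed[idx];
		}
	}

	for (idx = 0; idx < 3; idx++)
		printf("%-8s reserve for HighMem requests: %6lu pages\n",
		       name[idx], reserve[idx][2]);
	return 0;
}

Because the walk keeps adding each zone's pages on the way down, a lower zone reserves against the combined size of everything above it, not just its immediate neighbour.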
5588 5590
5589 static void __setup_per_zone_wmarks(void) 5591 static void __setup_per_zone_wmarks(void)
5590 { 5592 {
5591 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5593 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5592 unsigned long lowmem_pages = 0; 5594 unsigned long lowmem_pages = 0;
5593 struct zone *zone; 5595 struct zone *zone;
5594 unsigned long flags; 5596 unsigned long flags;
5595 5597
5596 /* Calculate total number of !ZONE_HIGHMEM pages */ 5598 /* Calculate total number of !ZONE_HIGHMEM pages */
5597 for_each_zone(zone) { 5599 for_each_zone(zone) {
5598 if (!is_highmem(zone)) 5600 if (!is_highmem(zone))
5599 lowmem_pages += zone->managed_pages; 5601 lowmem_pages += zone->managed_pages;
5600 } 5602 }
5601 5603
5602 for_each_zone(zone) { 5604 for_each_zone(zone) {
5603 u64 tmp; 5605 u64 tmp;
5604 5606
5605 spin_lock_irqsave(&zone->lock, flags); 5607 spin_lock_irqsave(&zone->lock, flags);
5606 tmp = (u64)pages_min * zone->managed_pages; 5608 tmp = (u64)pages_min * zone->managed_pages;
5607 do_div(tmp, lowmem_pages); 5609 do_div(tmp, lowmem_pages);
5608 if (is_highmem(zone)) { 5610 if (is_highmem(zone)) {
5609 /* 5611 /*
5610 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5612 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5611 * need highmem pages, so cap pages_min to a small 5613 * need highmem pages, so cap pages_min to a small
5612 * value here. 5614 * value here.
5613 * 5615 *
5614 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5616 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5615 * deltas control async page reclaim, and so should 5617 * deltas control async page reclaim, and so should
5616 * not be capped for highmem. 5618 * not be capped for highmem.
5617 */ 5619 */
5618 unsigned long min_pages; 5620 unsigned long min_pages;
5619 5621
5620 min_pages = zone->managed_pages / 1024; 5622 min_pages = zone->managed_pages / 1024;
5621 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5623 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5622 zone->watermark[WMARK_MIN] = min_pages; 5624 zone->watermark[WMARK_MIN] = min_pages;
5623 } else { 5625 } else {
5624 /* 5626 /*
5625 * If it's a lowmem zone, reserve a number of pages 5627 * If it's a lowmem zone, reserve a number of pages
5626 * proportionate to the zone's size. 5628 * proportionate to the zone's size.
5627 */ 5629 */
5628 zone->watermark[WMARK_MIN] = tmp; 5630 zone->watermark[WMARK_MIN] = tmp;
5629 } 5631 }
5630 5632
5631 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5633 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5632 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5634 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5633 5635
5634 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5636 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5635 high_wmark_pages(zone) - 5637 high_wmark_pages(zone) -
5636 low_wmark_pages(zone) - 5638 low_wmark_pages(zone) -
5637 zone_page_state(zone, NR_ALLOC_BATCH)); 5639 zone_page_state(zone, NR_ALLOC_BATCH));
5638 5640
5639 setup_zone_migrate_reserve(zone); 5641 setup_zone_migrate_reserve(zone);
5640 spin_unlock_irqrestore(&zone->lock, flags); 5642 spin_unlock_irqrestore(&zone->lock, flags);
5641 } 5643 }
5642 5644
5643 /* update totalreserve_pages */ 5645 /* update totalreserve_pages */
5644 calculate_totalreserve_pages(); 5646 calculate_totalreserve_pages();
5645 } 5647 }
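A quick sketch of the resulting min/low/high spacing for a single lowmem zone, using illustrative numbers only (min_free_kbytes = 11584, roughly what the sizing rule further down produces for 8 GB of lowmem, and 4 KiB pages assumed):

#include <stdio.h>

int main(void)
{
	unsigned long min_free_kbytes = 11584;		/* illustrative     */
	unsigned long pages_min = min_free_kbytes >> (12 - 10);
	unsigned long lowmem_pages = 2000000;		/* all !highmem     */
	unsigned long zone_managed = 1500000;		/* this zone's part */

	/* the zone gets pages_min scaled by its share of lowmem */
	unsigned long long tmp = (unsigned long long)pages_min * zone_managed;
	tmp /= lowmem_pages;

	printf("WMARK_MIN  = %llu pages\n", tmp);
	printf("WMARK_LOW  = %llu pages\n", tmp + (tmp >> 2));	/* min + 25% */
	printf("WMARK_HIGH = %llu pages\n", tmp + (tmp >> 1));	/* min + 50% */
	return 0;
}

The low and high marks sit 25% and 50% of the zone's scaled minimum above WMARK_MIN, which is the band kswapd works within.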
5646 5648
5647 /** 5649 /**
5648 * setup_per_zone_wmarks - called when min_free_kbytes changes 5650 * setup_per_zone_wmarks - called when min_free_kbytes changes
5649 * or when memory is hot-{added|removed} 5651 * or when memory is hot-{added|removed}
5650 * 5652 *
5651 * Ensures that the watermark[min,low,high] values for each zone are set 5653 * Ensures that the watermark[min,low,high] values for each zone are set
5652 * correctly with respect to min_free_kbytes. 5654 * correctly with respect to min_free_kbytes.
5653 */ 5655 */
5654 void setup_per_zone_wmarks(void) 5656 void setup_per_zone_wmarks(void)
5655 { 5657 {
5656 mutex_lock(&zonelists_mutex); 5658 mutex_lock(&zonelists_mutex);
5657 __setup_per_zone_wmarks(); 5659 __setup_per_zone_wmarks();
5658 mutex_unlock(&zonelists_mutex); 5660 mutex_unlock(&zonelists_mutex);
5659 } 5661 }
5660 5662
5661 /* 5663 /*
5662 * The inactive anon list should be small enough that the VM never has to 5664 * The inactive anon list should be small enough that the VM never has to
5663 * do too much work, but large enough that each inactive page has a chance 5665 * do too much work, but large enough that each inactive page has a chance
5664 * to be referenced again before it is swapped out. 5666 * to be referenced again before it is swapped out.
5665 * 5667 *
5666 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5668 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5667 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5669 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5668 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5670 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5669 * the anonymous pages are kept on the inactive list. 5671 * the anonymous pages are kept on the inactive list.
5670 * 5672 *
5671 * total target max 5673 * total target max
5672 * memory ratio inactive anon 5674 * memory ratio inactive anon
5673 * ------------------------------------- 5675 * -------------------------------------
5674 * 10MB 1 5MB 5676 * 10MB 1 5MB
5675 * 100MB 1 50MB 5677 * 100MB 1 50MB
5676 * 1GB 3 250MB 5678 * 1GB 3 250MB
5677 * 10GB 10 0.9GB 5679 * 10GB 10 0.9GB
5678 * 100GB 31 3GB 5680 * 100GB 31 3GB
5679 * 1TB 101 10GB 5681 * 1TB 101 10GB
5680 * 10TB 320 32GB 5682 * 10TB 320 32GB
5681 */ 5683 */
5682 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5684 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5683 { 5685 {
5684 unsigned int gb, ratio; 5686 unsigned int gb, ratio;
5685 5687
5686 /* Zone size in gigabytes */ 5688 /* Zone size in gigabytes */
5687 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5689 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5688 if (gb) 5690 if (gb)
5689 ratio = int_sqrt(10 * gb); 5691 ratio = int_sqrt(10 * gb);
5690 else 5692 else
5691 ratio = 1; 5693 ratio = 1;
5692 5694
5693 zone->inactive_ratio = ratio; 5695 zone->inactive_ratio = ratio;
5694 } 5696 }
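The table in the comment above can be reproduced directly from the int_sqrt(10 * gb) formula. A small userspace sketch, where isqrt() is a naive stand-in for the kernel's int_sqrt():

#include <stdio.h>

/* naive integer square root, standing in for int_sqrt() */
static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long gb;

	/* zones below 1 GB would simply use ratio 1, as in the code above */
	for (gb = 1; gb <= 1024; gb <<= 1)
		printf("%5lu GB zone -> inactive_ratio %lu\n",
		       gb, isqrt(10 * gb));
	return 0;
}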
5695 5697
5696 static void __meminit setup_per_zone_inactive_ratio(void) 5698 static void __meminit setup_per_zone_inactive_ratio(void)
5697 { 5699 {
5698 struct zone *zone; 5700 struct zone *zone;
5699 5701
5700 for_each_zone(zone) 5702 for_each_zone(zone)
5701 calculate_zone_inactive_ratio(zone); 5703 calculate_zone_inactive_ratio(zone);
5702 } 5704 }
5703 5705
5704 /* 5706 /*
5705 * Initialise min_free_kbytes. 5707 * Initialise min_free_kbytes.
5706 * 5708 *
5707 * For small machines we want it small (128k min). For large machines 5709 * For small machines we want it small (128k min). For large machines
5708 * we want it large (64MB max). But it is not linear, because network 5710 * we want it large (64MB max). But it is not linear, because network
5709 * bandwidth does not increase linearly with machine size. We use 5711 * bandwidth does not increase linearly with machine size. We use
5710 * 5712 *
5711 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5713 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5712 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5714 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5713 * 5715 *
5714 * which yields 5716 * which yields
5715 * 5717 *
5716 * 16MB: 512k 5718 * 16MB: 512k
5717 * 32MB: 724k 5719 * 32MB: 724k
5718 * 64MB: 1024k 5720 * 64MB: 1024k
5719 * 128MB: 1448k 5721 * 128MB: 1448k
5720 * 256MB: 2048k 5722 * 256MB: 2048k
5721 * 512MB: 2896k 5723 * 512MB: 2896k
5722 * 1024MB: 4096k 5724 * 1024MB: 4096k
5723 * 2048MB: 5792k 5725 * 2048MB: 5792k
5724 * 4096MB: 8192k 5726 * 4096MB: 8192k
5725 * 8192MB: 11584k 5727 * 8192MB: 11584k
5726 * 16384MB: 16384k 5728 * 16384MB: 16384k
5727 */ 5729 */
5728 int __meminit init_per_zone_wmark_min(void) 5730 int __meminit init_per_zone_wmark_min(void)
5729 { 5731 {
5730 unsigned long lowmem_kbytes; 5732 unsigned long lowmem_kbytes;
5731 int new_min_free_kbytes; 5733 int new_min_free_kbytes;
5732 5734
5733 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5735 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5734 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5736 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5735 5737
5736 if (new_min_free_kbytes > user_min_free_kbytes) { 5738 if (new_min_free_kbytes > user_min_free_kbytes) {
5737 min_free_kbytes = new_min_free_kbytes; 5739 min_free_kbytes = new_min_free_kbytes;
5738 if (min_free_kbytes < 128) 5740 if (min_free_kbytes < 128)
5739 min_free_kbytes = 128; 5741 min_free_kbytes = 128;
5740 if (min_free_kbytes > 65536) 5742 if (min_free_kbytes > 65536)
5741 min_free_kbytes = 65536; 5743 min_free_kbytes = 65536;
5742 } else { 5744 } else {
5743 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5745 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5744 new_min_free_kbytes, user_min_free_kbytes); 5746 new_min_free_kbytes, user_min_free_kbytes);
5745 } 5747 }
5746 setup_per_zone_wmarks(); 5748 setup_per_zone_wmarks();
5747 refresh_zone_stat_thresholds(); 5749 refresh_zone_stat_thresholds();
5748 setup_per_zone_lowmem_reserve(); 5750 setup_per_zone_lowmem_reserve();
5749 setup_per_zone_inactive_ratio(); 5751 setup_per_zone_inactive_ratio();
5750 return 0; 5752 return 0;
5751 } 5753 }
5752 module_init(init_per_zone_wmark_min) 5754 module_init(init_per_zone_wmark_min)
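The same kind of sketch for the min_free_kbytes sizing rule documented above, including the 128k/64MB clamping; the 4 GB lowmem figure is only an example and isqrt() again stands in for int_sqrt():

#include <stdio.h>

static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* assumption: ~4 GB of freeable lowmem reported in kilobytes */
	unsigned long lowmem_kbytes = 4UL * 1024 * 1024;
	unsigned long min_free_kbytes = isqrt(lowmem_kbytes * 16);

	/* same clamping as above: never below 128k, never above 64 MB */
	if (min_free_kbytes < 128)
		min_free_kbytes = 128;
	if (min_free_kbytes > 65536)
		min_free_kbytes = 65536;

	printf("min_free_kbytes = %lu\n", min_free_kbytes);	/* 8192 */
	return 0;
}

This reproduces the "4096MB: 8192k" row of the table.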
5753 5755
5754 /* 5756 /*
5755 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5757 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5756 * that we can call two helper functions whenever min_free_kbytes 5758 * that we can call two helper functions whenever min_free_kbytes
5757 * changes. 5759 * changes.
5758 */ 5760 */
5759 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5761 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5760 void __user *buffer, size_t *length, loff_t *ppos) 5762 void __user *buffer, size_t *length, loff_t *ppos)
5761 { 5763 {
5762 int rc; 5764 int rc;
5763 5765
5764 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5766 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5765 if (rc) 5767 if (rc)
5766 return rc; 5768 return rc;
5767 5769
5768 if (write) { 5770 if (write) {
5769 user_min_free_kbytes = min_free_kbytes; 5771 user_min_free_kbytes = min_free_kbytes;
5770 setup_per_zone_wmarks(); 5772 setup_per_zone_wmarks();
5771 } 5773 }
5772 return 0; 5774 return 0;
5773 } 5775 }
5774 5776
5775 #ifdef CONFIG_NUMA 5777 #ifdef CONFIG_NUMA
5776 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5778 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5777 void __user *buffer, size_t *length, loff_t *ppos) 5779 void __user *buffer, size_t *length, loff_t *ppos)
5778 { 5780 {
5779 struct zone *zone; 5781 struct zone *zone;
5780 int rc; 5782 int rc;
5781 5783
5782 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5784 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5783 if (rc) 5785 if (rc)
5784 return rc; 5786 return rc;
5785 5787
5786 for_each_zone(zone) 5788 for_each_zone(zone)
5787 zone->min_unmapped_pages = (zone->managed_pages * 5789 zone->min_unmapped_pages = (zone->managed_pages *
5788 sysctl_min_unmapped_ratio) / 100; 5790 sysctl_min_unmapped_ratio) / 100;
5789 return 0; 5791 return 0;
5790 } 5792 }
5791 5793
5792 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5794 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5793 void __user *buffer, size_t *length, loff_t *ppos) 5795 void __user *buffer, size_t *length, loff_t *ppos)
5794 { 5796 {
5795 struct zone *zone; 5797 struct zone *zone;
5796 int rc; 5798 int rc;
5797 5799
5798 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5800 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5799 if (rc) 5801 if (rc)
5800 return rc; 5802 return rc;
5801 5803
5802 for_each_zone(zone) 5804 for_each_zone(zone)
5803 zone->min_slab_pages = (zone->managed_pages * 5805 zone->min_slab_pages = (zone->managed_pages *
5804 sysctl_min_slab_ratio) / 100; 5806 sysctl_min_slab_ratio) / 100;
5805 return 0; 5807 return 0;
5806 } 5808 }
5807 #endif 5809 #endif
5808 5810
5809 /* 5811 /*
5810 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5812 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5811 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5813 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5812 * whenever sysctl_lowmem_reserve_ratio changes. 5814 * whenever sysctl_lowmem_reserve_ratio changes.
5813 * 5815 *
5814 * The reserve ratio has no relation to the minimum 5816 * The reserve ratio has no relation to the minimum
5815 * watermarks. The lowmem reserve ratio only makes sense 5817 * watermarks. The lowmem reserve ratio only makes sense
5816 * as a function of the boot-time zone sizes. 5818 * as a function of the boot-time zone sizes.
5817 */ 5819 */
5818 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5820 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5819 void __user *buffer, size_t *length, loff_t *ppos) 5821 void __user *buffer, size_t *length, loff_t *ppos)
5820 { 5822 {
5821 proc_dointvec_minmax(table, write, buffer, length, ppos); 5823 proc_dointvec_minmax(table, write, buffer, length, ppos);
5822 setup_per_zone_lowmem_reserve(); 5824 setup_per_zone_lowmem_reserve();
5823 return 0; 5825 return 0;
5824 } 5826 }
5825 5827
5826 /* 5828 /*
5827 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5829 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5828 * cpu. It is the fraction of total pages in each zone that a hot per cpu 5830 * cpu. It is the fraction of total pages in each zone that a hot per cpu
5829 * pagelist can have before it gets flushed back to the buddy allocator. 5831 * pagelist can have before it gets flushed back to the buddy allocator.
5830 */ 5832 */
5831 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5833 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5832 void __user *buffer, size_t *length, loff_t *ppos) 5834 void __user *buffer, size_t *length, loff_t *ppos)
5833 { 5835 {
5834 struct zone *zone; 5836 struct zone *zone;
5835 int old_percpu_pagelist_fraction; 5837 int old_percpu_pagelist_fraction;
5836 int ret; 5838 int ret;
5837 5839
5838 mutex_lock(&pcp_batch_high_lock); 5840 mutex_lock(&pcp_batch_high_lock);
5839 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 5841 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5840 5842
5841 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5843 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5842 if (!write || ret < 0) 5844 if (!write || ret < 0)
5843 goto out; 5845 goto out;
5844 5846
5845 /* Sanity checking to avoid pcp imbalance */ 5847 /* Sanity checking to avoid pcp imbalance */
5846 if (percpu_pagelist_fraction && 5848 if (percpu_pagelist_fraction &&
5847 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 5849 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5848 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 5850 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5849 ret = -EINVAL; 5851 ret = -EINVAL;
5850 goto out; 5852 goto out;
5851 } 5853 }
5852 5854
5853 /* No change? */ 5855 /* No change? */
5854 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 5856 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5855 goto out; 5857 goto out;
5856 5858
5857 for_each_populated_zone(zone) { 5859 for_each_populated_zone(zone) {
5858 unsigned int cpu; 5860 unsigned int cpu;
5859 5861
5860 for_each_possible_cpu(cpu) 5862 for_each_possible_cpu(cpu)
5861 pageset_set_high_and_batch(zone, 5863 pageset_set_high_and_batch(zone,
5862 per_cpu_ptr(zone->pageset, cpu)); 5864 per_cpu_ptr(zone->pageset, cpu));
5863 } 5865 }
5864 out: 5866 out:
5865 mutex_unlock(&pcp_batch_high_lock); 5867 mutex_unlock(&pcp_batch_high_lock);
5866 return ret; 5868 return ret;
5867 } 5869 }
5868 5870
5869 int hashdist = HASHDIST_DEFAULT; 5871 int hashdist = HASHDIST_DEFAULT;
5870 5872
5871 #ifdef CONFIG_NUMA 5873 #ifdef CONFIG_NUMA
5872 static int __init set_hashdist(char *str) 5874 static int __init set_hashdist(char *str)
5873 { 5875 {
5874 if (!str) 5876 if (!str)
5875 return 0; 5877 return 0;
5876 hashdist = simple_strtoul(str, &str, 0); 5878 hashdist = simple_strtoul(str, &str, 0);
5877 return 1; 5879 return 1;
5878 } 5880 }
5879 __setup("hashdist=", set_hashdist); 5881 __setup("hashdist=", set_hashdist);
5880 #endif 5882 #endif
5881 5883
5882 /* 5884 /*
5883 * allocate a large system hash table from bootmem 5885 * allocate a large system hash table from bootmem
5884 * - it is assumed that the hash table must contain an exact power-of-2 5886 * - it is assumed that the hash table must contain an exact power-of-2
5885 * quantity of entries 5887 * quantity of entries
5886 * - limit is the number of hash buckets, not the total allocation size 5888 * - limit is the number of hash buckets, not the total allocation size
5887 */ 5889 */
5888 void *__init alloc_large_system_hash(const char *tablename, 5890 void *__init alloc_large_system_hash(const char *tablename,
5889 unsigned long bucketsize, 5891 unsigned long bucketsize,
5890 unsigned long numentries, 5892 unsigned long numentries,
5891 int scale, 5893 int scale,
5892 int flags, 5894 int flags,
5893 unsigned int *_hash_shift, 5895 unsigned int *_hash_shift,
5894 unsigned int *_hash_mask, 5896 unsigned int *_hash_mask,
5895 unsigned long low_limit, 5897 unsigned long low_limit,
5896 unsigned long high_limit) 5898 unsigned long high_limit)
5897 { 5899 {
5898 unsigned long long max = high_limit; 5900 unsigned long long max = high_limit;
5899 unsigned long log2qty, size; 5901 unsigned long log2qty, size;
5900 void *table = NULL; 5902 void *table = NULL;
5901 5903
5902 /* allow the kernel cmdline to have a say */ 5904 /* allow the kernel cmdline to have a say */
5903 if (!numentries) { 5905 if (!numentries) {
5904 /* round applicable memory size up to nearest megabyte */ 5906 /* round applicable memory size up to nearest megabyte */
5905 numentries = nr_kernel_pages; 5907 numentries = nr_kernel_pages;
5906 5908
5907 /* It isn't necessary when PAGE_SIZE >= 1MB */ 5909 /* It isn't necessary when PAGE_SIZE >= 1MB */
5908 if (PAGE_SHIFT < 20) 5910 if (PAGE_SHIFT < 20)
5909 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5911 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5910 5912
5911 /* limit to 1 bucket per 2^scale bytes of low memory */ 5913 /* limit to 1 bucket per 2^scale bytes of low memory */
5912 if (scale > PAGE_SHIFT) 5914 if (scale > PAGE_SHIFT)
5913 numentries >>= (scale - PAGE_SHIFT); 5915 numentries >>= (scale - PAGE_SHIFT);
5914 else 5916 else
5915 numentries <<= (PAGE_SHIFT - scale); 5917 numentries <<= (PAGE_SHIFT - scale);
5916 5918
5917 /* Make sure we've got at least a 0-order allocation.. */ 5919 /* Make sure we've got at least a 0-order allocation.. */
5918 if (unlikely(flags & HASH_SMALL)) { 5920 if (unlikely(flags & HASH_SMALL)) {
5919 /* Makes no sense without HASH_EARLY */ 5921 /* Makes no sense without HASH_EARLY */
5920 WARN_ON(!(flags & HASH_EARLY)); 5922 WARN_ON(!(flags & HASH_EARLY));
5921 if (!(numentries >> *_hash_shift)) { 5923 if (!(numentries >> *_hash_shift)) {
5922 numentries = 1UL << *_hash_shift; 5924 numentries = 1UL << *_hash_shift;
5923 BUG_ON(!numentries); 5925 BUG_ON(!numentries);
5924 } 5926 }
5925 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5927 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5926 numentries = PAGE_SIZE / bucketsize; 5928 numentries = PAGE_SIZE / bucketsize;
5927 } 5929 }
5928 numentries = roundup_pow_of_two(numentries); 5930 numentries = roundup_pow_of_two(numentries);
5929 5931
5930 /* limit allocation size to 1/16 total memory by default */ 5932 /* limit allocation size to 1/16 total memory by default */
5931 if (max == 0) { 5933 if (max == 0) {
5932 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5934 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5933 do_div(max, bucketsize); 5935 do_div(max, bucketsize);
5934 } 5936 }
5935 max = min(max, 0x80000000ULL); 5937 max = min(max, 0x80000000ULL);
5936 5938
5937 if (numentries < low_limit) 5939 if (numentries < low_limit)
5938 numentries = low_limit; 5940 numentries = low_limit;
5939 if (numentries > max) 5941 if (numentries > max)
5940 numentries = max; 5942 numentries = max;
5941 5943
5942 log2qty = ilog2(numentries); 5944 log2qty = ilog2(numentries);
5943 5945
5944 do { 5946 do {
5945 size = bucketsize << log2qty; 5947 size = bucketsize << log2qty;
5946 if (flags & HASH_EARLY) 5948 if (flags & HASH_EARLY)
5947 table = alloc_bootmem_nopanic(size); 5949 table = alloc_bootmem_nopanic(size);
5948 else if (hashdist) 5950 else if (hashdist)
5949 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5951 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5950 else { 5952 else {
5951 /* 5953 /*
5952 * If bucketsize is not a power of two, we may need to 5954 * If bucketsize is not a power of two, we may need to
5953 * free some pages at the end of the hash table, which 5955 * free some pages at the end of the hash table, which
5954 * alloc_pages_exact() does automatically. 5956 * alloc_pages_exact() does automatically.
5955 */ 5957 */
5956 if (get_order(size) < MAX_ORDER) { 5958 if (get_order(size) < MAX_ORDER) {
5957 table = alloc_pages_exact(size, GFP_ATOMIC); 5959 table = alloc_pages_exact(size, GFP_ATOMIC);
5958 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5960 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5959 } 5961 }
5960 } 5962 }
5961 } while (!table && size > PAGE_SIZE && --log2qty); 5963 } while (!table && size > PAGE_SIZE && --log2qty);
5962 5964
5963 if (!table) 5965 if (!table)
5964 panic("Failed to allocate %s hash table\n", tablename); 5966 panic("Failed to allocate %s hash table\n", tablename);
5965 5967
5966 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5968 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5967 tablename, 5969 tablename,
5968 (1UL << log2qty), 5970 (1UL << log2qty),
5969 ilog2(size) - PAGE_SHIFT, 5971 ilog2(size) - PAGE_SHIFT,
5970 size); 5972 size);
5971 5973
5972 if (_hash_shift) 5974 if (_hash_shift)
5973 *_hash_shift = log2qty; 5975 *_hash_shift = log2qty;
5974 if (_hash_mask) 5976 if (_hash_mask)
5975 *_hash_mask = (1 << log2qty) - 1; 5977 *_hash_mask = (1 << log2qty) - 1;
5976 5978
5977 return table; 5979 return table;
5978 } 5980 }
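To make the sizing steps concrete, here is a userspace sketch with made-up caller parameters (4 GB of RAM with 4 KiB pages, scale = 17, i.e. one bucket per 128 KiB of low memory, 8-byte buckets). It only mirrors the scale/round-up/log2 arithmetic, not the memory rounding, limits or the bootmem/vmalloc fallbacks above.

#include <stdio.h>

int main(void)
{
	unsigned long nr_kernel_pages = 1UL << 20;	/* 4 GB / 4 KiB */
	unsigned int page_shift = 12, scale = 17;	/* assumptions  */
	unsigned long bucketsize = 8;

	/* one bucket per 2^scale bytes of low memory */
	unsigned long numentries = nr_kernel_pages >> (scale - page_shift);
	unsigned long log2qty = 0;

	/* round up to a power of two, as the table requires */
	while ((1UL << log2qty) < numentries)
		log2qty++;

	printf("entries: %lu, mask: %#lx, table size: %lu bytes\n",
	       1UL << log2qty, (1UL << log2qty) - 1, bucketsize << log2qty);
	return 0;
}

The returned *_hash_mask is simply (1 << log2qty) - 1, so callers index the table with hash & mask.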
5979 5981
5980 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5982 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5981 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5983 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5982 unsigned long pfn) 5984 unsigned long pfn)
5983 { 5985 {
5984 #ifdef CONFIG_SPARSEMEM 5986 #ifdef CONFIG_SPARSEMEM
5985 return __pfn_to_section(pfn)->pageblock_flags; 5987 return __pfn_to_section(pfn)->pageblock_flags;
5986 #else 5988 #else
5987 return zone->pageblock_flags; 5989 return zone->pageblock_flags;
5988 #endif /* CONFIG_SPARSEMEM */ 5990 #endif /* CONFIG_SPARSEMEM */
5989 } 5991 }
5990 5992
5991 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5993 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5992 { 5994 {
5993 #ifdef CONFIG_SPARSEMEM 5995 #ifdef CONFIG_SPARSEMEM
5994 pfn &= (PAGES_PER_SECTION-1); 5996 pfn &= (PAGES_PER_SECTION-1);
5995 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5997 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5996 #else 5998 #else
5997 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 5999 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5998 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6000 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5999 #endif /* CONFIG_SPARSEMEM */ 6001 #endif /* CONFIG_SPARSEMEM */
6000 } 6002 }
6001 6003
6002 /** 6004 /**
6003 * get_pageblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 6005 * get_pageblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
6004 * @page: The page within the block of interest 6006 * @page: The page within the block of interest
6005 * @end_bitidx: The last bit of interest 6007 * @end_bitidx: The last bit of interest
6006 * @mask: mask of bits that the caller is interested in 6008 * @mask: mask of bits that the caller is interested in
6007 * returns pageblock_bits flags 6009 * returns pageblock_bits flags
6008 */ 6010 */
6009 unsigned long get_pageblock_flags_mask(struct page *page, 6011 unsigned long get_pageblock_flags_mask(struct page *page,
6010 unsigned long end_bitidx, 6012 unsigned long end_bitidx,
6011 unsigned long mask) 6013 unsigned long mask)
6012 { 6014 {
6013 struct zone *zone; 6015 struct zone *zone;
6014 unsigned long *bitmap; 6016 unsigned long *bitmap;
6015 unsigned long pfn, bitidx, word_bitidx; 6017 unsigned long pfn, bitidx, word_bitidx;
6016 unsigned long word; 6018 unsigned long word;
6017 6019
6018 zone = page_zone(page); 6020 zone = page_zone(page);
6019 pfn = page_to_pfn(page); 6021 pfn = page_to_pfn(page);
6020 bitmap = get_pageblock_bitmap(zone, pfn); 6022 bitmap = get_pageblock_bitmap(zone, pfn);
6021 bitidx = pfn_to_bitidx(zone, pfn); 6023 bitidx = pfn_to_bitidx(zone, pfn);
6022 word_bitidx = bitidx / BITS_PER_LONG; 6024 word_bitidx = bitidx / BITS_PER_LONG;
6023 bitidx &= (BITS_PER_LONG-1); 6025 bitidx &= (BITS_PER_LONG-1);
6024 6026
6025 word = bitmap[word_bitidx]; 6027 word = bitmap[word_bitidx];
6026 bitidx += end_bitidx; 6028 bitidx += end_bitidx;
6027 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 6029 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6028 } 6030 }
6029 6031
6030 /** 6032 /**
6031 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 6033 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6032 * @page: The page within the block of interest 6034 * @page: The page within the block of interest
6033 * @mask: mask of bits that the caller is interested in 6035 * @mask: mask of bits that the caller is interested in
6034 * @end_bitidx: The last bit of interest 6036 * @end_bitidx: The last bit of interest
6035 * @flags: The flags to set 6037 * @flags: The flags to set
6036 */ 6038 */
6037 void set_pageblock_flags_mask(struct page *page, unsigned long flags, 6039 void set_pageblock_flags_mask(struct page *page, unsigned long flags,
6038 unsigned long end_bitidx, 6040 unsigned long end_bitidx,
6039 unsigned long mask) 6041 unsigned long mask)
6040 { 6042 {
6041 struct zone *zone; 6043 struct zone *zone;
6042 unsigned long *bitmap; 6044 unsigned long *bitmap;
6043 unsigned long pfn, bitidx, word_bitidx; 6045 unsigned long pfn, bitidx, word_bitidx;
6044 unsigned long old_word, word; 6046 unsigned long old_word, word;
6045 6047
6046 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6048 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6047 6049
6048 zone = page_zone(page); 6050 zone = page_zone(page);
6049 pfn = page_to_pfn(page); 6051 pfn = page_to_pfn(page);
6050 bitmap = get_pageblock_bitmap(zone, pfn); 6052 bitmap = get_pageblock_bitmap(zone, pfn);
6051 bitidx = pfn_to_bitidx(zone, pfn); 6053 bitidx = pfn_to_bitidx(zone, pfn);
6052 word_bitidx = bitidx / BITS_PER_LONG; 6054 word_bitidx = bitidx / BITS_PER_LONG;
6053 bitidx &= (BITS_PER_LONG-1); 6055 bitidx &= (BITS_PER_LONG-1);
6054 6056
6055 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6057 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
6056 6058
6057 bitidx += end_bitidx; 6059 bitidx += end_bitidx;
6058 mask <<= (BITS_PER_LONG - bitidx - 1); 6060 mask <<= (BITS_PER_LONG - bitidx - 1);
6059 flags <<= (BITS_PER_LONG - bitidx - 1); 6061 flags <<= (BITS_PER_LONG - bitidx - 1);
6060 6062
6061 word = ACCESS_ONCE(bitmap[word_bitidx]); 6063 word = ACCESS_ONCE(bitmap[word_bitidx]);
6062 for (;;) { 6064 for (;;) {
6063 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6065 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6064 if (word == old_word) 6066 if (word == old_word)
6065 break; 6067 break;
6066 word = old_word; 6068 word = old_word;
6067 } 6069 }
6068 } 6070 }
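The update loop above is a plain compare-and-swap retry pattern. A userspace sketch of the same idea, using the GCC/Clang __sync builtin as a stand-in for the kernel's cmpxchg(); the word, field width and shift are arbitrary and set_field() is invented for the example.

#include <stdio.h>

static unsigned long bitmap_word = 0xf0f0f0f0UL;	/* shared word */

/* replace the masked field inside bitmap_word without taking a lock */
static void set_field(unsigned long mask, unsigned long flags, int shift)
{
	unsigned long old_word, word;

	mask  <<= shift;
	flags <<= shift;

	word = bitmap_word;
	for (;;) {
		old_word = __sync_val_compare_and_swap(&bitmap_word, word,
						       (word & ~mask) | flags);
		if (old_word == word)	/* nobody raced with us, done */
			break;
		word = old_word;	/* retry against the fresh value */
	}
}

int main(void)
{
	set_field(0x7, 0x5, 8);		/* write 0b101 into bits 8..10 */
	printf("word is now %#lx\n", bitmap_word);
	return 0;
}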
6069 6071
6070 /* 6072 /*
6071 * This function checks whether the pageblock includes unmovable pages or not. 6073 * This function checks whether the pageblock includes unmovable pages or not.
6072 * If @count is not zero, it is okay to include up to @count unmovable pages. 6074 * If @count is not zero, it is okay to include up to @count unmovable pages.
6073 * 6075 *
6074 * A PageLRU check without isolation or lru_lock could race, so a 6076 * A PageLRU check without isolation or lru_lock could race, so a
6075 * MIGRATE_MOVABLE block might include unmovable pages. This means you 6077 * MIGRATE_MOVABLE block might include unmovable pages. This means you
6076 * can't expect this function to be exact. 6078 * can't expect this function to be exact.
6077 */ 6079 */
6078 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 6080 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6079 bool skip_hwpoisoned_pages) 6081 bool skip_hwpoisoned_pages)
6080 { 6082 {
6081 unsigned long pfn, iter, found; 6083 unsigned long pfn, iter, found;
6082 int mt; 6084 int mt;
6083 6085
6084 /* 6086 /*
6085 * To avoid noisy data, lru_add_drain_all() should be called first. 6087 * To avoid noisy data, lru_add_drain_all() should be called first.
6086 * If the zone is ZONE_MOVABLE, it never contains unmovable pages. 6088 * If the zone is ZONE_MOVABLE, it never contains unmovable pages.
6087 */ 6089 */
6088 if (zone_idx(zone) == ZONE_MOVABLE) 6090 if (zone_idx(zone) == ZONE_MOVABLE)
6089 return false; 6091 return false;
6090 mt = get_pageblock_migratetype(page); 6092 mt = get_pageblock_migratetype(page);
6091 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 6093 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
6092 return false; 6094 return false;
6093 6095
6094 pfn = page_to_pfn(page); 6096 pfn = page_to_pfn(page);
6095 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 6097 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6096 unsigned long check = pfn + iter; 6098 unsigned long check = pfn + iter;
6097 6099
6098 if (!pfn_valid_within(check)) 6100 if (!pfn_valid_within(check))
6099 continue; 6101 continue;
6100 6102
6101 page = pfn_to_page(check); 6103 page = pfn_to_page(check);
6102 6104
6103 /* 6105 /*
6104 * Hugepages are not in LRU lists, but they're movable. 6106 * Hugepages are not in LRU lists, but they're movable.
6105 * We need not scan over tail pages because we don't 6107 * We need not scan over tail pages because we don't
6106 * handle each tail page individually in migration. 6108 * handle each tail page individually in migration.
6107 */ 6109 */
6108 if (PageHuge(page)) { 6110 if (PageHuge(page)) {
6109 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6111 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6110 continue; 6112 continue;
6111 } 6113 }
6112 6114
6113 /* 6115 /*
6114 * We can't use page_count() without pinning the page 6116 * We can't use page_count() without pinning the page
6115 * because another CPU can free the compound page. 6117 * because another CPU can free the compound page.
6116 * This check already skips compound tails of THP 6118 * This check already skips compound tails of THP
6117 * because their page->_count is zero at all times. 6119 * because their page->_count is zero at all times.
6118 */ 6120 */
6119 if (!atomic_read(&page->_count)) { 6121 if (!atomic_read(&page->_count)) {
6120 if (PageBuddy(page)) 6122 if (PageBuddy(page))
6121 iter += (1 << page_order(page)) - 1; 6123 iter += (1 << page_order(page)) - 1;
6122 continue; 6124 continue;
6123 } 6125 }
6124 6126
6125 /* 6127 /*
6126 * The HWPoisoned page may not be in the buddy system, and 6128 * The HWPoisoned page may not be in the buddy system, and
6127 * its page_count() is not 0. 6129 * its page_count() is not 0.
6128 */ 6130 */
6129 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6131 if (skip_hwpoisoned_pages && PageHWPoison(page))
6130 continue; 6132 continue;
6131 6133
6132 if (!PageLRU(page)) 6134 if (!PageLRU(page))
6133 found++; 6135 found++;
6134 /* 6136 /*
6135 * If there are RECLAIMABLE pages, we need to check them. 6137 * If there are RECLAIMABLE pages, we need to check them.
6136 * But for now, memory offlining itself doesn't call shrink_slab(), 6138 * But for now, memory offlining itself doesn't call shrink_slab(),
6137 * and this still needs to be fixed. 6139 * and this still needs to be fixed.
6138 */ 6140 */
6139 /* 6141 /*
6140 * If the page is not RAM, page_count() should be 0 and we 6142 * If the page is not RAM, page_count() should be 0 and we
6141 * don't need further checks. This is a _used_, non-movable page. 6143 * don't need further checks. This is a _used_, non-movable page.
6142 * 6144 *
6143 * The problematic thing here is PG_reserved pages. PG_reserved 6145 * The problematic thing here is PG_reserved pages. PG_reserved
6144 * is set on both memory hole pages and _used_ kernel 6146 * is set on both memory hole pages and _used_ kernel
6145 * pages at boot. 6147 * pages at boot.
6146 */ 6148 */
6147 if (found > count) 6149 if (found > count)
6148 return true; 6150 return true;
6149 } 6151 }
6150 return false; 6152 return false;
6151 } 6153 }
6152 6154
6153 bool is_pageblock_removable_nolock(struct page *page) 6155 bool is_pageblock_removable_nolock(struct page *page)
6154 { 6156 {
6155 struct zone *zone; 6157 struct zone *zone;
6156 unsigned long pfn; 6158 unsigned long pfn;
6157 6159
6158 /* 6160 /*
6159 * We have to be careful here because we are iterating over memory 6161 * We have to be careful here because we are iterating over memory
6160 * sections which are not zone aware so we might end up outside of 6162 * sections which are not zone aware so we might end up outside of
6161 * the zone but still within the section. 6163 * the zone but still within the section.
6162 * We also have to be careful about the node. If the node is offline, 6164 * We also have to be careful about the node. If the node is offline,
6163 * its NODE_DATA will be NULL - see page_zone. 6165 * its NODE_DATA will be NULL - see page_zone.
6164 */ 6166 */
6165 if (!node_online(page_to_nid(page))) 6167 if (!node_online(page_to_nid(page)))
6166 return false; 6168 return false;
6167 6169
6168 zone = page_zone(page); 6170 zone = page_zone(page);
6169 pfn = page_to_pfn(page); 6171 pfn = page_to_pfn(page);
6170 if (!zone_spans_pfn(zone, pfn)) 6172 if (!zone_spans_pfn(zone, pfn))
6171 return false; 6173 return false;
6172 6174
6173 return !has_unmovable_pages(zone, page, 0, true); 6175 return !has_unmovable_pages(zone, page, 0, true);
6174 } 6176 }
6175 6177
6176 #ifdef CONFIG_CMA 6178 #ifdef CONFIG_CMA
6177 6179
6178 static unsigned long pfn_max_align_down(unsigned long pfn) 6180 static unsigned long pfn_max_align_down(unsigned long pfn)
6179 { 6181 {
6180 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6182 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6181 pageblock_nr_pages) - 1); 6183 pageblock_nr_pages) - 1);
6182 } 6184 }
6183 6185
6184 static unsigned long pfn_max_align_up(unsigned long pfn) 6186 static unsigned long pfn_max_align_up(unsigned long pfn)
6185 { 6187 {
6186 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6188 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6187 pageblock_nr_pages)); 6189 pageblock_nr_pages));
6188 } 6190 }
6189 6191
6190 /* [start, end) must belong to a single zone. */ 6192 /* [start, end) must belong to a single zone. */
6191 static int __alloc_contig_migrate_range(struct compact_control *cc, 6193 static int __alloc_contig_migrate_range(struct compact_control *cc,
6192 unsigned long start, unsigned long end) 6194 unsigned long start, unsigned long end)
6193 { 6195 {
6194 /* This function is based on compact_zone() from compaction.c. */ 6196 /* This function is based on compact_zone() from compaction.c. */
6195 unsigned long nr_reclaimed; 6197 unsigned long nr_reclaimed;
6196 unsigned long pfn = start; 6198 unsigned long pfn = start;
6197 unsigned int tries = 0; 6199 unsigned int tries = 0;
6198 int ret = 0; 6200 int ret = 0;
6199 6201
6200 migrate_prep(); 6202 migrate_prep();
6201 6203
6202 while (pfn < end || !list_empty(&cc->migratepages)) { 6204 while (pfn < end || !list_empty(&cc->migratepages)) {
6203 if (fatal_signal_pending(current)) { 6205 if (fatal_signal_pending(current)) {
6204 ret = -EINTR; 6206 ret = -EINTR;
6205 break; 6207 break;
6206 } 6208 }
6207 6209
6208 if (list_empty(&cc->migratepages)) { 6210 if (list_empty(&cc->migratepages)) {
6209 cc->nr_migratepages = 0; 6211 cc->nr_migratepages = 0;
6210 pfn = isolate_migratepages_range(cc->zone, cc, 6212 pfn = isolate_migratepages_range(cc->zone, cc,
6211 pfn, end, true); 6213 pfn, end, true);
6212 if (!pfn) { 6214 if (!pfn) {
6213 ret = -EINTR; 6215 ret = -EINTR;
6214 break; 6216 break;
6215 } 6217 }
6216 tries = 0; 6218 tries = 0;
6217 } else if (++tries == 5) { 6219 } else if (++tries == 5) {
6218 ret = ret < 0 ? ret : -EBUSY; 6220 ret = ret < 0 ? ret : -EBUSY;
6219 break; 6221 break;
6220 } 6222 }
6221 6223
6222 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6224 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6223 &cc->migratepages); 6225 &cc->migratepages);
6224 cc->nr_migratepages -= nr_reclaimed; 6226 cc->nr_migratepages -= nr_reclaimed;
6225 6227
6226 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6228 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6227 NULL, 0, cc->mode, MR_CMA); 6229 NULL, 0, cc->mode, MR_CMA);
6228 } 6230 }
6229 if (ret < 0) { 6231 if (ret < 0) {
6230 putback_movable_pages(&cc->migratepages); 6232 putback_movable_pages(&cc->migratepages);
6231 return ret; 6233 return ret;
6232 } 6234 }
6233 return 0; 6235 return 0;
6234 } 6236 }
6235 6237
6236 /** 6238 /**
6237 * alloc_contig_range() -- tries to allocate given range of pages 6239 * alloc_contig_range() -- tries to allocate given range of pages
6238 * @start: start PFN to allocate 6240 * @start: start PFN to allocate
6239 * @end: one-past-the-last PFN to allocate 6241 * @end: one-past-the-last PFN to allocate
6240 * @migratetype: migratetype of the underlying pageblocks (either 6242 * @migratetype: migratetype of the underlying pageblocks (either
6241 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6243 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6242 * in range must have the same migratetype and it must 6244 * in range must have the same migratetype and it must
6243 * be either of the two. 6245 * be either of the two.
6244 * 6246 *
6245 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6247 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6246 * aligned, however it's the caller's responsibility to guarantee that 6248 * aligned, however it's the caller's responsibility to guarantee that
6247 * we are the only thread that changes migrate type of pageblocks the 6249 * we are the only thread that changes migrate type of pageblocks the
6248 * pages fall in. 6250 * pages fall in.
6249 * 6251 *
6250 * The PFN range must belong to a single zone. 6252 * The PFN range must belong to a single zone.
6251 * 6253 *
6252 * Returns zero on success or negative error code. On success all 6254 * Returns zero on success or negative error code. On success all
6253 * pages whose PFN is in [start, end) are allocated for the caller and 6255 * pages whose PFN is in [start, end) are allocated for the caller and
6254 * need to be freed with free_contig_range(). 6256 * need to be freed with free_contig_range().
6255 */ 6257 */
6256 int alloc_contig_range(unsigned long start, unsigned long end, 6258 int alloc_contig_range(unsigned long start, unsigned long end,
6257 unsigned migratetype) 6259 unsigned migratetype)
6258 { 6260 {
6259 unsigned long outer_start, outer_end; 6261 unsigned long outer_start, outer_end;
6260 int ret = 0, order; 6262 int ret = 0, order;
6261 6263
6262 struct compact_control cc = { 6264 struct compact_control cc = {
6263 .nr_migratepages = 0, 6265 .nr_migratepages = 0,
6264 .order = -1, 6266 .order = -1,
6265 .zone = page_zone(pfn_to_page(start)), 6267 .zone = page_zone(pfn_to_page(start)),
6266 .mode = MIGRATE_SYNC, 6268 .mode = MIGRATE_SYNC,
6267 .ignore_skip_hint = true, 6269 .ignore_skip_hint = true,
6268 }; 6270 };
6269 INIT_LIST_HEAD(&cc.migratepages); 6271 INIT_LIST_HEAD(&cc.migratepages);
6270 6272
6271 /* 6273 /*
6272 * What we do here is we mark all pageblocks in range as 6274 * What we do here is we mark all pageblocks in range as
6273 * MIGRATE_ISOLATE. Because pageblock and max order pages may 6275 * MIGRATE_ISOLATE. Because pageblock and max order pages may
6274 * have different sizes, and due to the way the page allocator 6276 * have different sizes, and due to the way the page allocator
6275 * works, we align the range to the biggest of the two pages so 6277 * works, we align the range to the biggest of the two pages so
6276 * that page allocator won't try to merge buddies from 6278 * that page allocator won't try to merge buddies from
6277 * different pageblocks and change MIGRATE_ISOLATE to some 6279 * different pageblocks and change MIGRATE_ISOLATE to some
6278 * other migration type. 6280 * other migration type.
6279 * 6281 *
6280 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6282 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6281 * migrate the pages from an unaligned range (i.e. pages that 6283 * migrate the pages from an unaligned range (i.e. pages that
6282 * we are interested in). This will put all the pages in 6284 * we are interested in). This will put all the pages in
6283 * range back to page allocator as MIGRATE_ISOLATE. 6285 * range back to page allocator as MIGRATE_ISOLATE.
6284 * 6286 *
6285 * When this is done, we take the pages in range from page 6287 * When this is done, we take the pages in range from page
6286 * allocator removing them from the buddy system. This way 6288 * allocator removing them from the buddy system. This way
6287 * page allocator will never consider using them. 6289 * page allocator will never consider using them.
6288 * 6290 *
6289 * This lets us mark the pageblocks back as 6291 * This lets us mark the pageblocks back as
6290 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6292 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6291 * aligned range but not in the unaligned, original range are 6293 * aligned range but not in the unaligned, original range are
6292 * put back to page allocator so that buddy can use them. 6294 * put back to page allocator so that buddy can use them.
6293 */ 6295 */
6294 6296
6295 ret = start_isolate_page_range(pfn_max_align_down(start), 6297 ret = start_isolate_page_range(pfn_max_align_down(start),
6296 pfn_max_align_up(end), migratetype, 6298 pfn_max_align_up(end), migratetype,
6297 false); 6299 false);
6298 if (ret) 6300 if (ret)
6299 return ret; 6301 return ret;
6300 6302
6301 ret = __alloc_contig_migrate_range(&cc, start, end); 6303 ret = __alloc_contig_migrate_range(&cc, start, end);
6302 if (ret) 6304 if (ret)
6303 goto done; 6305 goto done;
6304 6306
6305 /* 6307 /*
6306 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 6308 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
6307 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6309 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6308 * more, all pages in [start, end) are free in page allocator. 6310 * more, all pages in [start, end) are free in page allocator.
6309 * What we are going to do is to allocate all pages from 6311 * What we are going to do is to allocate all pages from
6310 * [start, end) (that is remove them from page allocator). 6312 * [start, end) (that is remove them from page allocator).
6311 * 6313 *
6312 * The only problem is that pages at the beginning and at the 6314 * The only problem is that pages at the beginning and at the
6313 * end of the interesting range may not be aligned with pages that 6315 * end of the interesting range may not be aligned with pages that
6314 * the page allocator holds, i.e. they can be part of higher order 6316 * the page allocator holds, i.e. they can be part of higher order
6315 * pages. Because of this, we reserve the bigger range and 6317 * pages. Because of this, we reserve the bigger range and
6316 * once this is done free the pages we are not interested in. 6318 * once this is done free the pages we are not interested in.
6317 * 6319 *
6318 * We don't have to hold zone->lock here because the pages are 6320 * We don't have to hold zone->lock here because the pages are
6319 * isolated, thus they won't get removed from the buddy system. 6321 * isolated, thus they won't get removed from the buddy system.
6320 */ 6322 */
6321 6323
6322 lru_add_drain_all(); 6324 lru_add_drain_all();
6323 drain_all_pages(); 6325 drain_all_pages();
6324 6326
6325 order = 0; 6327 order = 0;
6326 outer_start = start; 6328 outer_start = start;
6327 while (!PageBuddy(pfn_to_page(outer_start))) { 6329 while (!PageBuddy(pfn_to_page(outer_start))) {
6328 if (++order >= MAX_ORDER) { 6330 if (++order >= MAX_ORDER) {
6329 ret = -EBUSY; 6331 ret = -EBUSY;
6330 goto done; 6332 goto done;
6331 } 6333 }
6332 outer_start &= ~0UL << order; 6334 outer_start &= ~0UL << order;
6333 } 6335 }
6334 6336
6335 /* Make sure the range is really isolated. */ 6337 /* Make sure the range is really isolated. */
6336 if (test_pages_isolated(outer_start, end, false)) { 6338 if (test_pages_isolated(outer_start, end, false)) {
6337 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6339 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
6338 outer_start, end); 6340 outer_start, end);
6339 ret = -EBUSY; 6341 ret = -EBUSY;
6340 goto done; 6342 goto done;
6341 } 6343 }
6342 6344
6343 6345
6344 /* Grab isolated pages from freelists. */ 6346 /* Grab isolated pages from freelists. */
6345 outer_end = isolate_freepages_range(&cc, outer_start, end); 6347 outer_end = isolate_freepages_range(&cc, outer_start, end);
6346 if (!outer_end) { 6348 if (!outer_end) {
6347 ret = -EBUSY; 6349 ret = -EBUSY;
6348 goto done; 6350 goto done;
6349 } 6351 }
6350 6352
6351 /* Free head and tail (if any) */ 6353 /* Free head and tail (if any) */
6352 if (start != outer_start) 6354 if (start != outer_start)
6353 free_contig_range(outer_start, start - outer_start); 6355 free_contig_range(outer_start, start - outer_start);
6354 if (end != outer_end) 6356 if (end != outer_end)
6355 free_contig_range(end, outer_end - end); 6357 free_contig_range(end, outer_end - end);
6356 6358
6357 done: 6359 done:
6358 undo_isolate_page_range(pfn_max_align_down(start), 6360 undo_isolate_page_range(pfn_max_align_down(start),
6359 pfn_max_align_up(end), migratetype); 6361 pfn_max_align_up(end), migratetype);
6360 return ret; 6362 return ret;
6361 } 6363 }
6362 6364
6363 void free_contig_range(unsigned long pfn, unsigned nr_pages) 6365 void free_contig_range(unsigned long pfn, unsigned nr_pages)
6364 { 6366 {
6365 unsigned int count = 0; 6367 unsigned int count = 0;
6366 6368
6367 for (; nr_pages--; pfn++) { 6369 for (; nr_pages--; pfn++) {
6368 struct page *page = pfn_to_page(pfn); 6370 struct page *page = pfn_to_page(pfn);
6369 6371
6370 count += page_count(page) != 1; 6372 count += page_count(page) != 1;
6371 __free_page(page); 6373 __free_page(page);
6372 } 6374 }
6373 WARN(count != 0, "%d pages are still in use!\n", count); 6375 WARN(count != 0, "%d pages are still in use!\n", count);
6374 } 6376 }
6375 #endif 6377 #endif
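/*
 * Illustrative sketch (not part of the page_alloc.c diff above): how a
 * caller such as a CMA-style allocator might use alloc_contig_range()
 * and free_contig_range() under the contract documented in the kerneldoc.
 * The helper names, the PFN arguments and the assumption that the
 * underlying pageblocks are already MIGRATE_CMA are hypothetical.
 */
static struct page *grab_contig_pages(unsigned long base_pfn,
				      unsigned nr_pages)
{
	/*
	 * All pageblocks in [base_pfn, base_pfn + nr_pages) must share the
	 * MIGRATE_CMA (or MIGRATE_MOVABLE) migratetype and belong to a
	 * single zone, as the kerneldoc requires.
	 */
	int ret = alloc_contig_range(base_pfn, base_pfn + nr_pages,
				     MIGRATE_CMA);

	if (ret)	/* e.g. -EBUSY or -EINTR from the migration step */
		return NULL;
	return pfn_to_page(base_pfn);
}

static void release_contig_pages(struct page *page, unsigned nr_pages)
{
	/* Pages obtained above must be returned via free_contig_range(). */
	free_contig_range(page_to_pfn(page), nr_pages);
}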
6376 6378
6377 #ifdef CONFIG_MEMORY_HOTPLUG 6379 #ifdef CONFIG_MEMORY_HOTPLUG
6378 /* 6380 /*
6379 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6381 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6380 * page high values need to be recalculated. 6382 * page high values need to be recalculated.
6381 */ 6383 */
6382 void __meminit zone_pcp_update(struct zone *zone) 6384 void __meminit zone_pcp_update(struct zone *zone)
6383 { 6385 {
6384 unsigned cpu; 6386 unsigned cpu;
6385 mutex_lock(&pcp_batch_high_lock); 6387 mutex_lock(&pcp_batch_high_lock);
6386 for_each_possible_cpu(cpu) 6388 for_each_possible_cpu(cpu)
6387 pageset_set_high_and_batch(zone, 6389 pageset_set_high_and_batch(zone,
6388 per_cpu_ptr(zone->pageset, cpu)); 6390 per_cpu_ptr(zone->pageset, cpu));
6389 mutex_unlock(&pcp_batch_high_lock); 6391 mutex_unlock(&pcp_batch_high_lock);
6390 } 6392 }
6391 #endif 6393 #endif
6392 6394
6393 void zone_pcp_reset(struct zone *zone) 6395 void zone_pcp_reset(struct zone *zone)
6394 { 6396 {
6395 unsigned long flags; 6397 unsigned long flags;
6396 int cpu; 6398 int cpu;
6397 struct per_cpu_pageset *pset; 6399 struct per_cpu_pageset *pset;
6398 6400
6399 /* avoid races with drain_pages() */ 6401 /* avoid races with drain_pages() */
6400 local_irq_save(flags); 6402 local_irq_save(flags);
6401 if (zone->pageset != &boot_pageset) { 6403 if (zone->pageset != &boot_pageset) {
6402 for_each_online_cpu(cpu) { 6404 for_each_online_cpu(cpu) {
6403 pset = per_cpu_ptr(zone->pageset, cpu); 6405 pset = per_cpu_ptr(zone->pageset, cpu);
6404 drain_zonestat(zone, pset); 6406 drain_zonestat(zone, pset);
6405 } 6407 }
6406 free_percpu(zone->pageset); 6408 free_percpu(zone->pageset);
6407 zone->pageset = &boot_pageset; 6409 zone->pageset = &boot_pageset;
6408 } 6410 }
6409 local_irq_restore(flags); 6411 local_irq_restore(flags);
6410 } 6412 }
6411 6413
6412 #ifdef CONFIG_MEMORY_HOTREMOVE 6414 #ifdef CONFIG_MEMORY_HOTREMOVE
6413 /* 6415 /*
6414 * All pages in the range must be isolated before calling this. 6416 * All pages in the range must be isolated before calling this.
6415 */ 6417 */
6416 void 6418 void
6417 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6419 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6418 { 6420 {
6419 struct page *page; 6421 struct page *page;
6420 struct zone *zone; 6422 struct zone *zone;
6421 int order, i; 6423 int order, i;
6422 unsigned long pfn; 6424 unsigned long pfn;
6423 unsigned long flags; 6425 unsigned long flags;
6424 /* find the first valid pfn */ 6426 /* find the first valid pfn */
6425 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6427 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6426 if (pfn_valid(pfn)) 6428 if (pfn_valid(pfn))
6427 break; 6429 break;
6428 if (pfn == end_pfn) 6430 if (pfn == end_pfn)
6429 return; 6431 return;
6430 zone = page_zone(pfn_to_page(pfn)); 6432 zone = page_zone(pfn_to_page(pfn));
6431 spin_lock_irqsave(&zone->lock, flags); 6433 spin_lock_irqsave(&zone->lock, flags);
6432 pfn = start_pfn; 6434 pfn = start_pfn;
6433 while (pfn < end_pfn) { 6435 while (pfn < end_pfn) {
6434 if (!pfn_valid(pfn)) { 6436 if (!pfn_valid(pfn)) {
6435 pfn++; 6437 pfn++;
6436 continue; 6438 continue;
6437 } 6439 }
6438 page = pfn_to_page(pfn); 6440 page = pfn_to_page(pfn);
6439 /* 6441 /*
6440 * The HWPoisoned page may not be in the buddy system, and 6442 * The HWPoisoned page may not be in the buddy system, and
6441 * page_count() is not 0. 6443 * page_count() is not 0.
6442 */ 6444 */
6443 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6445 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6444 pfn++; 6446 pfn++;
6445 SetPageReserved(page); 6447 SetPageReserved(page);
6446 continue; 6448 continue;
6447 } 6449 }
6448 6450
6449 BUG_ON(page_count(page)); 6451 BUG_ON(page_count(page));
6450 BUG_ON(!PageBuddy(page)); 6452 BUG_ON(!PageBuddy(page));
6451 order = page_order(page); 6453 order = page_order(page);
6452 #ifdef CONFIG_DEBUG_VM 6454 #ifdef CONFIG_DEBUG_VM
6453 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6455 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6454 pfn, 1 << order, end_pfn); 6456 pfn, 1 << order, end_pfn);
6455 #endif 6457 #endif
6456 list_del(&page->lru); 6458 list_del(&page->lru);
6457 rmv_page_order(page); 6459 rmv_page_order(page);
6458 zone->free_area[order].nr_free--; 6460 zone->free_area[order].nr_free--;
6459 for (i = 0; i < (1 << order); i++) 6461 for (i = 0; i < (1 << order); i++)
6460 SetPageReserved((page+i)); 6462 SetPageReserved((page+i));
6461 pfn += (1 << order); 6463 pfn += (1 << order);
6462 } 6464 }
6463 spin_unlock_irqrestore(&zone->lock, flags); 6465 spin_unlock_irqrestore(&zone->lock, flags);
6464 } 6466 }
6465 #endif 6467 #endif
6466 6468
6467 #ifdef CONFIG_MEMORY_FAILURE 6469 #ifdef CONFIG_MEMORY_FAILURE
6468 bool is_free_buddy_page(struct page *page) 6470 bool is_free_buddy_page(struct page *page)
6469 { 6471 {
6470 struct zone *zone = page_zone(page); 6472 struct zone *zone = page_zone(page);
6471 unsigned long pfn = page_to_pfn(page); 6473 unsigned long pfn = page_to_pfn(page);
6472 unsigned long flags; 6474 unsigned long flags;
6473 int order; 6475 int order;
6474 6476
6475 spin_lock_irqsave(&zone->lock, flags); 6477 spin_lock_irqsave(&zone->lock, flags);
6476 for (order = 0; order < MAX_ORDER; order++) { 6478 for (order = 0; order < MAX_ORDER; order++) {
6477 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6479 struct page *page_head = page - (pfn & ((1 << order) - 1));
6478 6480
6479 if (PageBuddy(page_head) && page_order(page_head) >= order) 6481 if (PageBuddy(page_head) && page_order(page_head) >= order)
6480 break; 6482 break;
6481 } 6483 }
6482 spin_unlock_irqrestore(&zone->lock, flags); 6484 spin_unlock_irqrestore(&zone->lock, flags);
6483 6485
6484 return order < MAX_ORDER; 6486 return order < MAX_ORDER;
6485 } 6487 }
6486 #endif 6488 #endif
6487 6489
6488 static const struct trace_print_flags pageflag_names[] = { 6490 static const struct trace_print_flags pageflag_names[] = {
6489 {1UL << PG_locked, "locked" }, 6491 {1UL << PG_locked, "locked" },
6490 {1UL << PG_error, "error" }, 6492 {1UL << PG_error, "error" },
6491 {1UL << PG_referenced, "referenced" }, 6493 {1UL << PG_referenced, "referenced" },
6492 {1UL << PG_uptodate, "uptodate" }, 6494 {1UL << PG_uptodate, "uptodate" },
6493 {1UL << PG_dirty, "dirty" }, 6495 {1UL << PG_dirty, "dirty" },
6494 {1UL << PG_lru, "lru" }, 6496 {1UL << PG_lru, "lru" },
6495 {1UL << PG_active, "active" }, 6497 {1UL << PG_active, "active" },
6496 {1UL << PG_slab, "slab" }, 6498 {1UL << PG_slab, "slab" },
6497 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6499 {1UL << PG_owner_priv_1, "owner_priv_1" },
6498 {1UL << PG_arch_1, "arch_1" }, 6500 {1UL << PG_arch_1, "arch_1" },
6499 {1UL << PG_reserved, "reserved" }, 6501 {1UL << PG_reserved, "reserved" },
6500 {1UL << PG_private, "private" }, 6502 {1UL << PG_private, "private" },
6501 {1UL << PG_private_2, "private_2" }, 6503 {1UL << PG_private_2, "private_2" },
6502 {1UL << PG_writeback, "writeback" }, 6504 {1UL << PG_writeback, "writeback" },
6503 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6505 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6504 {1UL << PG_head, "head" }, 6506 {1UL << PG_head, "head" },
6505 {1UL << PG_tail, "tail" }, 6507 {1UL << PG_tail, "tail" },
6506 #else 6508 #else
6507 {1UL << PG_compound, "compound" }, 6509 {1UL << PG_compound, "compound" },
6508 #endif 6510 #endif
6509 {1UL << PG_swapcache, "swapcache" }, 6511 {1UL << PG_swapcache, "swapcache" },
6510 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6512 {1UL << PG_mappedtodisk, "mappedtodisk" },
6511 {1UL << PG_reclaim, "reclaim" }, 6513 {1UL << PG_reclaim, "reclaim" },
6512 {1UL << PG_swapbacked, "swapbacked" }, 6514 {1UL << PG_swapbacked, "swapbacked" },
6513 {1UL << PG_unevictable, "unevictable" }, 6515 {1UL << PG_unevictable, "unevictable" },
6514 #ifdef CONFIG_MMU 6516 #ifdef CONFIG_MMU
6515 {1UL << PG_mlocked, "mlocked" }, 6517 {1UL << PG_mlocked, "mlocked" },
6516 #endif 6518 #endif
6517 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6519 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6518 {1UL << PG_uncached, "uncached" }, 6520 {1UL << PG_uncached, "uncached" },
6519 #endif 6521 #endif
6520 #ifdef CONFIG_MEMORY_FAILURE 6522 #ifdef CONFIG_MEMORY_FAILURE
6521 {1UL << PG_hwpoison, "hwpoison" }, 6523 {1UL << PG_hwpoison, "hwpoison" },
6522 #endif 6524 #endif
6523 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6525 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6524 {1UL << PG_compound_lock, "compound_lock" }, 6526 {1UL << PG_compound_lock, "compound_lock" },
6525 #endif 6527 #endif
6526 }; 6528 };
6527 6529
6528 static void dump_page_flags(unsigned long flags) 6530 static void dump_page_flags(unsigned long flags)
6529 { 6531 {
6530 const char *delim = ""; 6532 const char *delim = "";
6531 unsigned long mask; 6533 unsigned long mask;
6532 int i; 6534 int i;
6533 6535
6534 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6536 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6535 6537
6536 printk(KERN_ALERT "page flags: %#lx(", flags); 6538 printk(KERN_ALERT "page flags: %#lx(", flags);
6537 6539
6538 /* remove zone id */ 6540 /* remove zone id */
6539 flags &= (1UL << NR_PAGEFLAGS) - 1; 6541 flags &= (1UL << NR_PAGEFLAGS) - 1;
6540 6542
6541 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6543 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6542 6544
6543 mask = pageflag_names[i].mask; 6545 mask = pageflag_names[i].mask;
6544 if ((flags & mask) != mask) 6546 if ((flags & mask) != mask)
6545 continue; 6547 continue;
6546 6548
6547 flags &= ~mask; 6549 flags &= ~mask;
6548 printk("%s%s", delim, pageflag_names[i].name); 6550 printk("%s%s", delim, pageflag_names[i].name);
6549 delim = "|"; 6551 delim = "|";
6550 } 6552 }
6551 6553
6552 /* check for left over flags */ 6554 /* check for left over flags */
6553 if (flags) 6555 if (flags)
6554 printk("%s%#lx", delim, flags); 6556 printk("%s%#lx", delim, flags);
6555 6557
6556 printk(")\n"); 6558 printk(")\n");
6557 } 6559 }
6558 6560
6559 void dump_page(struct page *page) 6561 void dump_page(struct page *page)
6560 { 6562 {
6561 printk(KERN_ALERT 6563 printk(KERN_ALERT
6562 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6564 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6563 page, atomic_read(&page->_count), page_mapcount(page), 6565 page, atomic_read(&page->_count), page_mapcount(page),
6564 page->mapping, page->index); 6566 page->mapping, page->index);
6565 dump_page_flags(page->flags); 6567 dump_page_flags(page->flags);